func hasPodAffinityConstraints(pod *api.Pod) bool {
    affinity, err := api.GetAffinityFromPodAnnotations(pod.Annotations)
    if err != nil || affinity == nil {
        return false
    }
    return affinity.PodAffinity != nil || affinity.PodAntiAffinity != nil
}
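// NOTE (illustrative usage, not from the original source): a caller such as the predicate
// metadata computation might use hasPodAffinityConstraints to skip the more expensive
// inter-pod affinity bookkeeping when the incoming pod declares no (anti-)affinity terms:
//
//    if !hasPodAffinityConstraints(pod) {
//        // Nothing for this pod to match against; only the symmetry checks driven by
//        // existing pods' anti-affinity terms remain relevant.
//    }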
func (c *PodAffinityChecker) getMatchingAntiAffinityTerms(pod *api.Pod, allPods []*api.Pod) ([]matchingPodAntiAffinityTerm, error) {
    var result []matchingPodAntiAffinityTerm
    for _, existingPod := range allPods {
        affinity, err := api.GetAffinityFromPodAnnotations(existingPod.Annotations)
        if err != nil {
            return nil, err
        }
        if affinity != nil && affinity.PodAntiAffinity != nil {
            existingPodNode, err := c.info.GetNodeInfo(existingPod.Spec.NodeName)
            if err != nil {
                return nil, err
            }
            for _, term := range getPodAntiAffinityTerms(affinity.PodAntiAffinity) {
                term := term // copy the loop variable so the stored pointer does not alias across iterations
                match, err := priorityutil.PodMatchesTermsNamespaceAndSelector(pod, existingPod, &term)
                if err != nil {
                    return nil, err
                }
                if match {
                    result = append(result, matchingPodAntiAffinityTerm{term: &term, node: existingPodNode})
                }
            }
        }
    }
    return result, nil
}
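// NOTE (illustrative sketch, not from the original source): getMatchingAntiAffinityTerms relies
// on a matchingPodAntiAffinityTerm type and a getPodAntiAffinityTerms helper that are not shown
// in these snippets. Minimal definitions consistent with how they are used might look like the
// following; only RequiredDuringSchedulingIgnoredDuringExecution is honored, mirroring the TODOs
// elsewhere in this code.
type matchingPodAntiAffinityTerm struct {
    term *api.PodAffinityTerm
    node *api.Node
}

func getPodAntiAffinityTerms(podAntiAffinity *api.PodAntiAffinity) (terms []api.PodAffinityTerm) {
    if podAntiAffinity != nil {
        if len(podAntiAffinity.RequiredDuringSchedulingIgnoredDuringExecution) != 0 {
            terms = podAntiAffinity.RequiredDuringSchedulingIgnoredDuringExecution
        }
        // RequiredDuringSchedulingRequiredDuringExecution terms would be appended here once
        // that field is implemented.
    }
    return terms
}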
// Admit will deny any pod that defines AntiAffinity topology key other than metav1.LabelHostname i.e. "kubernetes.io/hostname"
// in requiredDuringSchedulingRequiredDuringExecution and requiredDuringSchedulingIgnoredDuringExecution.
func (p *plugin) Admit(attributes admission.Attributes) (err error) {
    // Ignore all calls to subresources or resources other than pods.
    if len(attributes.GetSubresource()) != 0 || attributes.GetResource().GroupResource() != api.Resource("pods") {
        return nil
    }
    pod, ok := attributes.GetObject().(*api.Pod)
    if !ok {
        return apierrors.NewBadRequest("Resource was marked with kind Pod but was unable to be converted")
    }
    affinity, err := api.GetAffinityFromPodAnnotations(pod.Annotations)
    if err != nil {
        glog.V(5).Infof("Invalid Affinity detected, but we will leave handling of this to validation phase")
        return nil
    }
    if affinity != nil && affinity.PodAntiAffinity != nil {
        var podAntiAffinityTerms []api.PodAffinityTerm
        if len(affinity.PodAntiAffinity.RequiredDuringSchedulingIgnoredDuringExecution) != 0 {
            podAntiAffinityTerms = affinity.PodAntiAffinity.RequiredDuringSchedulingIgnoredDuringExecution
        }
        // TODO: Uncomment this block when RequiredDuringSchedulingRequiredDuringExecution is implemented.
        //if len(affinity.PodAntiAffinity.RequiredDuringSchedulingRequiredDuringExecution) != 0 {
        //    podAntiAffinityTerms = append(podAntiAffinityTerms, affinity.PodAntiAffinity.RequiredDuringSchedulingRequiredDuringExecution...)
        //}
        for _, v := range podAntiAffinityTerms {
            if v.TopologyKey != metav1.LabelHostname {
                return apierrors.NewForbidden(attributes.GetResource().GroupResource(), pod.Name, fmt.Errorf("affinity.PodAntiAffinity.RequiredDuringScheduling has TopologyKey %v but only key %v is allowed", v.TopologyKey, metav1.LabelHostname))
            }
        }
    }
    return nil
}
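// NOTE (illustrative sketch, not from the original source): the receiver type of Admit is not
// shown in these snippets. An admission plugin of this shape typically embeds admission.Handler
// so it only intercepts create/update operations; the constructor name below is hypothetical.
type plugin struct {
    *admission.Handler
}

// NewInterPodAntiAffinity is a hypothetical constructor for the plugin above.
func NewInterPodAntiAffinity() admission.Interface {
    return &plugin{
        Handler: admission.NewHandler(admission.Create, admission.Update),
    }
}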
func (checker *PodAffinityChecker) InterPodAffinityMatches(pod *api.Pod, meta interface{}, nodeInfo *schedulercache.NodeInfo) (bool, error) {
    node := nodeInfo.Node()
    if node == nil {
        return false, fmt.Errorf("node not found")
    }
    allPods, err := checker.podLister.List(labels.Everything())
    if err != nil {
        return false, err
    }
    affinity, err := api.GetAffinityFromPodAnnotations(pod.Annotations)
    if err != nil {
        return false, err
    }

    // Check if the current node matches the inter-pod affinity scheduling constraints.
    // Hard inter-pod affinity is not symmetric, so check it only when affinity.PodAffinity exists.
    if affinity.PodAffinity != nil {
        if !checker.NodeMatchesHardPodAffinity(pod, allPods, node, affinity.PodAffinity) {
            return false, ErrPodAffinityNotMatch
        }
    }

    // Hard inter-pod anti-affinity is symmetric, so we should always check it.
    if !checker.NodeMatchesHardPodAntiAffinity(pod, allPods, node, affinity.PodAntiAffinity) {
        return false, ErrPodAffinityNotMatch
    }

    return true, nil
}
// Admit will deny any pod that defines AntiAffinity topology key other than unversioned.LabelHostname i.e. "kubernetes.io/hostname"
// in requiredDuringSchedulingRequiredDuringExecution and requiredDuringSchedulingIgnoredDuringExecution.
func (p *plugin) Admit(attributes admission.Attributes) (err error) {
    if attributes.GetResource().GroupResource() != api.Resource("pods") {
        return nil
    }
    pod, ok := attributes.GetObject().(*api.Pod)
    if !ok {
        return apierrors.NewBadRequest("Resource was marked with kind Pod but was unable to be converted")
    }
    affinity, err := api.GetAffinityFromPodAnnotations(pod.Annotations)
    if err != nil {
        // this is validated later
        return nil
    }
    if affinity.PodAntiAffinity != nil {
        var podAntiAffinityTerms []api.PodAffinityTerm
        if len(affinity.PodAntiAffinity.RequiredDuringSchedulingIgnoredDuringExecution) != 0 {
            podAntiAffinityTerms = affinity.PodAntiAffinity.RequiredDuringSchedulingIgnoredDuringExecution
        }
        // TODO: Uncomment this block when RequiredDuringSchedulingRequiredDuringExecution is implemented.
        //if len(affinity.PodAntiAffinity.RequiredDuringSchedulingRequiredDuringExecution) != 0 {
        //    podAntiAffinityTerms = append(podAntiAffinityTerms, affinity.PodAntiAffinity.RequiredDuringSchedulingRequiredDuringExecution...)
        //}
        for _, v := range podAntiAffinityTerms {
            if v.TopologyKey != unversioned.LabelHostname {
                return apierrors.NewForbidden(attributes.GetResource().GroupResource(), pod.Name, fmt.Errorf("affinity.PodAntiAffinity.RequiredDuringScheduling has TopologyKey %v but only key %v is allowed", v.TopologyKey, unversioned.LabelHostname))
            }
        }
    }
    return nil
}
func (c *PodAffinityChecker) InterPodAffinityMatches(pod *api.Pod, meta interface{}, nodeInfo *schedulercache.NodeInfo) (bool, []algorithm.PredicateFailureReason, error) {
    node := nodeInfo.Node()
    if node == nil {
        return false, nil, fmt.Errorf("node not found")
    }
    if !c.satisfiesExistingPodsAntiAffinity(pod, meta, node) {
        return false, []algorithm.PredicateFailureReason{ErrPodAffinityNotMatch}, nil
    }

    // Now check if <pod> requirements will be satisfied on this node.
    affinity, err := api.GetAffinityFromPodAnnotations(pod.Annotations)
    if err != nil {
        return false, nil, err
    }
    if affinity == nil || (affinity.PodAffinity == nil && affinity.PodAntiAffinity == nil) {
        return true, nil, nil
    }
    if !c.satisfiesPodsAffinityAntiAffinity(pod, node, affinity) {
        return false, []algorithm.PredicateFailureReason{ErrPodAffinityNotMatch}, nil
    }

    if glog.V(10) {
        // We explicitly don't do glog.V(10).Infof() to avoid computing all the parameters if this is
        // not logged. There is visible performance gain from it.
        glog.Infof("Schedule Pod %+v on Node %+v is allowed, pod (anti)affinity constraints satisfied",
            podName(pod), node.Name)
    }
    return true, nil, nil
}
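// NOTE (illustrative sketch, not from the original source): several of these functions log
// through a podName helper that is not shown. A minimal version consistent with its use as a
// human-readable identifier might be:
func podName(pod *api.Pod) string {
    return pod.Namespace + "/" + pod.Name
}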
// CalculateNodeAffinityPriority prioritizes nodes according to node affinity scheduling preferences
// indicated in PreferredDuringSchedulingIgnoredDuringExecution. Each time a node matches a
// preferredSchedulingTerm, it gets an addition of preferredSchedulingTerm.Weight. Thus, the more
// preferredSchedulingTerms a node satisfies, and the higher the weights of the satisfied terms,
// the higher the score the node gets.
func CalculateNodeAffinityPriority(pod *api.Pod, nodeNameToInfo map[string]*schedulercache.NodeInfo, nodeLister algorithm.NodeLister) (schedulerapi.HostPriorityList, error) {
    nodes, err := nodeLister.List()
    if err != nil {
        return nil, err
    }

    var maxCount float64
    counts := make(map[string]float64, len(nodes.Items))

    affinity, err := api.GetAffinityFromPodAnnotations(pod.Annotations)
    if err != nil {
        return nil, err
    }

    // A nil element of PreferredDuringSchedulingIgnoredDuringExecution matches no objects.
    // An element of PreferredDuringSchedulingIgnoredDuringExecution that refers to an
    // empty PreferredSchedulingTerm matches all objects.
    if affinity.NodeAffinity != nil && affinity.NodeAffinity.PreferredDuringSchedulingIgnoredDuringExecution != nil {
        // Match PreferredDuringSchedulingIgnoredDuringExecution term by term.
        for _, preferredSchedulingTerm := range affinity.NodeAffinity.PreferredDuringSchedulingIgnoredDuringExecution {
            if preferredSchedulingTerm.Weight == 0 {
                continue
            }

            nodeSelector, err := api.NodeSelectorRequirementsAsSelector(preferredSchedulingTerm.Preference.MatchExpressions)
            if err != nil {
                return nil, err
            }

            for _, node := range nodes.Items {
                if nodeSelector.Matches(labels.Set(node.Labels)) {
                    counts[node.Name] += float64(preferredSchedulingTerm.Weight)
                }
                if counts[node.Name] > maxCount {
                    maxCount = counts[node.Name]
                }
            }
        }
    }

    result := make(schedulerapi.HostPriorityList, 0, len(nodes.Items))
    for i := range nodes.Items {
        node := &nodes.Items[i]
        if maxCount > 0 {
            fScore := 10 * (counts[node.Name] / maxCount)
            result = append(result, schedulerapi.HostPriority{Host: node.Name, Score: int(fScore)})
            if glog.V(10) {
                // We explicitly don't do glog.V(10).Infof() to avoid computing all the parameters if this is
                // not logged. There is visible performance gain from it.
                glog.Infof("%v -> %v: NodeAffinityPriority, Score: (%d)", pod.Name, node.Name, int(fScore))
            }
        } else {
            result = append(result, schedulerapi.HostPriority{Host: node.Name, Score: 0})
        }
    }
    return result, nil
}
func getMatchingAntiAffinityTerms(pod *api.Pod, nodeInfoMap map[string]*schedulercache.NodeInfo) ([]matchingPodAntiAffinityTerm, error) {
    allNodeNames := make([]string, 0, len(nodeInfoMap))
    for name := range nodeInfoMap {
        allNodeNames = append(allNodeNames, name)
    }

    var lock sync.Mutex
    var result []matchingPodAntiAffinityTerm
    var firstError error
    appendResult := func(toAppend []matchingPodAntiAffinityTerm) {
        lock.Lock()
        defer lock.Unlock()
        result = append(result, toAppend...)
    }
    catchError := func(err error) {
        lock.Lock()
        defer lock.Unlock()
        if firstError == nil {
            firstError = err
        }
    }

    processNode := func(i int) {
        nodeInfo := nodeInfoMap[allNodeNames[i]]
        node := nodeInfo.Node()
        if node == nil {
            catchError(fmt.Errorf("node not found"))
            return
        }
        var nodeResult []matchingPodAntiAffinityTerm
        for _, existingPod := range nodeInfo.PodsWithAffinity() {
            affinity, err := api.GetAffinityFromPodAnnotations(existingPod.Annotations)
            if err != nil {
                catchError(err)
                return
            }
            if affinity == nil {
                continue
            }
            for _, term := range getPodAntiAffinityTerms(affinity.PodAntiAffinity) {
                term := term // copy the loop variable so the stored pointer does not alias across iterations
                match, err := priorityutil.PodMatchesTermsNamespaceAndSelector(pod, existingPod, &term)
                if err != nil {
                    catchError(err)
                    return
                }
                if match {
                    nodeResult = append(nodeResult, matchingPodAntiAffinityTerm{term: &term, node: node})
                }
            }
        }
        if len(nodeResult) > 0 {
            appendResult(nodeResult)
        }
    }
    workqueue.Parallelize(16, len(allNodeNames), processNode)
    return result, firstError
}
// CalculateNodeAffinityPriority prioritizes nodes according to node affinity scheduling preferences
// indicated in PreferredDuringSchedulingIgnoredDuringExecution. Each time a node matches a
// preferredSchedulingTerm, it gets an addition of preferredSchedulingTerm.Weight. Thus, the more
// preferredSchedulingTerms a node satisfies, and the higher the weights of the satisfied terms,
// the higher the score the node gets.
func (s *NodeAffinity) CalculateNodeAffinityPriority(pod *api.Pod, nodeNameToInfo map[string]*schedulercache.NodeInfo, nodeLister algorithm.NodeLister) (schedulerapi.HostPriorityList, error) {
    var maxCount int
    counts := map[string]int{}

    nodes, err := nodeLister.List()
    if err != nil {
        return nil, err
    }

    affinity, err := api.GetAffinityFromPodAnnotations(pod.Annotations)
    if err != nil {
        return nil, err
    }

    // A nil element of PreferredDuringSchedulingIgnoredDuringExecution matches no objects.
    // An element of PreferredDuringSchedulingIgnoredDuringExecution that refers to an
    // empty PreferredSchedulingTerm matches all objects.
    if affinity.NodeAffinity != nil && affinity.NodeAffinity.PreferredDuringSchedulingIgnoredDuringExecution != nil {
        // Match PreferredDuringSchedulingIgnoredDuringExecution term by term.
        for _, preferredSchedulingTerm := range affinity.NodeAffinity.PreferredDuringSchedulingIgnoredDuringExecution {
            if preferredSchedulingTerm.Weight == 0 {
                continue
            }

            nodeSelector, err := api.NodeSelectorRequirementsAsSelector(preferredSchedulingTerm.Preference.MatchExpressions)
            if err != nil {
                return nil, err
            }

            for _, node := range nodes.Items {
                if nodeSelector.Matches(labels.Set(node.Labels)) {
                    counts[node.Name] += preferredSchedulingTerm.Weight
                }
                if counts[node.Name] > maxCount {
                    maxCount = counts[node.Name]
                }
            }
        }
    }

    result := []schedulerapi.HostPriority{}
    for _, node := range nodes.Items {
        fScore := float64(0)
        if maxCount > 0 {
            fScore = 10 * (float64(counts[node.Name]) / float64(maxCount))
        }
        result = append(result, schedulerapi.HostPriority{Host: node.Name, Score: int(fScore)})
        glog.V(10).Infof("%v -> %v: NodeAffinityPriority, Score: (%d)", pod.Name, node.Name, int(fScore))
    }
    return result, nil
}
// The pod can only schedule onto nodes that satisfy requirements in both NodeAffinity and nodeSelector.
func podMatchesNodeLabels(pod *api.Pod, node *api.Node) bool {
    // Check if node.Labels match pod.Spec.NodeSelector.
    if len(pod.Spec.NodeSelector) > 0 {
        selector := labels.SelectorFromSet(pod.Spec.NodeSelector)
        if !selector.Matches(labels.Set(node.Labels)) {
            return false
        }
    }

    // Parse required node affinity scheduling requirements
    // and check if the current node matches the requirements.
    affinity, err := api.GetAffinityFromPodAnnotations(pod.Annotations)
    if err != nil {
        glog.V(10).Infof("Failed to get Affinity from Pod %+v, err: %+v", podName(pod), err)
        return false
    }

    // 1. nil NodeSelector matches all nodes (i.e. does not filter out any nodes)
    // 2. nil []NodeSelectorTerm (equivalent to non-nil empty NodeSelector) matches no nodes
    // 3. zero-length non-nil []NodeSelectorTerm matches no nodes also, just for simplicity
    // 4. nil []NodeSelectorRequirement (equivalent to non-nil empty NodeSelectorTerm) matches no nodes
    // 5. zero-length non-nil []NodeSelectorRequirement matches no nodes also, just for simplicity
    // 6. non-nil empty NodeSelectorRequirement is not allowed
    nodeAffinityMatches := true
    if affinity.NodeAffinity != nil {
        nodeAffinity := affinity.NodeAffinity
        // If there are no required NodeAffinity requirements, this is a no-op, i.e. all nodes are selected.
        // TODO: Replace next line with subsequent commented-out line when RequiredDuringSchedulingRequiredDuringExecution is implemented.
        if nodeAffinity.RequiredDuringSchedulingIgnoredDuringExecution == nil {
            // if nodeAffinity.RequiredDuringSchedulingRequiredDuringExecution == nil && nodeAffinity.RequiredDuringSchedulingIgnoredDuringExecution == nil {
            return true
        }

        // Match node selector for requiredDuringSchedulingRequiredDuringExecution.
        // TODO: Uncomment this block when RequiredDuringSchedulingRequiredDuringExecution is implemented.
        // if nodeAffinity.RequiredDuringSchedulingRequiredDuringExecution != nil {
        //     nodeSelectorTerms := nodeAffinity.RequiredDuringSchedulingRequiredDuringExecution.NodeSelectorTerms
        //     glog.V(10).Infof("Match for RequiredDuringSchedulingRequiredDuringExecution node selector terms %+v", nodeSelectorTerms)
        //     nodeAffinityMatches = nodeMatchesNodeSelectorTerms(node, nodeSelectorTerms)
        // }

        // Match node selector for requiredDuringSchedulingIgnoredDuringExecution.
        if nodeAffinity.RequiredDuringSchedulingIgnoredDuringExecution != nil {
            nodeSelectorTerms := nodeAffinity.RequiredDuringSchedulingIgnoredDuringExecution.NodeSelectorTerms
            glog.V(10).Infof("Match for RequiredDuringSchedulingIgnoredDuringExecution node selector terms %+v", nodeSelectorTerms)
            nodeAffinityMatches = nodeAffinityMatches && nodeMatchesNodeSelectorTerms(node, nodeSelectorTerms)
        }
    }
    return nodeAffinityMatches
}
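// NOTE (illustrative sketch, not from the original source): podMatchesNodeLabels delegates to a
// nodeMatchesNodeSelectorTerms helper that is not shown here. Node selector terms are ORed, so a
// node matching any single term satisfies the required node affinity. A minimal version
// consistent with that behavior might look like:
func nodeMatchesNodeSelectorTerms(node *api.Node, nodeSelectorTerms []api.NodeSelectorTerm) bool {
    for _, req := range nodeSelectorTerms {
        nodeSelector, err := api.NodeSelectorRequirementsAsSelector(req.MatchExpressions)
        if err != nil {
            // Treat an unparsable term as not matching.
            glog.V(10).Infof("Failed to parse MatchExpressions: %+v, regarding as not match.", req.MatchExpressions)
            return false
        }
        if nodeSelector.Matches(labels.Set(node.Labels)) {
            return true
        }
    }
    return false
}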
// Checks whether the given node has pods which satisfy all the
// required pod anti-affinity scheduling rules.
// Also checks whether putting the pod onto the node would break
// any anti-affinity scheduling rules indicated by existing pods.
// If the node has pods which satisfy all the required pod anti-affinity
// scheduling rules and scheduling the pod onto the node won't
// break any existing pods' anti-affinity rules, then return true.
func (checker *PodAffinityChecker) NodeMatchesHardPodAntiAffinity(pod *api.Pod, allPods []*api.Pod, node *api.Node, podAntiAffinity *api.PodAntiAffinity) bool {
    // For each element podAntiAffinityTerm of podAntiAffinityTerms,
    // if the pod matches the term (breaks the anti-affinity),
    // don't schedule the pod onto this node.
    for _, podAntiAffinityTerm := range getPodAntiAffinityTerms(podAntiAffinity) {
        podAntiAffinityTermMatches, _, err := checker.AnyPodMatchesPodAffinityTerm(pod, allPods, node, podAntiAffinityTerm)
        if err != nil || podAntiAffinityTermMatches {
            glog.V(10).Infof("Cannot schedule pod %+v onto node %v, because not all the existing pods on this node satisfy the PodAntiAffinityTerm %v, err: %v",
                podName(pod), node.Name, podAntiAffinityTerm, err)
            return false
        }
    }

    // Check if scheduling the pod onto this node would break
    // any anti-affinity rules indicated by the existing pods on the node.
    // If it would break, the system should not schedule the pod onto this node.
    for _, ep := range allPods {
        epAffinity, err := api.GetAffinityFromPodAnnotations(ep.Annotations)
        if err != nil {
            glog.V(10).Infof("Failed to get Affinity from Pod %+v, err: %+v", podName(pod), err)
            return false
        }
        if epAffinity == nil {
            continue
        }
        epNode, err := checker.info.GetNodeInfo(ep.Spec.NodeName)
        if err != nil {
            glog.V(10).Infof("Failed to get node from Pod %+v, err: %+v", podName(ep), err)
            return false
        }
        for _, epAntiAffinityTerm := range getPodAntiAffinityTerms(epAffinity.PodAntiAffinity) {
            match, err := priorityutil.PodMatchesTermsNamespaceAndSelector(pod, ep, &epAntiAffinityTerm)
            if err != nil {
                glog.V(10).Infof("Failed to get label selector from anti-affinity term %+v of existing pod %+v, err: %+v",
                    epAntiAffinityTerm, podName(pod), err)
                return false
            }
            if match && checker.failureDomains.NodesHaveSameTopologyKey(node, epNode, epAntiAffinityTerm.TopologyKey) {
                glog.V(10).Infof("Cannot schedule Pod %+v, onto node %v because the pod would break the PodAntiAffinityTerm %+v, of existing pod %+v, err: %v",
                    podName(pod), node.Name, epAntiAffinityTerm, podName(ep), err)
                return false
            }
        }
    }
    // All the required pod anti-affinity scheduling rules are satisfied.
    glog.V(10).Infof("Can schedule Pod %+v, on node %v because all the required pod anti-affinity scheduling rules are satisfied",
        podName(pod), node.Name)
    return true
}
// CalculateNodeAffinityPriorityMap prioritizes nodes according to node affinity scheduling preferences
// indicated in PreferredDuringSchedulingIgnoredDuringExecution. Each time a node matches a
// preferredSchedulingTerm, it gets an addition of preferredSchedulingTerm.Weight. Thus, the more
// preferredSchedulingTerms a node satisfies, and the higher the weights of the satisfied terms,
// the higher the score the node gets.
func CalculateNodeAffinityPriorityMap(pod *api.Pod, meta interface{}, nodeInfo *schedulercache.NodeInfo) (schedulerapi.HostPriority, error) {
    node := nodeInfo.Node()
    if node == nil {
        return schedulerapi.HostPriority{}, fmt.Errorf("node not found")
    }

    var affinity *api.Affinity
    if priorityMeta, ok := meta.(*priorityMetadata); ok {
        affinity = priorityMeta.affinity
    } else {
        // We couldn't parse metadata - fallback to computing it.
        var err error
        affinity, err = api.GetAffinityFromPodAnnotations(pod.Annotations)
        if err != nil {
            return schedulerapi.HostPriority{}, err
        }
    }

    var count int32
    // A nil element of PreferredDuringSchedulingIgnoredDuringExecution matches no objects.
    // An element of PreferredDuringSchedulingIgnoredDuringExecution that refers to an
    // empty PreferredSchedulingTerm matches all objects.
    if affinity != nil && affinity.NodeAffinity != nil && affinity.NodeAffinity.PreferredDuringSchedulingIgnoredDuringExecution != nil {
        // Match PreferredDuringSchedulingIgnoredDuringExecution term by term.
        for i := range affinity.NodeAffinity.PreferredDuringSchedulingIgnoredDuringExecution {
            preferredSchedulingTerm := &affinity.NodeAffinity.PreferredDuringSchedulingIgnoredDuringExecution[i]
            if preferredSchedulingTerm.Weight == 0 {
                continue
            }

            // TODO: Avoid computing it for all nodes if this becomes a performance problem.
            nodeSelector, err := api.NodeSelectorRequirementsAsSelector(preferredSchedulingTerm.Preference.MatchExpressions)
            if err != nil {
                return schedulerapi.HostPriority{}, err
            }
            if nodeSelector.Matches(labels.Set(node.Labels)) {
                count += preferredSchedulingTerm.Weight
            }
        }
    }

    return schedulerapi.HostPriority{
        Host:  node.Name,
        Score: int(count),
    }, nil
}
// NodeMatchPodAffinityAntiAffinity checks if the node matches
// the requiredDuringScheduling affinity/anti-affinity rules indicated by the pod.
func (checker *PodAffinityChecker) NodeMatchPodAffinityAntiAffinity(pod *api.Pod, allPods []*api.Pod, node *api.Node) bool {
    // Parse required affinity scheduling rules.
    affinity, err := api.GetAffinityFromPodAnnotations(pod.Annotations)
    if err != nil {
        glog.V(10).Infof("Failed to get Affinity from Pod %+v, err: %+v", podName(pod), err)
        return false
    }

    // Check if the current node matches the inter-pod affinity scheduling rules.
    // Hard inter-pod affinity is not symmetric, so check it only when affinity.PodAffinity is not nil.
    if affinity.PodAffinity != nil {
        if !checker.NodeMatchesHardPodAffinity(pod, allPods, node, affinity.PodAffinity) {
            return false
        }
    }

    // Hard inter-pod anti-affinity is symmetric, so check it whether affinity.PodAntiAffinity is nil or not.
    return checker.NodeMatchesHardPodAntiAffinity(pod, allPods, node, affinity.PodAntiAffinity)
}
func PriorityMetadata(pod *api.Pod) interface{} {
    // If we cannot compute metadata, just return nil.
    if pod == nil {
        return nil
    }
    tolerations, err := getTolerationListFromPod(pod)
    if err != nil {
        return nil
    }
    affinity, err := api.GetAffinityFromPodAnnotations(pod.Annotations)
    if err != nil {
        return nil
    }
    return &priorityMetadata{
        nonZeroRequest: getNonZeroRequests(pod),
        podTolerations: tolerations,
        affinity:       affinity,
    }
}
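// NOTE (illustrative sketch, not from the original source): the priorityMetadata type built
// above and consumed by CalculateNodeAffinityPriorityMap is assumed to carry precomputed
// per-pod values; the field types here are inferred from how the struct is populated and read,
// not taken from the original definition.
type priorityMetadata struct {
    nonZeroRequest *schedulercache.Resource // assumed return type of getNonZeroRequests
    podTolerations []api.Toleration
    affinity       *api.Affinity
}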
// Checks whether the given node has pods which satisfy all the
// required pod anti-affinity scheduling rules.
// Also checks whether putting the pod onto the node would break
// any anti-affinity scheduling rules indicated by existing pods.
// If the node has pods which satisfy all the required pod anti-affinity
// scheduling rules and scheduling the pod onto the node won't
// break any existing pods' anti-affinity rules, then return true.
func (checker *PodAffinityChecker) NodeMatchesHardPodAntiAffinity(pod *api.Pod, allPods []*api.Pod, node *api.Node, podAntiAffinity *api.PodAntiAffinity) bool {
    var podAntiAffinityTerms []api.PodAffinityTerm
    if len(podAntiAffinity.RequiredDuringSchedulingIgnoredDuringExecution) != 0 {
        podAntiAffinityTerms = podAntiAffinity.RequiredDuringSchedulingIgnoredDuringExecution
    }
    // TODO: Uncomment this block when RequiredDuringSchedulingRequiredDuringExecution is implemented.
    //if len(podAntiAffinity.RequiredDuringSchedulingRequiredDuringExecution) != 0 {
    //    podAntiAffinityTerms = append(podAntiAffinityTerms, podAntiAffinity.RequiredDuringSchedulingRequiredDuringExecution...)
    //}

    // For each element podAntiAffinityTerm of podAntiAffinityTerms,
    // if the pod matches the term (breaks the anti-affinity),
    // don't schedule the pod onto this node.
    for _, podAntiAffinityTerm := range podAntiAffinityTerms {
        podAntiAffinityTermMatches, err := checker.AnyPodMatchesPodAffinityTerm(pod, allPods, node, podAntiAffinityTerm)
        if err != nil || podAntiAffinityTermMatches {
            glog.V(10).Infof("Cannot schedule pod %+v onto node %v, because not all the existing pods on this node satisfy the PodAntiAffinityTerm %v, err: %v",
                podName(pod), node.Name, podAntiAffinityTerm, err)
            return false
        }
    }

    // Check if scheduling the pod onto this node would break
    // any anti-affinity rules indicated by the existing pods on the node.
    // If it would break, the system should not schedule the pod onto this node.
    for _, ep := range allPods {
        epAffinity, err := api.GetAffinityFromPodAnnotations(ep.Annotations)
        if err != nil {
            glog.V(10).Infof("Failed to get Affinity from Pod %+v, err: %+v", podName(pod), err)
            return false
        }
        if epAffinity.PodAntiAffinity != nil {
            var epAntiAffinityTerms []api.PodAffinityTerm
            if len(epAffinity.PodAntiAffinity.RequiredDuringSchedulingIgnoredDuringExecution) != 0 {
                epAntiAffinityTerms = epAffinity.PodAntiAffinity.RequiredDuringSchedulingIgnoredDuringExecution
            }
            // TODO: Uncomment this block when RequiredDuringSchedulingRequiredDuringExecution is implemented.
            //if len(epAffinity.PodAntiAffinity.RequiredDuringSchedulingRequiredDuringExecution) != 0 {
            //    epAntiAffinityTerms = append(epAntiAffinityTerms, epAffinity.PodAntiAffinity.RequiredDuringSchedulingRequiredDuringExecution...)
            //}

            for _, epAntiAffinityTerm := range epAntiAffinityTerms {
                labelSelector, err := unversioned.LabelSelectorAsSelector(epAntiAffinityTerm.LabelSelector)
                if err != nil {
                    glog.V(10).Infof("Failed to get label selector from anti-affinity term %+v of existing pod %+v, err: %+v",
                        epAntiAffinityTerm, podName(pod), err)
                    return false
                }

                names := priorityutil.GetNamespacesFromPodAffinityTerm(ep, epAntiAffinityTerm)
                if (len(names) == 0 || names.Has(pod.Namespace)) && labelSelector.Matches(labels.Set(pod.Labels)) {
                    epNode, err := checker.info.GetNodeInfo(ep.Spec.NodeName)
                    if err != nil || checker.failureDomains.NodesHaveSameTopologyKey(node, epNode, epAntiAffinityTerm.TopologyKey) {
                        glog.V(10).Infof("Cannot schedule Pod %+v, onto node %v because the pod would break the PodAntiAffinityTerm %+v, of existing pod %+v, err: %v",
                            podName(pod), node.Name, epAntiAffinityTerm, podName(ep), err)
                        return false
                    }
                }
            }
        }
    }
    // All the required pod anti-affinity scheduling rules are satisfied.
    glog.V(10).Infof("Can schedule Pod %+v, on node %v because all the required pod anti-affinity scheduling rules are satisfied",
        podName(pod), node.Name)
    return true
}
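// NOTE (illustrative sketch, not from the original source): NodesHaveSameTopologyKey is not
// shown in these snippets. A simplified, hypothetical stand-in is below: two nodes are in the
// same topology domain for a key when both carry that label and the values are equal. In the
// scheduler versions these snippets come from, an empty TopologyKey is treated specially
// (expanded to the configured default failure-domain keys); that case is omitted here.
func nodesHaveSameTopologyKeyValue(nodeA, nodeB *api.Node, topologyKey string) bool {
    if len(topologyKey) == 0 {
        return false
    }
    valueA, okA := nodeA.Labels[topologyKey]
    valueB, okB := nodeB.Labels[topologyKey]
    return okA && okB && valueA == valueB
}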
// CalculateInterPodAffinityPriority computes a sum by iterating through the elements of
// weightedPodAffinityTerm and adding "weight" to the sum if the corresponding PodAffinityTerm
// is satisfied for that node; the node(s) with the highest sum are the most preferred.
// Symmetry needs to be considered for preferredDuringSchedulingIgnoredDuringExecution from
// podAffinity & podAntiAffinity, and symmetry needs to be considered for hard requirements
// from podAffinity.
func (ipa *InterPodAffinity) CalculateInterPodAffinityPriority(pod *api.Pod, nodeNameToInfo map[string]*schedulercache.NodeInfo, nodeLister algorithm.NodeLister) (schedulerapi.HostPriorityList, error) {
    nodes, err := nodeLister.List()
    if err != nil {
        return nil, err
    }
    allPods, err := ipa.podLister.List(labels.Everything())
    if err != nil {
        return nil, err
    }
    affinity, err := api.GetAffinityFromPodAnnotations(pod.Annotations)
    if err != nil {
        return nil, err
    }

    // convert the topology key based weights to the node name based weights
    var maxCount int
    var minCount int
    counts := map[string]int{}
    for _, node := range nodes.Items {
        totalCount := 0
        // count weights for the weighted pod affinity
        if affinity.PodAffinity != nil {
            for _, weightedTerm := range affinity.PodAffinity.PreferredDuringSchedulingIgnoredDuringExecution {
                weightedCount, err := ipa.CountWeightByPodMatchAffinityTerm(pod, allPods, weightedTerm.Weight, weightedTerm.PodAffinityTerm, &node)
                if err != nil {
                    return nil, err
                }
                totalCount += weightedCount
            }
        }

        // count weights for the weighted pod anti-affinity
        if affinity.PodAntiAffinity != nil {
            for _, weightedTerm := range affinity.PodAntiAffinity.PreferredDuringSchedulingIgnoredDuringExecution {
                weightedCount, err := ipa.CountWeightByPodMatchAffinityTerm(pod, allPods, (0 - weightedTerm.Weight), weightedTerm.PodAffinityTerm, &node)
                if err != nil {
                    return nil, err
                }
                totalCount += weightedCount
            }
        }

        // reverse direction checking: count weights for the inter-pod affinity/anti-affinity rules
        // that are indicated by existing pods on the node.
        for _, ep := range allPods {
            epAffinity, err := api.GetAffinityFromPodAnnotations(ep.Annotations)
            if err != nil {
                return nil, err
            }

            if epAffinity.PodAffinity != nil {
                // count the implicit weight for the hard pod affinity indicated by the existing pod.
                if ipa.hardPodAffinityWeight > 0 {
                    var podAffinityTerms []api.PodAffinityTerm
                    if len(epAffinity.PodAffinity.RequiredDuringSchedulingIgnoredDuringExecution) != 0 {
                        podAffinityTerms = epAffinity.PodAffinity.RequiredDuringSchedulingIgnoredDuringExecution
                    }
                    // TODO: Uncomment this block when RequiredDuringSchedulingRequiredDuringExecution is implemented.
                    //if len(affinity.PodAffinity.RequiredDuringSchedulingRequiredDuringExecution) != 0 {
                    //    podAffinityTerms = append(podAffinityTerms, affinity.PodAffinity.RequiredDuringSchedulingRequiredDuringExecution...)
                    //}
                    for _, epAffinityTerm := range podAffinityTerms {
                        match, err := ipa.failureDomains.CheckIfPodMatchPodAffinityTerm(pod, ep, epAffinityTerm,
                            func(pod *api.Pod) (*api.Node, error) { return &node, nil },
                            func(ep *api.Pod) (*api.Node, error) { return ipa.info.GetNodeInfo(ep.Spec.NodeName) },
                        )
                        if err != nil {
                            return nil, err
                        }
                        if match {
                            totalCount += ipa.hardPodAffinityWeight
                        }
                    }
                }

                // count weight for the weighted pod affinity indicated by the existing pod.
                for _, epWeightedTerm := range epAffinity.PodAffinity.PreferredDuringSchedulingIgnoredDuringExecution {
                    match, err := ipa.failureDomains.CheckIfPodMatchPodAffinityTerm(pod, ep, epWeightedTerm.PodAffinityTerm,
                        func(pod *api.Pod) (*api.Node, error) { return &node, nil },
                        func(ep *api.Pod) (*api.Node, error) { return ipa.info.GetNodeInfo(ep.Spec.NodeName) },
                    )
                    if err != nil {
                        return nil, err
                    }
                    if match {
                        totalCount += epWeightedTerm.Weight
                    }
                }
            }

            // count weight for the weighted pod anti-affinity indicated by the existing pod.
            if epAffinity.PodAntiAffinity != nil {
                for _, epWeightedTerm := range epAffinity.PodAntiAffinity.PreferredDuringSchedulingIgnoredDuringExecution {
                    match, err := ipa.failureDomains.CheckIfPodMatchPodAffinityTerm(pod, ep, epWeightedTerm.PodAffinityTerm,
                        func(pod *api.Pod) (*api.Node, error) { return &node, nil },
                        func(ep *api.Pod) (*api.Node, error) { return ipa.info.GetNodeInfo(ep.Spec.NodeName) },
                    )
                    if err != nil {
                        return nil, err
                    }
                    if match {
                        totalCount -= epWeightedTerm.Weight
                    }
                }
            }
        }

        counts[node.Name] = totalCount
        if counts[node.Name] > maxCount {
            maxCount = counts[node.Name]
        }
        if counts[node.Name] < minCount {
            minCount = counts[node.Name]
        }
    }

    // calculate final priority score for each node
    result := []schedulerapi.HostPriority{}
    for _, node := range nodes.Items {
        fScore := float64(0)
        if (maxCount - minCount) > 0 {
            fScore = 10 * (float64(counts[node.Name]-minCount) / float64(maxCount-minCount))
        }
        result = append(result, schedulerapi.HostPriority{Host: node.Name, Score: int(fScore)})
        glog.V(10).Infof("%v -> %v: InterPodAffinityPriority, Score: (%d)", pod.Name, node.Name, int(fScore))
    }
    return result, nil
}
// CalculateInterPodAffinityPriority computes a sum by iterating through the elements of
// weightedPodAffinityTerm and adding "weight" to the sum if the corresponding PodAffinityTerm
// is satisfied for that node; the node(s) with the highest sum are the most preferred.
// Symmetry needs to be considered for preferredDuringSchedulingIgnoredDuringExecution from
// podAffinity & podAntiAffinity, and symmetry needs to be considered for hard requirements
// from podAffinity.
func (ipa *InterPodAffinity) CalculateInterPodAffinityPriority(pod *api.Pod, nodeNameToInfo map[string]*schedulercache.NodeInfo, nodes []*api.Node) (schedulerapi.HostPriorityList, error) {
    affinity, err := api.GetAffinityFromPodAnnotations(pod.Annotations)
    if err != nil {
        return nil, err
    }
    hasAffinityConstraints := affinity != nil && affinity.PodAffinity != nil
    hasAntiAffinityConstraints := affinity != nil && affinity.PodAntiAffinity != nil

    allNodeNames := make([]string, 0, len(nodeNameToInfo))
    for name := range nodeNameToInfo {
        allNodeNames = append(allNodeNames, name)
    }

    // convert the topology key based weights to the node name based weights
    var maxCount float64
    var minCount float64
    // priorityMap stores the mapping from node name to so-far computed score of
    // the node.
    pm := newPodAffinityPriorityMap(nodes, ipa.failureDomains)

    processPod := func(existingPod *api.Pod) error {
        existingPodNode, err := ipa.info.GetNodeInfo(existingPod.Spec.NodeName)
        if err != nil {
            return err
        }
        existingPodAffinity, err := api.GetAffinityFromPodAnnotations(existingPod.Annotations)
        if err != nil {
            return err
        }
        existingHasAffinityConstraints := existingPodAffinity != nil && existingPodAffinity.PodAffinity != nil
        existingHasAntiAffinityConstraints := existingPodAffinity != nil && existingPodAffinity.PodAntiAffinity != nil

        if hasAffinityConstraints {
            // For every soft pod affinity term of <pod>, if <existingPod> matches the term,
            // increment <pm.counts> for every node in the cluster with the same <term.TopologyKey>
            // value as that of <existingPod>'s node by the term's weight.
            terms := affinity.PodAffinity.PreferredDuringSchedulingIgnoredDuringExecution
            pm.processTerms(terms, pod, existingPod, existingPodNode, 1)
        }
        if hasAntiAffinityConstraints {
            // For every soft pod anti-affinity term of <pod>, if <existingPod> matches the term,
            // decrement <pm.counts> for every node in the cluster with the same <term.TopologyKey>
            // value as that of <existingPod>'s node by the term's weight.
            terms := affinity.PodAntiAffinity.PreferredDuringSchedulingIgnoredDuringExecution
            pm.processTerms(terms, pod, existingPod, existingPodNode, -1)
        }

        if existingHasAffinityConstraints {
            // For every hard pod affinity term of <existingPod>, if <pod> matches the term,
            // increment <pm.counts> for every node in the cluster with the same <term.TopologyKey>
            // value as that of <existingPod>'s node by the constant <ipa.hardPodAffinityWeight>.
            if ipa.hardPodAffinityWeight > 0 {
                terms := existingPodAffinity.PodAffinity.RequiredDuringSchedulingIgnoredDuringExecution
                // TODO: Uncomment this block when RequiredDuringSchedulingRequiredDuringExecution is implemented.
                //if len(existingPodAffinity.PodAffinity.RequiredDuringSchedulingRequiredDuringExecution) != 0 {
                //    terms = append(terms, existingPodAffinity.PodAffinity.RequiredDuringSchedulingRequiredDuringExecution...)
                //}
                for _, term := range terms {
                    pm.processTerm(&term, existingPod, pod, existingPodNode, float64(ipa.hardPodAffinityWeight))
                }
            }
            // For every soft pod affinity term of <existingPod>, if <pod> matches the term,
            // increment <pm.counts> for every node in the cluster with the same <term.TopologyKey>
            // value as that of <existingPod>'s node by the term's weight.
            terms := existingPodAffinity.PodAffinity.PreferredDuringSchedulingIgnoredDuringExecution
            pm.processTerms(terms, existingPod, pod, existingPodNode, 1)
        }
        if existingHasAntiAffinityConstraints {
            // For every soft pod anti-affinity term of <existingPod>, if <pod> matches the term,
            // decrement <pm.counts> for every node in the cluster with the same <term.TopologyKey>
            // value as that of <existingPod>'s node by the term's weight.
            terms := existingPodAffinity.PodAntiAffinity.PreferredDuringSchedulingIgnoredDuringExecution
            pm.processTerms(terms, existingPod, pod, existingPodNode, -1)
        }
        return nil
    }
    processNode := func(i int) {
        nodeInfo := nodeNameToInfo[allNodeNames[i]]
        if hasAffinityConstraints || hasAntiAffinityConstraints {
            // We need to process all the nodes.
            for _, existingPod := range nodeInfo.Pods() {
                if err := processPod(existingPod); err != nil {
                    pm.setError(err)
                }
            }
        } else {
            // The pod doesn't have any constraints - we need to check only existing
            // ones that have some.
            for _, existingPod := range nodeInfo.PodsWithAffinity() {
                if err := processPod(existingPod); err != nil {
                    pm.setError(err)
                }
            }
        }
    }
    workqueue.Parallelize(16, len(allNodeNames), processNode)
    if pm.firstError != nil {
        return nil, pm.firstError
    }

    for _, node := range nodes {
        if pm.counts[node.Name] > maxCount {
            maxCount = pm.counts[node.Name]
        }
        if pm.counts[node.Name] < minCount {
            minCount = pm.counts[node.Name]
        }
    }

    // calculate final priority score for each node
    result := make(schedulerapi.HostPriorityList, 0, len(nodes))
    for _, node := range nodes {
        fScore := float64(0)
        if (maxCount - minCount) > 0 {
            fScore = 10 * ((pm.counts[node.Name] - minCount) / (maxCount - minCount))
        }
        result = append(result, schedulerapi.HostPriority{Host: node.Name, Score: int(fScore)})
        if glog.V(10) {
            // We explicitly don't do glog.V(10).Infof() to avoid computing all the parameters if this is
            // not logged. There is visible performance gain from it.
            glog.Infof("%v -> %v: InterPodAffinityPriority, Score: (%d)", pod.Name, node.Name, int(fScore))
        }
    }
    return result, nil
}
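// NOTE (illustrative sketch, not from the original source): the podAffinityPriorityMap type and
// newPodAffinityPriorityMap constructor used above are not shown. The field types below are
// assumptions based on how pm is used (counts as map[string]float64, a failureDomains value with
// a NodesHaveSameTopologyKey method, a recorded first error); processTerm/processTerms are
// expected to apply a weight to every node sharing the term's topology key with the fixed node,
// mirroring the inline processTerm closure in the last snippet of this section.
type podAffinityPriorityMap struct {
    sync.Mutex
    // nodes contains all nodes that should be considered.
    nodes []*api.Node
    // counts stores the mapping from node name to the so-far computed score of the node.
    counts map[string]float64
    // failureDomains is assumed to expose NodesHaveSameTopologyKey; its concrete type is not
    // shown in these snippets.
    failureDomains priorityutil.Topologies
    // firstError records the first error encountered while processing pods in parallel.
    firstError error
}

func newPodAffinityPriorityMap(nodes []*api.Node, failureDomains priorityutil.Topologies) *podAffinityPriorityMap {
    return &podAffinityPriorityMap{
        nodes:          nodes,
        counts:         make(map[string]float64, len(nodes)),
        failureDomains: failureDomains,
    }
}

func (p *podAffinityPriorityMap) setError(err error) {
    p.Lock()
    defer p.Unlock()
    if p.firstError == nil {
        p.firstError = err
    }
}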
// CalculateInterPodAffinityPriority computes a sum by iterating through the elements of
// weightedPodAffinityTerm and adding "weight" to the sum if the corresponding PodAffinityTerm
// is satisfied for that node; the node(s) with the highest sum are the most preferred.
// Symmetry needs to be considered for preferredDuringSchedulingIgnoredDuringExecution from
// podAffinity & podAntiAffinity, and symmetry needs to be considered for hard requirements
// from podAffinity.
func (ipa *InterPodAffinity) CalculateInterPodAffinityPriority(pod *api.Pod, nodeNameToInfo map[string]*schedulercache.NodeInfo, nodes []*api.Node) (schedulerapi.HostPriorityList, error) {
    allPods, err := ipa.podLister.List(labels.Everything())
    if err != nil {
        return nil, err
    }
    affinity, err := api.GetAffinityFromPodAnnotations(pod.Annotations)
    if err != nil {
        return nil, err
    }

    // convert the topology key based weights to the node name based weights
    var maxCount float64
    var minCount float64
    // counts stores the mapping from node name to so-far computed score of
    // the node.
    counts := make(map[string]float64, len(nodes))

    processTerm := func(term *api.PodAffinityTerm, affinityPod, podToCheck *api.Pod, fixedNode *api.Node, weight float64) error {
        match, err := podMatchesNamespaceAndSelector(podToCheck, affinityPod, term)
        if err != nil {
            return err
        }
        if match {
            for _, node := range nodes {
                if ipa.failureDomains.NodesHaveSameTopologyKey(node, fixedNode, term.TopologyKey) {
                    counts[node.Name] += weight
                }
            }
        }
        return nil
    }
    processTerms := func(terms []api.WeightedPodAffinityTerm, affinityPod, podToCheck *api.Pod, fixedNode *api.Node, multiplier int) error {
        for _, weightedTerm := range terms {
            if err := processTerm(&weightedTerm.PodAffinityTerm, affinityPod, podToCheck, fixedNode, float64(weightedTerm.Weight*multiplier)); err != nil {
                return err
            }
        }
        return nil
    }

    for _, existingPod := range allPods {
        existingPodNode, err := ipa.info.GetNodeInfo(existingPod.Spec.NodeName)
        if err != nil {
            return nil, err
        }
        existingPodAffinity, err := api.GetAffinityFromPodAnnotations(existingPod.Annotations)
        if err != nil {
            return nil, err
        }

        if affinity.PodAffinity != nil {
            // For every soft pod affinity term of <pod>, if <existingPod> matches the term,
            // increment <counts> for every node in the cluster with the same <term.TopologyKey>
            // value as that of <existingPod>'s node by the term's weight.
            terms := affinity.PodAffinity.PreferredDuringSchedulingIgnoredDuringExecution
            if err := processTerms(terms, pod, existingPod, existingPodNode, 1); err != nil {
                return nil, err
            }
        }
        if affinity.PodAntiAffinity != nil {
            // For every soft pod anti-affinity term of <pod>, if <existingPod> matches the term,
            // decrement <counts> for every node in the cluster with the same <term.TopologyKey>
            // value as that of <existingPod>'s node by the term's weight.
            terms := affinity.PodAntiAffinity.PreferredDuringSchedulingIgnoredDuringExecution
            if err := processTerms(terms, pod, existingPod, existingPodNode, -1); err != nil {
                return nil, err
            }
        }

        if existingPodAffinity.PodAffinity != nil {
            // For every hard pod affinity term of <existingPod>, if <pod> matches the term,
            // increment <counts> for every node in the cluster with the same <term.TopologyKey>
            // value as that of <existingPod>'s node by the constant <ipa.hardPodAffinityWeight>.
            if ipa.hardPodAffinityWeight > 0 {
                terms := existingPodAffinity.PodAffinity.RequiredDuringSchedulingIgnoredDuringExecution
                // TODO: Uncomment this block when RequiredDuringSchedulingRequiredDuringExecution is implemented.
                //if len(existingPodAffinity.PodAffinity.RequiredDuringSchedulingRequiredDuringExecution) != 0 {
                //    terms = append(terms, existingPodAffinity.PodAffinity.RequiredDuringSchedulingRequiredDuringExecution...)
                //}
                for _, term := range terms {
                    if err := processTerm(&term, existingPod, pod, existingPodNode, float64(ipa.hardPodAffinityWeight)); err != nil {
                        return nil, err
                    }
                }
            }
            // For every soft pod affinity term of <existingPod>, if <pod> matches the term,
            // increment <counts> for every node in the cluster with the same <term.TopologyKey>
            // value as that of <existingPod>'s node by the term's weight.
            terms := existingPodAffinity.PodAffinity.PreferredDuringSchedulingIgnoredDuringExecution
            if err := processTerms(terms, existingPod, pod, existingPodNode, 1); err != nil {
                return nil, err
            }
        }
        if existingPodAffinity.PodAntiAffinity != nil {
            // For every soft pod anti-affinity term of <existingPod>, if <pod> matches the term,
            // decrement <counts> for every node in the cluster with the same <term.TopologyKey>
            // value as that of <existingPod>'s node by the term's weight.
            terms := existingPodAffinity.PodAntiAffinity.PreferredDuringSchedulingIgnoredDuringExecution
            if err := processTerms(terms, existingPod, pod, existingPodNode, -1); err != nil {
                return nil, err
            }
        }
    }

    for _, node := range nodes {
        if counts[node.Name] > maxCount {
            maxCount = counts[node.Name]
        }
        if counts[node.Name] < minCount {
            minCount = counts[node.Name]
        }
    }

    // calculate final priority score for each node
    result := make(schedulerapi.HostPriorityList, 0, len(nodes))
    for _, node := range nodes {
        fScore := float64(0)
        if (maxCount - minCount) > 0 {
            fScore = 10 * ((counts[node.Name] - minCount) / (maxCount - minCount))
        }
        result = append(result, schedulerapi.HostPriority{Host: node.Name, Score: int(fScore)})
        if glog.V(10) {
            // We explicitly don't do glog.V(10).Infof() to avoid computing all the parameters if this is
            // not logged. There is visible performance gain from it.
            glog.Infof("%v -> %v: InterPodAffinityPriority, Score: (%d)", pod.Name, node.Name, int(fScore))
        }
    }
    return result, nil
}