func getMatchingAntiAffinityTerms(pod *v1.Pod, nodeInfoMap map[string]*schedulercache.NodeInfo) ([]matchingPodAntiAffinityTerm, error) {
	allNodeNames := make([]string, 0, len(nodeInfoMap))
	for name := range nodeInfoMap {
		allNodeNames = append(allNodeNames, name)
	}

	var lock sync.Mutex
	var result []matchingPodAntiAffinityTerm
	var firstError error
	appendResult := func(toAppend []matchingPodAntiAffinityTerm) {
		lock.Lock()
		defer lock.Unlock()
		result = append(result, toAppend...)
	}
	catchError := func(err error) {
		lock.Lock()
		defer lock.Unlock()
		if firstError == nil {
			firstError = err
		}
	}

	processNode := func(i int) {
		nodeInfo := nodeInfoMap[allNodeNames[i]]
		node := nodeInfo.Node()
		if node == nil {
			catchError(fmt.Errorf("node not found"))
			return
		}
		var nodeResult []matchingPodAntiAffinityTerm
		for _, existingPod := range nodeInfo.PodsWithAffinity() {
			affinity, err := v1.GetAffinityFromPodAnnotations(existingPod.Annotations)
			if err != nil {
				catchError(err)
				return
			}
			if affinity == nil {
				continue
			}
			for _, term := range getPodAntiAffinityTerms(affinity.PodAntiAffinity) {
				term := term // copy the loop variable so the pointer stored below does not alias it
				namespaces := priorityutil.GetNamespacesFromPodAffinityTerm(pod, &term)
				selector, err := metav1.LabelSelectorAsSelector(term.LabelSelector)
				if err != nil {
					catchError(err)
					return
				}
				match := priorityutil.PodMatchesTermsNamespaceAndSelector(pod, namespaces, selector)
				if match {
					nodeResult = append(nodeResult, matchingPodAntiAffinityTerm{term: &term, node: node})
				}
			}
		}
		if len(nodeResult) > 0 {
			appendResult(nodeResult)
		}
	}
	workqueue.Parallelize(16, len(allNodeNames), processNode)
	return result, firstError
}
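// The function above fans work out per node with workqueue.Parallelize and funnels results
// back through a mutex (appendResult), remembering only the first error (catchError). Below
// is a minimal, self-contained sketch of that pattern; the local parallelize helper only
// approximates workqueue.Parallelize's bounded-worker semantics, and squareEvens is a
// purely illustrative stand-in for the per-node work.
package main

import (
	"fmt"
	"sync"
)

// parallelize runs doWorkPiece for pieces 0..pieces-1 using at most workers goroutines.
func parallelize(workers, pieces int, doWorkPiece func(piece int)) {
	toProcess := make(chan int, pieces)
	for i := 0; i < pieces; i++ {
		toProcess <- i
	}
	close(toProcess)

	var wg sync.WaitGroup
	wg.Add(workers)
	for w := 0; w < workers; w++ {
		go func() {
			defer wg.Done()
			for piece := range toProcess {
				doWorkPiece(piece)
			}
		}()
	}
	wg.Wait()
}

func main() {
	inputs := []int{1, 2, 3, 4, 5, 6, 7, 8}

	var (
		lock       sync.Mutex
		result     []int
		firstError error
	)
	appendResult := func(toAppend []int) {
		lock.Lock()
		defer lock.Unlock()
		result = append(result, toAppend...)
	}
	catchError := func(err error) {
		lock.Lock()
		defer lock.Unlock()
		if firstError == nil {
			firstError = err
		}
	}

	// squareEvens keeps the square of every even input and rejects zero, mirroring the
	// per-node "collect matches or record the first error" structure above.
	squareEvens := func(i int) {
		n := inputs[i]
		if n == 0 {
			catchError(fmt.Errorf("zero is not allowed"))
			return
		}
		if n%2 == 0 {
			appendResult([]int{n * n})
		}
	}

	parallelize(4, len(inputs), squareEvens)
	fmt.Println(result, firstError)
}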
// Filters the nodes to find the ones that fit based on the given predicate functions
// Each node is passed through the predicate functions to determine if it is a fit
func findNodesThatFit(
	pod *api.Pod,
	nodeNameToInfo map[string]*schedulercache.NodeInfo,
	nodes []*api.Node,
	predicateFuncs map[string]algorithm.FitPredicate,
	extenders []algorithm.SchedulerExtender) ([]*api.Node, FailedPredicateMap, error) {
	var filtered []*api.Node
	failedPredicateMap := FailedPredicateMap{}

	if len(predicateFuncs) == 0 {
		filtered = nodes
	} else {
		// Create filtered list with enough space to avoid growing it
		// and allow assigning.
		filtered = make([]*api.Node, len(nodes))
		meta := predicates.PredicateMetadata(pod, nodeNameToInfo)
		errs := []error{}

		var predicateResultLock sync.Mutex
		var filteredLen int32
		checkNode := func(i int) {
			nodeName := nodes[i].Name
			fits, failedPredicate, err := podFitsOnNode(pod, meta, nodeNameToInfo[nodeName], predicateFuncs)
			if err != nil {
				predicateResultLock.Lock()
				errs = append(errs, err)
				predicateResultLock.Unlock()
				return
			}
			if fits {
				filtered[atomic.AddInt32(&filteredLen, 1)-1] = nodes[i]
			} else {
				predicateResultLock.Lock()
				failedPredicateMap[nodeName] = failedPredicate
				predicateResultLock.Unlock()
			}
		}
		workqueue.Parallelize(16, len(nodes), checkNode)
		filtered = filtered[:filteredLen]
		if len(errs) > 0 {
			return []*api.Node{}, FailedPredicateMap{}, errors.NewAggregate(errs)
		}
	}

	if len(filtered) > 0 && len(extenders) != 0 {
		for _, extender := range extenders {
			filteredList, err := extender.Filter(pod, filtered)
			if err != nil {
				return []*api.Node{}, FailedPredicateMap{}, err
			}
			filtered = filteredList
			if len(filtered) == 0 {
				break
			}
		}
	}
	return filtered, failedPredicateMap, nil
}
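// One detail worth noting above: instead of appending to the result under a lock, each
// worker reserves a slot in a preallocated slice via atomic.AddInt32 and the slice is
// trimmed to the claimed length afterwards. Below is a minimal sketch of that lock-free
// fill using only the standard library; keepEven and the node names are illustrative only.
package main

import (
	"fmt"
	"sync"
	"sync/atomic"
)

func main() {
	nodes := []string{"node-a", "node-b", "node-c", "node-d", "node-e"}

	// Preallocate to full length so every worker has a slot available.
	filtered := make([]string, len(nodes))
	var filteredLen int32

	keepEven := func(i int) bool { return i%2 == 0 } // stand-in for podFitsOnNode

	var wg sync.WaitGroup
	for i := range nodes {
		wg.Add(1)
		go func(i int) {
			defer wg.Done()
			if keepEven(i) {
				// AddInt32 returns the new length, so new-1 is this worker's unique index.
				filtered[atomic.AddInt32(&filteredLen, 1)-1] = nodes[i]
			}
		}(i)
	}
	wg.Wait()

	// Trim to the number of slots actually claimed; order is nondeterministic.
	filtered = filtered[:filteredLen]
	fmt.Println(filtered)
}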
func createPod(client clientset.Interface, namespace string, podCount int, podTemplate *api.Pod) error {
	var createError error
	lock := sync.Mutex{}
	createPodFunc := func(i int) {
		if err := makeCreatePod(client, namespace, podTemplate); err != nil {
			lock.Lock()
			defer lock.Unlock()
			createError = err
		}
	}

	if podCount < 30 {
		workqueue.Parallelize(podCount, podCount, createPodFunc)
	} else {
		workqueue.Parallelize(30, podCount, createPodFunc)
	}
	return createError
}
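// The branch above just caps the worker count at the amount of work, i.e. it uses
// min(podCount, 30) workers. A tiny sketch of the same idea; boundedWorkers is a
// hypothetical helper name, while the ceiling of 30 comes from the snippet above.
package main

import "fmt"

// boundedWorkers returns the worker count to pass to a Parallelize-style helper:
// never more workers than pieces of work, and never more than the given ceiling.
func boundedWorkers(pieces, ceiling int) int {
	if pieces < ceiling {
		return pieces
	}
	return ceiling
}

func main() {
	for _, podCount := range []int{5, 30, 200} {
		fmt.Printf("podCount=%d -> workers=%d\n", podCount, boundedWorkers(podCount, 30))
	}
}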
// Filters the nodes to find the ones that fit based on the given predicate functions
// Each node is passed through the predicate functions to determine if it is a fit
func findNodesThatFit(pod *api.Pod, nodeNameToInfo map[string]*schedulercache.NodeInfo, predicateFuncs map[string]algorithm.FitPredicate, nodes api.NodeList, extenders []algorithm.SchedulerExtender) (api.NodeList, FailedPredicateMap, error) {
	// Create filtered list with enough space to avoid growing it.
	filtered := make([]api.Node, 0, len(nodes.Items))
	failedPredicateMap := FailedPredicateMap{}

	if len(predicateFuncs) == 0 {
		filtered = nodes.Items
	} else {
		predicateResultLock := sync.Mutex{}
		errs := []error{}
		meta := predicates.PredicateMetadata(pod)
		checkNode := func(i int) {
			nodeName := nodes.Items[i].Name
			fits, failedPredicate, err := podFitsOnNode(pod, meta, nodeNameToInfo[nodeName], predicateFuncs)

			predicateResultLock.Lock()
			defer predicateResultLock.Unlock()
			if err != nil {
				errs = append(errs, err)
				return
			}
			if fits {
				filtered = append(filtered, nodes.Items[i])
			} else {
				failedPredicateMap[nodeName] = failedPredicate
			}
		}
		workqueue.Parallelize(16, len(nodes.Items), checkNode)
		if len(errs) > 0 {
			return api.NodeList{}, FailedPredicateMap{}, errors.NewAggregate(errs)
		}
	}

	if len(filtered) > 0 && len(extenders) != 0 {
		for _, extender := range extenders {
			filteredList, err := extender.Filter(pod, &api.NodeList{Items: filtered})
			if err != nil {
				return api.NodeList{}, FailedPredicateMap{}, err
			}
			filtered = filteredList.Items
			if len(filtered) == 0 {
				break
			}
		}
	}
	return api.NodeList{Items: filtered}, failedPredicateMap, nil
}
// makePodsFromRC will create a ReplicationController object and
// a given number of pods (imitating the controller).
func makePodsFromRC(c client.Interface, name string, podCount int) {
	rc := &api.ReplicationController{
		ObjectMeta: api.ObjectMeta{
			Name: name,
		},
		Spec: api.ReplicationControllerSpec{
			Replicas: int32(podCount),
			Selector: map[string]string{"name": name},
			Template: &api.PodTemplateSpec{
				ObjectMeta: api.ObjectMeta{
					Labels: map[string]string{"name": name},
				},
				Spec: makePodSpec(),
			},
		},
	}
	if _, err := c.ReplicationControllers("default").Create(rc); err != nil {
		glog.Fatalf("unexpected error: %v", err)
	}

	basePod := &api.Pod{
		ObjectMeta: api.ObjectMeta{
			GenerateName: "scheduler-test-pod-",
			Labels:       map[string]string{"name": name},
		},
		Spec: makePodSpec(),
	}
	createPod := func(i int) {
		for {
			if _, err := c.Pods("default").Create(basePod); err == nil {
				break
			}
		}
	}
	workqueue.Parallelize(30, podCount, createPod)
}
		framework.PrintLatencies(watchLag, "worst watch latencies")
		framework.PrintLatencies(schedToWatchLag, "worst scheduled-to-end total latencies")
		framework.PrintLatencies(e2eLag, "worst e2e total latencies")

		// Test whether e2e pod startup time is acceptable.
		podStartupLatency := framework.PodStartupLatency{Latency: framework.ExtractLatencyMetrics(e2eLag)}
		framework.ExpectNoError(framework.VerifyPodStartupLatency(podStartupLatency))

		framework.LogSuspiciousLatency(startupLag, e2eLag, nodeCount, c)

		By("Removing additional replication controllers")
		deleteRC := func(i int) {
			name := additionalPodsPrefix + "-" + strconv.Itoa(i+1)
			framework.ExpectNoError(framework.DeleteRCAndWaitForGC(c, ns, name))
		}
		workqueue.Parallelize(16, nodeCount, deleteRC)
	}

	cleanupDensityTest(dConfig)
})
}

// Calculate total number of pods from each node's max-pod
It("[Feature:ManualPerformance] should allow running maximum capacity pods on nodes", func() {
	totalPods = 0
	for _, n := range nodes.Items {
		totalPods += int(n.Status.Capacity.Pods().Value())
	}
	totalPods -= framework.WaitForStableCluster(c, masters)

	fileHndl, err := os.Create(fmt.Sprintf(framework.TestContext.OutputDir+"/%s/pod_states.csv", uuid))
// compute a sum by iterating through the elements of weightedPodAffinityTerm and adding
// "weight" to the sum if the corresponding PodAffinityTerm is satisfied for
// that node; the node(s) with the highest sum are the most preferred.
// Symmetry needs to be considered for preferredDuringSchedulingIgnoredDuringExecution from podAffinity & podAntiAffinity,
// and for hard requirements from podAffinity.
func (ipa *InterPodAffinity) CalculateInterPodAffinityPriority(pod *v1.Pod, nodeNameToInfo map[string]*schedulercache.NodeInfo, nodes []*v1.Node) (schedulerapi.HostPriorityList, error) {
	affinity := pod.Spec.Affinity
	hasAffinityConstraints := affinity != nil && affinity.PodAffinity != nil
	hasAntiAffinityConstraints := affinity != nil && affinity.PodAntiAffinity != nil

	allNodeNames := make([]string, 0, len(nodeNameToInfo))
	for name := range nodeNameToInfo {
		allNodeNames = append(allNodeNames, name)
	}

	// convert the topology key based weights to the node name based weights
	var maxCount float64
	var minCount float64
	// priorityMap stores the mapping from node name to so-far computed score of
	// the node.
	pm := newPodAffinityPriorityMap(nodes, ipa.failureDomains)

	processPod := func(existingPod *v1.Pod) error {
		existingPodNode, err := ipa.info.GetNodeInfo(existingPod.Spec.NodeName)
		if err != nil {
			return err
		}
		existingPodAffinity := existingPod.Spec.Affinity
		existingHasAffinityConstraints := existingPodAffinity != nil && existingPodAffinity.PodAffinity != nil
		existingHasAntiAffinityConstraints := existingPodAffinity != nil && existingPodAffinity.PodAntiAffinity != nil

		if hasAffinityConstraints {
			// For every soft pod affinity term of <pod>, if <existingPod> matches the term,
			// increment <pm.counts> for every node in the cluster with the same <term.TopologyKey>
			// value as that of <existingPod>'s node by the term's weight.
			terms := affinity.PodAffinity.PreferredDuringSchedulingIgnoredDuringExecution
			pm.processTerms(terms, pod, existingPod, existingPodNode, 1)
		}
		if hasAntiAffinityConstraints {
			// For every soft pod anti-affinity term of <pod>, if <existingPod> matches the term,
			// decrement <pm.counts> for every node in the cluster with the same <term.TopologyKey>
			// value as that of <existingPod>'s node by the term's weight.
			terms := affinity.PodAntiAffinity.PreferredDuringSchedulingIgnoredDuringExecution
			pm.processTerms(terms, pod, existingPod, existingPodNode, -1)
		}

		if existingHasAffinityConstraints {
			// For every hard pod affinity term of <existingPod>, if <pod> matches the term,
			// increment <pm.counts> for every node in the cluster with the same <term.TopologyKey>
			// value as that of <existingPod>'s node by the constant <ipa.hardPodAffinityWeight>.
			if ipa.hardPodAffinityWeight > 0 {
				terms := existingPodAffinity.PodAffinity.RequiredDuringSchedulingIgnoredDuringExecution
				// TODO: Uncomment this block when implementing RequiredDuringSchedulingRequiredDuringExecution.
				//if len(existingPodAffinity.PodAffinity.RequiredDuringSchedulingRequiredDuringExecution) != 0 {
				//	terms = append(terms, existingPodAffinity.PodAffinity.RequiredDuringSchedulingRequiredDuringExecution...)
				//}
				for _, term := range terms {
					pm.processTerm(&term, existingPod, pod, existingPodNode, float64(ipa.hardPodAffinityWeight))
				}
			}
			// For every soft pod affinity term of <existingPod>, if <pod> matches the term,
			// increment <pm.counts> for every node in the cluster with the same <term.TopologyKey>
			// value as that of <existingPod>'s node by the term's weight.
			terms := existingPodAffinity.PodAffinity.PreferredDuringSchedulingIgnoredDuringExecution
			pm.processTerms(terms, existingPod, pod, existingPodNode, 1)
		}
		if existingHasAntiAffinityConstraints {
			// For every soft pod anti-affinity term of <existingPod>, if <pod> matches the term,
			// decrement <pm.counts> for every node in the cluster with the same <term.TopologyKey>
			// value as that of <existingPod>'s node by the term's weight.
			terms := existingPodAffinity.PodAntiAffinity.PreferredDuringSchedulingIgnoredDuringExecution
			pm.processTerms(terms, existingPod, pod, existingPodNode, -1)
		}
		return nil
	}

	processNode := func(i int) {
		nodeInfo := nodeNameToInfo[allNodeNames[i]]
		if hasAffinityConstraints || hasAntiAffinityConstraints {
			// We need to process all the nodes.
			for _, existingPod := range nodeInfo.Pods() {
				if err := processPod(existingPod); err != nil {
					pm.setError(err)
				}
			}
		} else {
			// The pod doesn't have any constraints - we need to check only existing
			// ones that have some.
			for _, existingPod := range nodeInfo.PodsWithAffinity() {
				if err := processPod(existingPod); err != nil {
					pm.setError(err)
				}
			}
		}
	}
	workqueue.Parallelize(16, len(allNodeNames), processNode)
	if pm.firstError != nil {
		return nil, pm.firstError
	}

	for _, node := range nodes {
		if pm.counts[node.Name] > maxCount {
			maxCount = pm.counts[node.Name]
		}
		if pm.counts[node.Name] < minCount {
			minCount = pm.counts[node.Name]
		}
	}

	// calculate final priority score for each node
	result := make(schedulerapi.HostPriorityList, 0, len(nodes))
	for _, node := range nodes {
		fScore := float64(0)
		if (maxCount - minCount) > 0 {
			fScore = 10 * ((pm.counts[node.Name] - minCount) / (maxCount - minCount))
		}
		result = append(result, schedulerapi.HostPriority{Host: node.Name, Score: int(fScore)})
		if glog.V(10) {
			// We explicitly don't do glog.V(10).Infof() to avoid computing all the parameters if this is
			// not logged. There is visible performance gain from it.
			glog.V(10).Infof("%v -> %v: InterPodAffinityPriority, Score: (%d)", pod.Name, node.Name, int(fScore))
		}
	}
	return result, nil
}
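// The final loop above rescales each node's accumulated affinity sum onto the scheduler's
// 0-10 range with fScore = 10 * (count - minCount) / (maxCount - minCount). Below is a
// minimal, self-contained sketch of just that normalization with made-up counts; the
// division-by-zero guard mirrors the one above.
package main

import "fmt"

func normalize(counts map[string]float64) map[string]int {
	var minCount, maxCount float64
	for _, c := range counts {
		if c > maxCount {
			maxCount = c
		}
		if c < minCount {
			minCount = c
		}
	}
	scores := make(map[string]int, len(counts))
	for node, c := range counts {
		fScore := float64(0)
		if maxCount-minCount > 0 {
			fScore = 10 * ((c - minCount) / (maxCount - minCount))
		}
		scores[node] = int(fScore)
	}
	return scores
}

func main() {
	// Positive sums come from matching affinity terms, negative ones from anti-affinity.
	counts := map[string]float64{"node-a": 6, "node-b": 0, "node-c": -3}
	fmt.Println(normalize(counts)) // map[node-a:10 node-b:3 node-c:0]
}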
// CalculateSpreadPriority spreads pods across hosts and zones, considering pods belonging to the same service or replication controller.
// When a pod is scheduled, it looks for services or RCs that match the pod, then finds existing pods that match those selectors.
// It favors nodes that have fewer existing matching pods.
// i.e. it pushes the scheduler towards a node where there's the smallest number of
// pods which match the same service selectors or RC selectors as the pod being scheduled.
// Where zone information is included on the nodes, it favors nodes in zones with fewer existing matching pods.
func (s *SelectorSpread) CalculateSpreadPriority(pod *api.Pod, nodeNameToInfo map[string]*schedulercache.NodeInfo, nodes []*api.Node) (schedulerapi.HostPriorityList, error) {
	selectors := make([]labels.Selector, 0, 3)
	if services, err := s.serviceLister.GetPodServices(pod); err == nil {
		for _, service := range services {
			selectors = append(selectors, labels.SelectorFromSet(service.Spec.Selector))
		}
	}
	if rcs, err := s.controllerLister.GetPodControllers(pod); err == nil {
		for _, rc := range rcs {
			selectors = append(selectors, labels.SelectorFromSet(rc.Spec.Selector))
		}
	}
	if rss, err := s.replicaSetLister.GetPodReplicaSets(pod); err == nil {
		for _, rs := range rss {
			if selector, err := unversioned.LabelSelectorAsSelector(rs.Spec.Selector); err == nil {
				selectors = append(selectors, selector)
			}
		}
	}

	// Count similar pods by node
	countsByNodeName := make(map[string]float32, len(nodes))
	countsByZone := make(map[string]float32, 10)
	maxCountByNodeName := float32(0)
	countsByNodeNameLock := sync.Mutex{}

	if len(selectors) > 0 {
		processNodeFunc := func(i int) {
			nodeName := nodes[i].Name
			count := float32(0)
			for _, nodePod := range nodeNameToInfo[nodeName].Pods() {
				if pod.Namespace != nodePod.Namespace {
					continue
				}
				// When we are replacing a failed pod, we often see the previous
				// deleted version while scheduling the replacement.
				// Ignore the previous deleted version for spreading purposes
				// (it can still be considered for resource restrictions etc.)
				if nodePod.DeletionTimestamp != nil {
					glog.V(4).Infof("skipping pending-deleted pod: %s/%s", nodePod.Namespace, nodePod.Name)
					continue
				}
				matches := false
				for _, selector := range selectors {
					if selector.Matches(labels.Set(nodePod.ObjectMeta.Labels)) {
						matches = true
						break
					}
				}
				if matches {
					count++
				}
			}
			zoneId := utilnode.GetZoneKey(nodes[i])

			countsByNodeNameLock.Lock()
			defer countsByNodeNameLock.Unlock()
			countsByNodeName[nodeName] = count
			if count > maxCountByNodeName {
				maxCountByNodeName = count
			}
			if zoneId != "" {
				countsByZone[zoneId] += count
			}
		}
		workqueue.Parallelize(16, len(nodes), processNodeFunc)
	}

	// Aggregate by-zone information
	// Compute the maximum number of pods hosted in any zone
	haveZones := len(countsByZone) != 0
	maxCountByZone := float32(0)
	for _, count := range countsByZone {
		if count > maxCountByZone {
			maxCountByZone = count
		}
	}

	result := make(schedulerapi.HostPriorityList, 0, len(nodes))
	//score int - scale of 0-maxPriority
	// 0 being the lowest priority and maxPriority being the highest
	for _, node := range nodes {
		// initializing to the default/max node score of maxPriority
		fScore := maxPriority
		if maxCountByNodeName > 0 {
			fScore = maxPriority * ((maxCountByNodeName - countsByNodeName[node.Name]) / maxCountByNodeName)
		}

		// If there is zone information present, incorporate it
		if haveZones {
			zoneId := utilnode.GetZoneKey(node)
			if zoneId != "" {
				zoneScore := maxPriority * ((maxCountByZone - countsByZone[zoneId]) / maxCountByZone)
				fScore = (fScore * (1.0 - zoneWeighting)) + (zoneWeighting * zoneScore)
			}
		}
		result = append(result, schedulerapi.HostPriority{Host: node.Name, Score: int(fScore)})
		if glog.V(10) {
			// We explicitly don't do glog.V(10).Infof() to avoid computing all the parameters if this is
			// not logged. There is visible performance gain from it.
			glog.V(10).Infof(
				"%v -> %v: SelectorSpreadPriority, Score: (%d)", pod.Name, node.Name, int(fScore),
			)
		}
	}
	return result, nil
}
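// When zone information is present, the score above is a blend of node-level and zone-level
// spreading: fScore = nodeScore*(1-zoneWeighting) + zoneWeighting*zoneScore. Below is a small
// sketch of that blend; the zoneWeighting of 2/3 and maxPriority of 10 are assumptions made
// for the example, not values taken from the snippet above.
package main

import "fmt"

const (
	maxPriority   = float32(10)
	zoneWeighting = float32(2.0 / 3.0)
)

// spreadScore turns "fewer matching pods is better" counts into a 0-maxPriority score.
func spreadScore(count, maxCount float32) float32 {
	if maxCount == 0 {
		return maxPriority
	}
	return maxPriority * ((maxCount - count) / maxCount)
}

func main() {
	nodeScore := spreadScore(1, 4) // this node hosts 1 of up to 4 matching pods
	zoneScore := spreadScore(5, 8) // its zone hosts 5 of up to 8 matching pods

	blended := nodeScore*(1-zoneWeighting) + zoneWeighting*zoneScore
	fmt.Printf("node=%.2f zone=%.2f blended=%.2f\n", nodeScore, zoneScore, blended)
}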
// Prioritizes the nodes by running the individual priority functions in parallel.
// Each priority function is expected to set a score of 0-10
// 0 is the lowest priority score (least preferred node) and 10 is the highest
// Each priority function can also have its own weight
// The node scores returned by the priority function are multiplied by the weights to get weighted scores
// All scores are finally combined (added) to get the total weighted scores of all nodes
func PrioritizeNodes(
	pod *api.Pod,
	nodeNameToInfo map[string]*schedulercache.NodeInfo,
	meta interface{},
	priorityConfigs []algorithm.PriorityConfig,
	nodes []*api.Node,
	extenders []algorithm.SchedulerExtender,
) (schedulerapi.HostPriorityList, error) {
	// If no priority configs are provided, then the EqualPriority function is applied
	// This is required to generate the priority list in the required format
	if len(priorityConfigs) == 0 && len(extenders) == 0 {
		return EqualPriority(pod, nodeNameToInfo, nodes)
	}

	var (
		mu   = sync.Mutex{}
		wg   = sync.WaitGroup{}
		errs []error
	)
	appendError := func(err error) {
		mu.Lock()
		defer mu.Unlock()
		errs = append(errs, err)
	}

	results := make([]schedulerapi.HostPriorityList, 0, len(priorityConfigs))
	for range priorityConfigs {
		results = append(results, nil)
	}
	for i, priorityConfig := range priorityConfigs {
		if priorityConfig.Function != nil {
			// DEPRECATED
			wg.Add(1)
			go func(index int, config algorithm.PriorityConfig) {
				defer wg.Done()
				var err error
				results[index], err = config.Function(pod, nodeNameToInfo, nodes)
				if err != nil {
					appendError(err)
				}
			}(i, priorityConfig)
		} else {
			results[i] = make(schedulerapi.HostPriorityList, len(nodes))
		}
	}
	processNode := func(index int) {
		nodeInfo := nodeNameToInfo[nodes[index].Name]
		var err error
		for i := range priorityConfigs {
			if priorityConfigs[i].Function != nil {
				continue
			}
			results[i][index], err = priorityConfigs[i].Map(pod, meta, nodeInfo)
			if err != nil {
				appendError(err)
				return
			}
		}
	}
	workqueue.Parallelize(16, len(nodes), processNode)
	for i, priorityConfig := range priorityConfigs {
		if priorityConfig.Reduce == nil {
			continue
		}
		wg.Add(1)
		go func(index int, config algorithm.PriorityConfig) {
			defer wg.Done()
			if err := config.Reduce(pod, results[index]); err != nil {
				appendError(err)
			}
		}(i, priorityConfig)
	}
	// Wait for all computations to be finished.
	wg.Wait()
	if len(errs) != 0 {
		return schedulerapi.HostPriorityList{}, errors.NewAggregate(errs)
	}

	// Summarize all scores.
	result := make(schedulerapi.HostPriorityList, 0, len(nodes))
	// TODO: Consider parallelizing it.
	for i := range nodes {
		result = append(result, schedulerapi.HostPriority{Host: nodes[i].Name, Score: 0})
		for j := range priorityConfigs {
			result[i].Score += results[j][i].Score * priorityConfigs[j].Weight
		}
	}

	if len(extenders) != 0 && nodes != nil {
		combinedScores := make(map[string]int, len(nodeNameToInfo))
		for _, extender := range extenders {
			wg.Add(1)
			go func(ext algorithm.SchedulerExtender) {
				defer wg.Done()
				prioritizedList, weight, err := ext.Prioritize(pod, nodes)
				if err != nil {
					// Prioritization errors from extender can be ignored, let k8s/other extenders determine the priorities
					return
				}
				mu.Lock()
				for i := range *prioritizedList {
					host, score := (*prioritizedList)[i].Host, (*prioritizedList)[i].Score
					combinedScores[host] += score * weight
				}
				mu.Unlock()
			}(extender)
		}
		// wait for all go routines to finish
		wg.Wait()
		for i := range result {
			result[i].Score += combinedScores[result[i].Host]
		}
	}

	if glog.V(10) {
		for i := range result {
			glog.V(10).Infof("Host %s => Score %d", result[i].Host, result[i].Score)
		}
	}
	return result, nil
}
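// Once the Map/Reduce and extender goroutines finish, the code above combines scores as a
// plain weighted sum: each node's total is the sum over priority functions of
// results[j][i].Score * Weight[j], plus whatever the extenders contributed. Below is a
// minimal sketch of that aggregation with two hypothetical priorities and one extender
// score; all names, scores, and weights are illustrative.
package main

import "fmt"

type hostPriority struct {
	Host  string
	Score int
}

func main() {
	nodes := []string{"node-a", "node-b"}

	// One score list per priority function, indexed the same way as nodes.
	results := [][]hostPriority{
		{{"node-a", 7}, {"node-b", 3}}, // e.g. a spreading priority
		{{"node-a", 2}, {"node-b", 9}}, // e.g. a resource-balance priority
	}
	weights := []int{1, 2}

	// Extender scores arrive keyed by host name, already weighted.
	combinedScores := map[string]int{"node-a": 4}

	total := make([]hostPriority, 0, len(nodes))
	for i, name := range nodes {
		score := 0
		for j := range results {
			score += results[j][i].Score * weights[j]
		}
		score += combinedScores[name]
		total = append(total, hostPriority{Host: name, Score: score})
	}
	fmt.Println(total) // [{node-a 15} {node-b 21}]
}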