// evictPods queues an eviction for the provided node name, and returns false if the node is already
// queued for eviction.
func (nc *NodeController) evictPods(node *api.Node) bool {
	if nc.zoneStates[utilnode.GetZoneKey(node)] == stateFullSegmentation {
		return false
	}
	nc.evictorLock.Lock()
	defer nc.evictorLock.Unlock()
	return nc.podEvictor.Add(node.Name)
}

// Returns the list of zones for all Nodes stored in the FakeNodeHandler.
func getZones(nodeHandler *FakeNodeHandler) []string {
	nodes, _ := nodeHandler.List(api.ListOptions{})
	zones := sets.NewString()
	for _, node := range nodes.Items {
		zones.Insert(utilnode.GetZoneKey(&node))
	}
	return zones.List()
}

// cancelPodEviction removes any queued evictions, typically because the node is available again. It
// returns true if an eviction was queued.
func (nc *NodeController) cancelPodEviction(node *v1.Node) bool {
	zone := utilnode.GetZoneKey(node)
	nc.evictorLock.Lock()
	defer nc.evictorLock.Unlock()
	wasDeleting := nc.zonePodEvictor[zone].Remove(node.Name)
	if wasDeleting {
		glog.V(2).Infof("Cancelling pod Eviction on Node: %v", node.Name)
		return true
	}
	return false
}

// Run starts an asynchronous loop that monitors the status of cluster nodes.
func (nc *NodeController) Run() {
	go func() {
		defer utilruntime.HandleCrash()

		if !cache.WaitForCacheSync(wait.NeverStop, nc.nodeInformer.Informer().HasSynced, nc.podInformer.Informer().HasSynced, nc.daemonSetInformer.Informer().HasSynced) {
			utilruntime.HandleError(errors.New("NodeController timed out while waiting for informers to sync..."))
			return
		}

		// Incorporate the results of node status pushed from kubelet to master.
		go wait.Until(func() {
			if err := nc.monitorNodeStatus(); err != nil {
				glog.Errorf("Error monitoring node status: %v", err)
			}
		}, nc.nodeMonitorPeriod, wait.NeverStop)

		// Managing eviction of nodes:
		// When we delete pods off a node, if the node was not empty at the time we then
		// queue an eviction watcher. If we hit an error, retry deletion.
		go wait.Until(func() {
			nc.evictorLock.Lock()
			defer nc.evictorLock.Unlock()
			for k := range nc.zonePodEvictor {
				nc.zonePodEvictor[k].Try(func(value TimedValue) (bool, time.Duration) {
					obj, exists, err := nc.nodeStore.GetByKey(value.Value)
					if err != nil {
						glog.Warningf("Failed to get Node %v from the nodeStore: %v", value.Value, err)
					} else if !exists {
						glog.Warningf("Node %v no longer present in nodeStore!", value.Value)
					} else {
						node, _ := obj.(*v1.Node)
						zone := utilnode.GetZoneKey(node)
						EvictionsNumber.WithLabelValues(zone).Inc()
					}

					nodeUid, _ := value.UID.(string)
					remaining, err := deletePods(nc.kubeClient, nc.recorder, value.Value, nodeUid, nc.daemonSetStore)
					if err != nil {
						utilruntime.HandleError(fmt.Errorf("unable to evict node %q: %v", value.Value, err))
						return false, 0
					}
					if remaining {
						glog.Infof("Pods awaiting deletion due to NodeController eviction")
					}
					return true, 0
				})
			}
		}, nodeEvictionPeriod, wait.NeverStop)
	}()
}

// cancelPodEviction removes any queued evictions, typically because the node is available again. It
// returns true if an eviction was queued.
func (nc *NodeController) cancelPodEviction(node *api.Node) bool {
	zone := utilnode.GetZoneKey(node)
	nc.evictorLock.Lock()
	defer nc.evictorLock.Unlock()
	wasDeleting := nc.zonePodEvictor[zone].Remove(node.Name)
	wasTerminating := nc.zoneTerminationEvictor[zone].Remove(node.Name)
	if wasDeleting || wasTerminating {
		glog.V(2).Infof("Cancelling pod Eviction on Node: %v", node.Name)
		nc.evictions10Minutes.removeEviction(zone, node.Name)
		nc.evictions1Hour.removeEviction(zone, node.Name)
		return true
	}
	return false
}

// evictPods queues an eviction for the provided node name, and returns false if the node is already
// queued for eviction.
func (nc *NodeController) evictPods(node *api.Node) bool {
	nc.evictorLock.Lock()
	defer nc.evictorLock.Unlock()
	foundHealthy := false
	for _, state := range nc.zoneStates {
		if state != stateFullSegmentation {
			foundHealthy = true
			break
		}
	}
	if !foundHealthy {
		return false
	}
	zone := utilnode.GetZoneKey(node)
	return nc.zonePodEvictor[zone].Add(node.Name)
}

// evictPods queues an eviction for the provided node name, and returns false if the node is already
// queued for eviction.
func (nc *NodeController) evictPods(node *v1.Node) bool {
	nc.evictorLock.Lock()
	defer nc.evictorLock.Unlock()
	return nc.zonePodEvictor[utilnode.GetZoneKey(node)].Add(node.Name, string(node.UID))
}

// monitorNodeStatus verifies that node statuses are constantly updated by kubelet, and if not,
// posts "NodeReady==ConditionUnknown". It also evicts all pods if a node is not ready or
// not reachable for a long period of time.
func (nc *NodeController) monitorNodeStatus() error {
	// We are listing nodes from local cache as we can tolerate some small delays
	// comparing to state from etcd and there is eventual consistency anyway.
	nodes, err := nc.nodeStore.List()
	if err != nil {
		return err
	}
	added, deleted := nc.checkForNodeAddedDeleted(&nodes)
	for i := range added {
		glog.V(1).Infof("NodeController observed a new Node: %#v", added[i].Name)
		recordNodeEvent(nc.recorder, added[i].Name, string(added[i].UID), v1.EventTypeNormal, "RegisteredNode", fmt.Sprintf("Registered Node %v in NodeController", added[i].Name))
		nc.knownNodeSet[added[i].Name] = added[i]
		// When adding new Nodes we need to check if a new zone appeared, and if so add a new evictor.
		zone := utilnode.GetZoneKey(added[i])
		if _, found := nc.zonePodEvictor[zone]; !found {
			nc.zonePodEvictor[zone] = NewRateLimitedTimedQueue(
				flowcontrol.NewTokenBucketRateLimiter(nc.evictionLimiterQPS, evictionRateLimiterBurst))
			// Init the metric for the new zone.
			glog.Infof("Initializing eviction metric for zone: %v", zone)
			EvictionsNumber.WithLabelValues(zone).Add(0)
		}
		nc.cancelPodEviction(added[i])
	}

	for i := range deleted {
		glog.V(1).Infof("NodeController observed a Node deletion: %v", deleted[i].Name)
		recordNodeEvent(nc.recorder, deleted[i].Name, string(deleted[i].UID), v1.EventTypeNormal, "RemovingNode", fmt.Sprintf("Removing Node %v from NodeController", deleted[i].Name))
		delete(nc.knownNodeSet, deleted[i].Name)
	}

	zoneToNodeConditions := map[string][]*v1.NodeCondition{}
	for i := range nodes.Items {
		var gracePeriod time.Duration
		var observedReadyCondition v1.NodeCondition
		var currentReadyCondition *v1.NodeCondition
		nodeCopy, err := api.Scheme.DeepCopy(&nodes.Items[i])
		if err != nil {
			utilruntime.HandleError(err)
			continue
		}
		node := nodeCopy.(*v1.Node)
		for rep := 0; rep < nodeStatusUpdateRetry; rep++ {
			gracePeriod, observedReadyCondition, currentReadyCondition, err = nc.tryUpdateNodeStatus(node)
			if err == nil {
				break
			}
			name := node.Name
			node, err = nc.kubeClient.Core().Nodes().Get(name, metav1.GetOptions{})
			if err != nil {
				glog.Errorf("Failed while getting a Node to retry updating NodeStatus. Probably Node %s was deleted.", name)
				break
			}
			time.Sleep(retrySleepTime)
		}
		if err != nil {
			glog.Errorf("Update status of Node %v from NodeController exceeds retry count. "+
				"Skipping - no pods will be evicted.", node.Name)
			continue
		}

		// We do not treat a master node as a part of the cluster for network disruption checking.
		if !system.IsMasterNode(node.Name) {
			zoneToNodeConditions[utilnode.GetZoneKey(node)] = append(zoneToNodeConditions[utilnode.GetZoneKey(node)], currentReadyCondition)
		}

		decisionTimestamp := nc.now()
		if currentReadyCondition != nil {
			// Check eviction timeout against decisionTimestamp.
			if observedReadyCondition.Status == v1.ConditionFalse &&
				decisionTimestamp.After(nc.nodeStatusMap[node.Name].readyTransitionTimestamp.Add(nc.podEvictionTimeout)) {
				if nc.evictPods(node) {
					glog.V(2).Infof("Evicting pods on node %s: %v is later than %v + %v", node.Name, decisionTimestamp, nc.nodeStatusMap[node.Name].readyTransitionTimestamp, nc.podEvictionTimeout)
				}
			}
			if observedReadyCondition.Status == v1.ConditionUnknown &&
				decisionTimestamp.After(nc.nodeStatusMap[node.Name].probeTimestamp.Add(nc.podEvictionTimeout)) {
				if nc.evictPods(node) {
					glog.V(2).Infof("Evicting pods on node %s: %v is later than %v + %v", node.Name, decisionTimestamp, nc.nodeStatusMap[node.Name].readyTransitionTimestamp, nc.podEvictionTimeout-gracePeriod)
				}
			}
			if observedReadyCondition.Status == v1.ConditionTrue {
				if nc.cancelPodEviction(node) {
					glog.V(2).Infof("Node %s is ready again, cancelled pod eviction", node.Name)
				}
			}

			// Report node event.
			if currentReadyCondition.Status != v1.ConditionTrue && observedReadyCondition.Status == v1.ConditionTrue {
				recordNodeStatusChange(nc.recorder, node, "NodeNotReady")
				if err = markAllPodsNotReady(nc.kubeClient, node); err != nil {
					utilruntime.HandleError(fmt.Errorf("Unable to mark all pods NotReady on node %v: %v", node.Name, err))
				}
			}

			// Check with the cloud provider to see if the node still exists. If it
			// doesn't, delete the node immediately.
			if currentReadyCondition.Status != v1.ConditionTrue && nc.cloud != nil {
				exists, err := nc.nodeExistsInCloudProvider(types.NodeName(node.Name))
				if err != nil {
					glog.Errorf("Error determining if node %v exists in cloud: %v", node.Name, err)
					continue
				}
				if !exists {
					glog.V(2).Infof("Deleting node (no longer present in cloud provider): %s", node.Name)
					recordNodeEvent(nc.recorder, node.Name, string(node.UID), v1.EventTypeNormal, "DeletingNode", fmt.Sprintf("Deleting Node %v because it's not present according to cloud provider", node.Name))
					go func(nodeName string) {
						defer utilruntime.HandleCrash()
						// Kubelet is not reporting and Cloud Provider says node
						// is gone. Delete it without worrying about grace
						// periods.
						if err := forcefullyDeleteNode(nc.kubeClient, nodeName); err != nil {
							glog.Errorf("Unable to forcefully delete node %q: %v", nodeName, err)
						}
					}(node.Name)
				}
			}
		}
	}
	nc.handleDisruption(zoneToNodeConditions, &nodes)

	return nil
}

// Run starts an asynchronous loop that monitors the status of cluster nodes.
func (nc *NodeController) Run() {
	go nc.nodeController.Run(wait.NeverStop)
	go nc.podController.Run(wait.NeverStop)
	go nc.daemonSetController.Run(wait.NeverStop)
	if nc.internalPodInformer != nil {
		go nc.internalPodInformer.Run(wait.NeverStop)
	}

	go func() {
		defer utilruntime.HandleCrash()

		if !cache.WaitForCacheSync(wait.NeverStop, nc.nodeController.HasSynced, nc.podController.HasSynced, nc.daemonSetController.HasSynced) {
			utilruntime.HandleError(errors.New("NodeController timed out while waiting for informers to sync..."))
			return
		}

		// Incorporate the results of node status pushed from kubelet to master.
		go wait.Until(func() {
			if err := nc.monitorNodeStatus(); err != nil {
				glog.Errorf("Error monitoring node status: %v", err)
			}
		}, nc.nodeMonitorPeriod, wait.NeverStop)

		// Managing eviction of nodes:
		// 1. when we delete pods off a node, if the node was not empty at the time we then
		//    queue a termination watcher
		//    a. If we hit an error, retry deletion
		// 2. The terminator loop ensures that pods are eventually cleaned and we never
		//    terminate a pod in a time period less than nc.maximumGracePeriod. AddedAt
		//    is the time from which we measure "has this pod been terminating too long",
		//    after which we will delete the pod with grace period 0 (force delete).
		//    a. If we hit errors, retry instantly
		//    b. If there are no pods left terminating, exit
		//    c. If there are pods still terminating, wait for their estimated completion
		//       before retrying
		go wait.Until(func() {
			nc.evictorLock.Lock()
			defer nc.evictorLock.Unlock()
			for k := range nc.zonePodEvictor {
				nc.zonePodEvictor[k].Try(func(value TimedValue) (bool, time.Duration) {
					obj, exists, err := nc.nodeStore.GetByKey(value.Value)
					if err != nil {
						glog.Warningf("Failed to get Node %v from the nodeStore: %v", value.Value, err)
					} else if !exists {
						glog.Warningf("Node %v no longer present in nodeStore!", value.Value)
					} else {
						node, _ := obj.(*api.Node)
						zone := utilnode.GetZoneKey(node)
						EvictionsNumber.WithLabelValues(zone).Inc()
					}

					nodeUid, _ := value.UID.(string)
					remaining, err := deletePods(nc.kubeClient, nc.recorder, value.Value, nodeUid, nc.daemonSetStore)
					if err != nil {
						utilruntime.HandleError(fmt.Errorf("unable to evict node %q: %v", value.Value, err))
						return false, 0
					}
					if remaining {
						nc.zoneTerminationEvictor[k].Add(value.Value, value.UID)
					}
					return true, 0
				})
			}
		}, nodeEvictionPeriod, wait.NeverStop)

		// TODO: replace with a controller that ensures pods that are terminating complete
		// in a particular time period
		go wait.Until(func() {
			nc.evictorLock.Lock()
			defer nc.evictorLock.Unlock()
			for k := range nc.zoneTerminationEvictor {
				nc.zoneTerminationEvictor[k].Try(func(value TimedValue) (bool, time.Duration) {
					nodeUid, _ := value.UID.(string)
					completed, remaining, err := terminatePods(nc.kubeClient, nc.recorder, value.Value, nodeUid, value.AddedAt, nc.maximumGracePeriod)
					if err != nil {
						utilruntime.HandleError(fmt.Errorf("unable to terminate pods on node %q: %v", value.Value, err))
						return false, 0
					}

					if completed {
						glog.V(2).Infof("All pods terminated on %s", value.Value)
						recordNodeEvent(nc.recorder, value.Value, nodeUid, api.EventTypeNormal, "TerminatedAllPods", fmt.Sprintf("Terminated all Pods on Node %s.", value.Value))
						return true, 0
					}

					glog.V(2).Infof("Pods terminating since %s on %q, estimated completion %s", value.AddedAt, value.Value, remaining)
					// clamp very short intervals
					if remaining < nodeEvictionPeriod {
						remaining = nodeEvictionPeriod
					}
					return false, remaining
				})
			}
		}, nodeEvictionPeriod, wait.NeverStop)

		go wait.Until(func() {
			pods, err := nc.podStore.List(labels.Everything())
			if err != nil {
				utilruntime.HandleError(err)
				return
			}
			cleanupOrphanedPods(pods, nc.nodeStore.Store, nc.forcefullyDeletePod)
		}, 30*time.Second, wait.NeverStop)
	}()
}

// CalculateSpreadPriority spreads pods across hosts and zones, considering pods belonging to the same service or replication controller.
// When a pod is scheduled, it looks for services or RCs that match the pod, then finds existing pods that match those selectors.
// It favors nodes that have fewer existing matching pods.
// i.e. it pushes the scheduler towards a node where there's the smallest number of
// pods which match the same service selectors or RC selectors as the pod being scheduled.
// Where zone information is included on the nodes, it favors nodes in zones with fewer existing matching pods.
func (s *SelectorSpread) CalculateSpreadPriority(pod *api.Pod, nodeNameToInfo map[string]*schedulercache.NodeInfo, nodes []*api.Node) (schedulerapi.HostPriorityList, error) {
	selectors := make([]labels.Selector, 0, 3)
	if services, err := s.serviceLister.GetPodServices(pod); err == nil {
		for _, service := range services {
			selectors = append(selectors, labels.SelectorFromSet(service.Spec.Selector))
		}
	}
	if rcs, err := s.controllerLister.GetPodControllers(pod); err == nil {
		for _, rc := range rcs {
			selectors = append(selectors, labels.SelectorFromSet(rc.Spec.Selector))
		}
	}
	if rss, err := s.replicaSetLister.GetPodReplicaSets(pod); err == nil {
		for _, rs := range rss {
			if selector, err := unversioned.LabelSelectorAsSelector(rs.Spec.Selector); err == nil {
				selectors = append(selectors, selector)
			}
		}
	}

	// Count similar pods by node.
	countsByNodeName := make(map[string]float32, len(nodes))
	countsByZone := make(map[string]float32, 10)
	maxCountByNodeName := float32(0)
	countsByNodeNameLock := sync.Mutex{}

	if len(selectors) > 0 {
		processNodeFunc := func(i int) {
			nodeName := nodes[i].Name
			count := float32(0)
			for _, nodePod := range nodeNameToInfo[nodeName].Pods() {
				if pod.Namespace != nodePod.Namespace {
					continue
				}
				// When we are replacing a failed pod, we often see the previous
				// deleted version while scheduling the replacement.
				// Ignore the previous deleted version for spreading purposes
				// (it can still be considered for resource restrictions etc.)
				if nodePod.DeletionTimestamp != nil {
					glog.V(4).Infof("skipping pending-deleted pod: %s/%s", nodePod.Namespace, nodePod.Name)
					continue
				}
				matches := false
				for _, selector := range selectors {
					if selector.Matches(labels.Set(nodePod.ObjectMeta.Labels)) {
						matches = true
						break
					}
				}
				if matches {
					count++
				}
			}
			zoneId := utilnode.GetZoneKey(nodes[i])

			countsByNodeNameLock.Lock()
			defer countsByNodeNameLock.Unlock()
			countsByNodeName[nodeName] = count
			if count > maxCountByNodeName {
				maxCountByNodeName = count
			}
			if zoneId != "" {
				countsByZone[zoneId] += count
			}
		}
		workqueue.Parallelize(16, len(nodes), processNodeFunc)
	}

	// Aggregate by-zone information
	// Compute the maximum number of pods hosted in any zone.
	haveZones := len(countsByZone) != 0
	maxCountByZone := float32(0)
	for _, count := range countsByZone {
		if count > maxCountByZone {
			maxCountByZone = count
		}
	}

	result := make(schedulerapi.HostPriorityList, 0, len(nodes))
	// score int - scale of 0-maxPriority
	// 0 being the lowest priority and maxPriority being the highest
	for _, node := range nodes {
		// initializing to the default/max node score of maxPriority
		fScore := maxPriority
		if maxCountByNodeName > 0 {
			fScore = maxPriority * ((maxCountByNodeName - countsByNodeName[node.Name]) / maxCountByNodeName)
		}

		// If there is zone information present, incorporate it.
		if haveZones {
			zoneId := utilnode.GetZoneKey(node)
			if zoneId != "" {
				zoneScore := maxPriority * ((maxCountByZone - countsByZone[zoneId]) / maxCountByZone)
				fScore = (fScore * (1.0 - zoneWeighting)) + (zoneWeighting * zoneScore)
			}
		}
		result = append(result, schedulerapi.HostPriority{Host: node.Name, Score: int(fScore)})
		if glog.V(10) {
			// We explicitly don't do glog.V(10).Infof() to avoid computing all the parameters if this is
			// not logged. There is visible performance gain from it.
			glog.V(10).Infof(
				"%v -> %v: SelectorSpreadPriority, Score: (%d)", pod.Name, node.Name, int(fScore),
			)
		}
	}
	return result, nil
}

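// Worked example (a hedged sketch, not part of the original source): assuming the usual
// scheduler constants maxPriority = 10 and zoneWeighting = 2.0/3.0, a node carrying 2 of
// the 4 matching pods found on the busiest node, in a zone carrying 6 of the 8 matching
// pods found in the busiest zone, scores:
//   nodeScore = 10 * (4-2)/4           = 5.0
//   zoneScore = 10 * (8-6)/8           = 2.5
//   fScore    = 5.0*(1/3) + (2/3)*2.5  ≈ 3.33  -> integer score 3
// The standalone program below reproduces that arithmetic; all names are illustrative only.
package main

import "fmt"

const (
	maxPriority   = float32(10)      // assumed 0-10 priority scale
	zoneWeighting = float32(2.0 / 3.0) // assumed zone/node blend weight
)

// blendedScore mirrors the scoring expression in CalculateSpreadPriority for one node.
func blendedScore(countOnNode, maxCountByNodeName, countInZone, maxCountByZone float32) float32 {
	fScore := maxPriority
	if maxCountByNodeName > 0 {
		fScore = maxPriority * ((maxCountByNodeName - countOnNode) / maxCountByNodeName)
	}
	if maxCountByZone > 0 {
		zoneScore := maxPriority * ((maxCountByZone - countInZone) / maxCountByZone)
		fScore = fScore*(1.0-zoneWeighting) + zoneWeighting*zoneScore
	}
	return fScore
}

func main() {
	fmt.Println(int(blendedScore(2, 4, 6, 8))) // prints 3
}
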
// monitorNodeStatus verifies that node statuses are constantly updated by kubelet, and if not,
// posts "NodeReady==ConditionUnknown". It also evicts all pods if a node is not ready or
// not reachable for a long period of time.
func (nc *NodeController) monitorNodeStatus() error {
	nodes, err := nc.kubeClient.Core().Nodes().List(api.ListOptions{})
	if err != nil {
		return err
	}
	added, deleted := nc.checkForNodeAddedDeleted(nodes)
	for i := range added {
		glog.V(1).Infof("NodeController observed a new Node: %#v", added[i].Name)
		recordNodeEvent(nc.recorder, added[i].Name, api.EventTypeNormal, "RegisteredNode", fmt.Sprintf("Registered Node %v in NodeController", added[i].Name))
		nc.knownNodeSet[added[i].Name] = added[i]
		// When adding new Nodes we need to check if a new zone appeared, and if so add a new evictor.
		zone := utilnode.GetZoneKey(added[i])
		if _, found := nc.zonePodEvictor[zone]; !found {
			nc.zonePodEvictor[zone] = NewRateLimitedTimedQueue(
				flowcontrol.NewTokenBucketRateLimiter(nc.evictionLimiterQPS, evictionRateLimiterBurst))
		}
		if _, found := nc.zoneTerminationEvictor[zone]; !found {
			nc.zoneTerminationEvictor[zone] = NewRateLimitedTimedQueue(
				flowcontrol.NewTokenBucketRateLimiter(nc.evictionLimiterQPS, evictionRateLimiterBurst))
		}
		nc.cancelPodEviction(added[i])
	}

	for i := range deleted {
		glog.V(1).Infof("NodeController observed a Node deletion: %v", deleted[i].Name)
		recordNodeEvent(nc.recorder, deleted[i].Name, api.EventTypeNormal, "RemovingNode", fmt.Sprintf("Removing Node %v from NodeController", deleted[i].Name))
		nc.evictPods(deleted[i])
		delete(nc.knownNodeSet, deleted[i].Name)
	}

	zoneToNodeConditions := map[string][]*api.NodeCondition{}
	for i := range nodes.Items {
		var gracePeriod time.Duration
		var observedReadyCondition api.NodeCondition
		var currentReadyCondition *api.NodeCondition
		node := &nodes.Items[i]
		for rep := 0; rep < nodeStatusUpdateRetry; rep++ {
			gracePeriod, observedReadyCondition, currentReadyCondition, err = nc.tryUpdateNodeStatus(node)
			if err == nil {
				break
			}
			name := node.Name
			node, err = nc.kubeClient.Core().Nodes().Get(name)
			if err != nil {
				glog.Errorf("Failed while getting a Node to retry updating NodeStatus. Probably Node %s was deleted.", name)
				break
			}
		}
		if err != nil {
			glog.Errorf("Update status of Node %v from NodeController exceeds retry count. "+
				"Skipping - no pods will be evicted.", node.Name)
			continue
		}

		// We do not treat a master node as a part of the cluster for network segmentation checking.
		if !system.IsMasterNode(node) {
			zoneToNodeConditions[utilnode.GetZoneKey(node)] = append(zoneToNodeConditions[utilnode.GetZoneKey(node)], currentReadyCondition)
		}

		decisionTimestamp := nc.now()
		if currentReadyCondition != nil {
			// Check eviction timeout against decisionTimestamp.
			if observedReadyCondition.Status == api.ConditionFalse &&
				decisionTimestamp.After(nc.nodeStatusMap[node.Name].readyTransitionTimestamp.Add(nc.podEvictionTimeout)) {
				if nc.evictPods(node) {
					glog.V(4).Infof("Evicting pods on node %s: %v is later than %v + %v", node.Name, decisionTimestamp, nc.nodeStatusMap[node.Name].readyTransitionTimestamp, nc.podEvictionTimeout)
				}
			}
			if observedReadyCondition.Status == api.ConditionUnknown &&
				decisionTimestamp.After(nc.nodeStatusMap[node.Name].probeTimestamp.Add(nc.podEvictionTimeout)) {
				if nc.evictPods(node) {
					glog.V(4).Infof("Evicting pods on node %s: %v is later than %v + %v", node.Name, decisionTimestamp, nc.nodeStatusMap[node.Name].readyTransitionTimestamp, nc.podEvictionTimeout-gracePeriod)
				}
			}
			if observedReadyCondition.Status == api.ConditionTrue {
				if nc.cancelPodEviction(node) {
					glog.V(2).Infof("Node %s is ready again, cancelled pod eviction", node.Name)
				}
			}

			// Report node event.
			if currentReadyCondition.Status != api.ConditionTrue && observedReadyCondition.Status == api.ConditionTrue {
				recordNodeStatusChange(nc.recorder, node, "NodeNotReady")
				if err = markAllPodsNotReady(nc.kubeClient, node.Name); err != nil {
					utilruntime.HandleError(fmt.Errorf("Unable to mark all pods NotReady on node %v: %v", node.Name, err))
				}
			}

			// Check with the cloud provider to see if the node still exists. If it
			// doesn't, delete the node immediately.
			if currentReadyCondition.Status != api.ConditionTrue && nc.cloud != nil {
				exists, err := nc.nodeExistsInCloudProvider(node.Name)
				if err != nil {
					glog.Errorf("Error determining if node %v exists in cloud: %v", node.Name, err)
					continue
				}
				if !exists {
					glog.V(2).Infof("Deleting node (no longer present in cloud provider): %s", node.Name)
					recordNodeEvent(nc.recorder, node.Name, api.EventTypeNormal, "DeletingNode", fmt.Sprintf("Deleting Node %v because it's not present according to cloud provider", node.Name))
					go func(nodeName string) {
						defer utilruntime.HandleCrash()
						// Kubelet is not reporting and Cloud Provider says node
						// is gone. Delete it without worrying about grace
						// periods.
						if err := forcefullyDeleteNode(nc.kubeClient, nodeName, nc.forcefullyDeletePod); err != nil {
							glog.Errorf("Unable to forcefully delete node %q: %v", nodeName, err)
						}
					}(node.Name)
					continue
				}
			}
		}
	}

	for k, v := range zoneToNodeConditions {
		newState := nc.computeZoneStateFunc(v)
		if newState == nc.zoneStates[k] {
			continue
		}
		if newState == stateFullSegmentation {
			glog.V(2).Infof("NodeController is entering network segmentation mode in zone %v.", k)
		} else if newState == stateNormal {
			glog.V(2).Infof("NodeController exited network segmentation mode in zone %v.", k)
		}
		for i := range nodes.Items {
			if utilnode.GetZoneKey(&nodes.Items[i]) == k {
				if newState == stateFullSegmentation {
					// When a zone is fully segmented we stop evictions altogether.
					nc.cancelPodEviction(&nodes.Items[i])
				}
				if newState == stateNormal && nc.zoneStates[k] == stateFullSegmentation {
					// When exiting segmentation mode update probe timestamps on all Nodes.
					now := nc.now()
					v := nc.nodeStatusMap[nodes.Items[i].Name]
					v.probeTimestamp = now
					v.readyTransitionTimestamp = now
					nc.nodeStatusMap[nodes.Items[i].Name] = v
				}
			}
		}
		nc.zoneStates[k] = newState
	}

	return nil
}

// CalculateSpreadPriority spreads pods across hosts and zones, considering pods belonging to the same service or replication controller.
// When a pod is scheduled, it looks for services or RCs that match the pod, then finds existing pods that match those selectors.
// It favors nodes that have fewer existing matching pods.
// i.e. it pushes the scheduler towards a node where there's the smallest number of
// pods which match the same service selectors or RC selectors as the pod being scheduled.
// Where zone information is included on the nodes, it favors nodes in zones with fewer existing matching pods.
func (s *SelectorSpread) CalculateSpreadPriority(pod *api.Pod, nodeNameToInfo map[string]*schedulercache.NodeInfo, nodeLister algorithm.NodeLister) (schedulerapi.HostPriorityList, error) {
	selectors := make([]labels.Selector, 0)
	services, err := s.serviceLister.GetPodServices(pod)
	if err == nil {
		for _, service := range services {
			selectors = append(selectors, labels.SelectorFromSet(service.Spec.Selector))
		}
	}
	rcs, err := s.controllerLister.GetPodControllers(pod)
	if err == nil {
		for _, rc := range rcs {
			selectors = append(selectors, labels.SelectorFromSet(rc.Spec.Selector))
		}
	}
	rss, err := s.replicaSetLister.GetPodReplicaSets(pod)
	if err == nil {
		for _, rs := range rss {
			if selector, err := unversioned.LabelSelectorAsSelector(rs.Spec.Selector); err == nil {
				selectors = append(selectors, selector)
			}
		}
	}

	nodes, err := nodeLister.List()
	if err != nil {
		return nil, err
	}

	// Count similar pods by node.
	countsByNodeName := map[string]int{}
	countsByNodeNameLock := sync.Mutex{}

	if len(selectors) > 0 {
		// Create a number of go-routines that will be computing number
		// of "similar" pods for given nodes.
		workers := 16
		toProcess := make(chan string, len(nodes))
		for i := range nodes {
			toProcess <- nodes[i].Name
		}
		close(toProcess)

		// TODO: Use Parallelize.
		wg := sync.WaitGroup{}
		wg.Add(workers)
		for i := 0; i < workers; i++ {
			go func() {
				defer utilruntime.HandleCrash()
				defer wg.Done()
				for {
					nodeName, ok := <-toProcess
					if !ok {
						return
					}
					count := 0
					for _, nodePod := range nodeNameToInfo[nodeName].Pods() {
						if pod.Namespace != nodePod.Namespace {
							continue
						}
						// When we are replacing a failed pod, we often see the previous
						// deleted version while scheduling the replacement.
						// Ignore the previous deleted version for spreading purposes
						// (it can still be considered for resource restrictions etc.)
						if nodePod.DeletionTimestamp != nil {
							glog.V(4).Infof("skipping pending-deleted pod: %s/%s", nodePod.Namespace, nodePod.Name)
							continue
						}
						matches := false
						for _, selector := range selectors {
							if selector.Matches(labels.Set(nodePod.ObjectMeta.Labels)) {
								matches = true
								break
							}
						}
						if matches {
							count++
						}
					}

					func() {
						countsByNodeNameLock.Lock()
						defer countsByNodeNameLock.Unlock()
						countsByNodeName[nodeName] = count
					}()
				}
			}()
		}
		wg.Wait()
	}

	// Aggregate by-node information
	// Compute the maximum number of pods hosted on any node.
	maxCountByNodeName := 0
	for _, count := range countsByNodeName {
		if count > maxCountByNodeName {
			maxCountByNodeName = count
		}
	}

	// Count similar pods by zone, if zone information is present.
	countsByZone := map[string]int{}
	for _, node := range nodes {
		count, found := countsByNodeName[node.Name]
		if !found {
			continue
		}
		zoneId := utilnode.GetZoneKey(node)
		if zoneId == "" {
			continue
		}
		countsByZone[zoneId] += count
	}

	// Aggregate by-zone information
	// Compute the maximum number of pods hosted in any zone.
	haveZones := len(countsByZone) != 0
	maxCountByZone := 0
	for _, count := range countsByZone {
		if count > maxCountByZone {
			maxCountByZone = count
		}
	}

	result := make(schedulerapi.HostPriorityList, 0, len(nodes))
	// score int - scale of 0-maxPriority
	// 0 being the lowest priority and maxPriority being the highest
	for _, node := range nodes {
		// initializing to the default/max node score of maxPriority
		fScore := float32(maxPriority)
		if maxCountByNodeName > 0 {
			fScore = maxPriority * (float32(maxCountByNodeName-countsByNodeName[node.Name]) / float32(maxCountByNodeName))
		}

		// If there is zone information present, incorporate it.
		if haveZones {
			zoneId := utilnode.GetZoneKey(node)
			if zoneId != "" {
				zoneScore := maxPriority * (float32(maxCountByZone-countsByZone[zoneId]) / float32(maxCountByZone))
				fScore = (fScore * (1.0 - zoneWeighting)) + (zoneWeighting * zoneScore)
			}
		}
		result = append(result, schedulerapi.HostPriority{Host: node.Name, Score: int(fScore)})
		glog.V(10).Infof(
			"%v -> %v: SelectorSpreadPriority, Score: (%d)", pod.Name, node.Name, int(fScore),
		)
	}
	return result, nil
}