func (c *MaxPDVolumeCountChecker) predicate(pod *api.Pod, nodeName string, nodeInfo *schedulercache.NodeInfo) (bool, error) {
    newVolumes := make(map[string]bool)
    if err := c.filterVolumes(pod.Spec.Volumes, pod.Namespace, newVolumes); err != nil {
        return false, err
    }

    // quick return
    if len(newVolumes) == 0 {
        return true, nil
    }

    // count unique volumes
    existingVolumes := make(map[string]bool)
    for _, existingPod := range nodeInfo.Pods() {
        if err := c.filterVolumes(existingPod.Spec.Volumes, existingPod.Namespace, existingVolumes); err != nil {
            return false, err
        }
    }
    numExistingVolumes := len(existingVolumes)

    // filter out already-mounted volumes
    for k := range existingVolumes {
        if _, ok := newVolumes[k]; ok {
            delete(newVolumes, k)
        }
    }

    numNewVolumes := len(newVolumes)

    if numExistingVolumes+numNewVolumes > c.maxVolumes {
        return false, nil
    }

    return true, nil
}
// NoDiskConflict evaluates if a pod can fit based on the volumes it requests and those that
// are already mounted. If a volume is already mounted on the node, another pod that uses the
// same volume can't be scheduled there.
// This is GCE, Amazon EBS, and Ceph RBD specific for now:
// - GCE PD allows multiple mounts as long as they're all read-only
// - AWS EBS forbids any two pods mounting the same volume ID
// - Ceph RBD forbids two pods that share at least one monitor and use the same pool and image.
// TODO: migrate this into some per-volume specific code?
func NoDiskConflict(pod *api.Pod, meta interface{}, nodeInfo *schedulercache.NodeInfo) (bool, []algorithm.PredicateFailureReason, error) {
    for _, v := range pod.Spec.Volumes {
        for _, ev := range nodeInfo.Pods() {
            if isVolumeConflict(v, ev) {
                return false, []algorithm.PredicateFailureReason{ErrDiskConflict}, nil
            }
        }
    }
    return true, nil, nil
}
// NoDiskConflict evaluates if a pod can fit based on the volumes it requests and those that
// are already mounted. If a volume is already mounted on the node, another pod that uses the
// same volume can't be scheduled there.
// This is GCE, Amazon EBS, and Ceph RBD specific for now:
// - GCE PD allows multiple mounts as long as they're all read-only
// - AWS EBS forbids any two pods mounting the same volume ID
// - Ceph RBD forbids two pods that share at least one monitor and use the same pool and image.
// TODO: migrate this into some per-volume specific code?
func NoDiskConflict(pod *api.Pod, meta interface{}, nodeInfo *schedulercache.NodeInfo) (bool, error) {
    for _, v := range pod.Spec.Volumes {
        for _, ev := range nodeInfo.Pods() {
            if isVolumeConflict(v, ev) {
                return false, ErrDiskConflict
            }
        }
    }
    return true, nil
}
// NoDiskConflict evaluates if a pod can fit based on the volumes it requests and those that
// are already mounted. If a volume is already mounted on the node, another pod that uses the
// same volume can't be scheduled there.
// This is GCE, Amazon EBS, and Ceph RBD specific for now:
// - GCE PD allows multiple mounts as long as they're all read-only
// - AWS EBS forbids any two pods mounting the same volume ID
// - Ceph RBD forbids two pods that share at least one monitor and use the same pool and image.
// TODO: migrate this into some per-volume specific code?
func NoDiskConflict(pod *api.Pod, nodeName string, nodeInfo *schedulercache.NodeInfo) (bool, error) {
    for _, v := range pod.Spec.Volumes {
        for _, ev := range nodeInfo.Pods() {
            if isVolumeConflict(v, ev) {
                return false, nil
            }
        }
    }
    return true, nil
}
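The NoDiskConflict variants above delegate the per-volume comparison to isVolumeConflict, which is not reproduced in this section. Below is a minimal sketch of such a helper, written only to illustrate the three rules from the doc comment; it is not the upstream implementation, and the volume source fields it touches (PDName, VolumeID, CephMonitors, RBDPool, RBDImage) are assumed from the api types used throughout these snippets.

// isVolumeConflict reports whether the given volume conflicts with any volume of an
// already-scheduled pod, following the GCE PD / AWS EBS / Ceph RBD rules described above.
func isVolumeConflict(volume api.Volume, pod *api.Pod) bool {
    for _, existingVolume := range pod.Spec.Volumes {
        // GCE PD: the same disk may be mounted by several pods only if every mount is read-only.
        if volume.GCEPersistentDisk != nil && existingVolume.GCEPersistentDisk != nil {
            disk, existingDisk := volume.GCEPersistentDisk, existingVolume.GCEPersistentDisk
            if disk.PDName == existingDisk.PDName && !(disk.ReadOnly && existingDisk.ReadOnly) {
                return true
            }
        }
        // AWS EBS: no two pods may mount the same volume ID.
        if volume.AWSElasticBlockStore != nil && existingVolume.AWSElasticBlockStore != nil {
            if volume.AWSElasticBlockStore.VolumeID == existingVolume.AWSElasticBlockStore.VolumeID {
                return true
            }
        }
        // Ceph RBD: conflict when pool and image match and the two pods share at least one monitor.
        if volume.RBD != nil && existingVolume.RBD != nil {
            rbd, existingRBD := volume.RBD, existingVolume.RBD
            if rbd.RBDPool == existingRBD.RBDPool && rbd.RBDImage == existingRBD.RBDImage {
                for _, mon := range rbd.CephMonitors {
                    for _, existingMon := range existingRBD.CephMonitors {
                        if mon == existingMon {
                            return true
                        }
                    }
                }
            }
        }
    }
    return false
}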
// FastGetPodsToMove returns a list of pods that should be moved elsewhere if the node
// is drained. Raises error if there is an unreplicated pod and the force option was not specified.
// Based on kubectl drain code. It makes an assumption that RC, DS, Jobs and RS were deleted
// along with their pods (no abandoned pods with a dangling created-by annotation). Useful for fast
// checks. Doesn't check whether the controllers that created these pods still exist.
func FastGetPodsToMove(nodeInfo *schedulercache.NodeInfo, skipNodesWithSystemPods bool,
    skipNodesWithLocalStorage bool) ([]*api.Pod, error) {
    return drain.GetPodsForDeletionOnNodeDrain(
        nodeInfo.Pods(),
        api.Codecs.UniversalDecoder(),
        skipNodesWithSystemPods,
        skipNodesWithLocalStorage,
        false,
        nil,
        0)
}
// FastGetPodsToMove returns a list of pods that should be moved elsewhere if the node
// is drained. Raises error if there is an unreplicated pod and the force option was not specified.
// Based on kubectl drain code. It makes an assumption that RC, DS, Jobs and RS were deleted
// along with their pods (no abandoned pods with a dangling created-by annotation). Useful for fast
// checks.
func FastGetPodsToMove(nodeInfo *schedulercache.NodeInfo, force bool, skipNodesWithSystemPods bool,
    skipNodesWithLocalStorage bool, decoder runtime.Decoder) ([]*api.Pod, error) {
    pods := make([]*api.Pod, 0)
    unreplicatedPodNames := []string{}

    for _, pod := range nodeInfo.Pods() {
        _, found := pod.ObjectMeta.Annotations[types.ConfigMirrorAnnotationKey]
        if found {
            // Skip mirror pod
            continue
        }
        replicated := false
        daemonsetPod := false

        creatorRef, found := pod.ObjectMeta.Annotations[controller.CreatedByAnnotation]
        if found {
            var sr api.SerializedReference
            if err := runtime.DecodeInto(decoder, []byte(creatorRef), &sr); err != nil {
                return []*api.Pod{}, err
            }
            if sr.Reference.Kind == "ReplicationController" {
                replicated = true
            } else if sr.Reference.Kind == "DaemonSet" {
                daemonsetPod = true
            } else if sr.Reference.Kind == "Job" {
                replicated = true
            } else if sr.Reference.Kind == "ReplicaSet" {
                replicated = true
            }
        }
        if !daemonsetPod && pod.Namespace == "kube-system" && skipNodesWithSystemPods {
            return []*api.Pod{}, fmt.Errorf("non-daemon-set, non-mirrored, kube-system pod present: %s", pod.Name)
        }
        if !daemonsetPod && hasLocalStorage(pod) && skipNodesWithLocalStorage {
            return []*api.Pod{}, fmt.Errorf("pod with local storage present: %s", pod.Name)
        }
        switch {
        case daemonsetPod:
            break
        case !replicated:
            unreplicatedPodNames = append(unreplicatedPodNames, pod.Name)
            if force {
                pods = append(pods, pod)
            }
        default:
            pods = append(pods, pod)
        }
    }
    if !force && len(unreplicatedPodNames) > 0 {
        return []*api.Pod{}, fmt.Errorf("unreplicated pods present")
    }
    return pods, nil
}
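Both inlined FastGetPodsToMove implementations call hasLocalStorage, which is not included in this section. The following is a minimal sketch under the assumption that "local storage" means EmptyDir and HostPath volumes; the upstream helper may define it differently.

// hasLocalStorage is a sketch of the helper referenced above: it reports whether the pod
// mounts any node-local storage. Treating EmptyDir and HostPath as local is an assumption,
// not necessarily the exact upstream rule.
func hasLocalStorage(pod *api.Pod) bool {
    for _, volume := range pod.Spec.Volumes {
        if volume.EmptyDir != nil || volume.HostPath != nil {
            return true
        }
    }
    return false
}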
// DetailedGetPodsForMove returns a list of pods that should be moved elsewhere if the node
// is drained. Raises error if there is an unreplicated pod and the force option was not specified.
// Based on kubectl drain code. It checks whether the RC, DS, Jobs and RS that created these pods
// still exist.
func DetailedGetPodsForMove(nodeInfo *schedulercache.NodeInfo, skipNodesWithSystemPods bool,
    skipNodesWithLocalStorage bool, client *unversionedclient.Client, minReplicaCount int32) ([]*api.Pod, error) {
    return drain.GetPodsForDeletionOnNodeDrain(
        nodeInfo.Pods(),
        api.Codecs.UniversalDecoder(),
        skipNodesWithSystemPods,
        skipNodesWithLocalStorage,
        true,
        client,
        minReplicaCount)
}
// FastGetPodsToMove returns a list of pods that should be moved elsewhere if the node
// is drained. Raises error if there is an unreplicated pod and the force option was not specified.
// Based on kubectl drain code. It makes an assumption that RC, DS, Jobs and RS were deleted
// along with their pods (no abandoned pods with a dangling created-by annotation). Useful for fast
// checks.
func FastGetPodsToMove(nodeInfo *schedulercache.NodeInfo, force bool, skipNodesWithSystemPods bool,
    skipNodesWithLocalStorage bool) ([]*api.Pod, error) {
    pods := make([]*api.Pod, 0)
    unreplicatedPodNames := []string{}

    for _, pod := range nodeInfo.Pods() {
        if IsMirrorPod(pod) {
            continue
        }
        replicated := false
        daemonsetPod := false

        creatorKind, err := CreatorRefKind(pod)
        if err != nil {
            return []*api.Pod{}, err
        }
        if creatorKind == "ReplicationController" {
            replicated = true
        } else if creatorKind == "DaemonSet" {
            daemonsetPod = true
        } else if creatorKind == "Job" {
            replicated = true
        } else if creatorKind == "ReplicaSet" {
            replicated = true
        }
        if !daemonsetPod && pod.Namespace == "kube-system" && skipNodesWithSystemPods {
            return []*api.Pod{}, fmt.Errorf("non-daemon-set, non-mirrored, kube-system pod present: %s", pod.Name)
        }
        if !daemonsetPod && hasLocalStorage(pod) && skipNodesWithLocalStorage {
            return []*api.Pod{}, fmt.Errorf("pod with local storage present: %s", pod.Name)
        }
        switch {
        case daemonsetPod:
            break
        case !replicated:
            unreplicatedPodNames = append(unreplicatedPodNames, pod.Name)
            if force {
                pods = append(pods, pod)
            }
        default:
            pods = append(pods, pod)
        }
    }
    if !force && len(unreplicatedPodNames) > 0 {
        return []*api.Pod{}, fmt.Errorf("unreplicated pods present")
    }
    return pods, nil
}
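This variant factors the annotation handling of the previous implementation into IsMirrorPod and CreatorRefKind helpers that are not shown here. The sketches below are reconstructed from that inline logic (mirror-pod annotation lookup, decoding the created-by annotation into an api.SerializedReference); using api.Codecs.UniversalDecoder() inside CreatorRefKind mirrors the wrapper variants above and is an assumption about how the real helper obtains its decoder.

// IsMirrorPod reports whether the pod carries the kubelet's mirror-pod annotation,
// matching the inline check in the earlier FastGetPodsToMove variant.
func IsMirrorPod(pod *api.Pod) bool {
    _, found := pod.ObjectMeta.Annotations[types.ConfigMirrorAnnotationKey]
    return found
}

// CreatorRefKind returns the Kind of the controller recorded in the pod's created-by
// annotation, or "" if the annotation is absent (the caller then treats the pod as unreplicated).
func CreatorRefKind(pod *api.Pod) (string, error) {
    creatorRef, found := pod.ObjectMeta.Annotations[controller.CreatedByAnnotation]
    if !found {
        return "", nil
    }
    var sr api.SerializedReference
    if err := runtime.DecodeInto(api.Codecs.UniversalDecoder(), []byte(creatorRef), &sr); err != nil {
        return "", err
    }
    return sr.Reference.Kind, nil
}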
func (r *NodeStatus) PodFitsResources(pod *api.Pod, nodeName string, nodeInfo *schedulercache.NodeInfo) (bool, error) {
    info, err := r.info.GetNodeInfo(nodeName)
    if err != nil {
        return false, err
    }

    // TODO: move the following podNumber check to podFitsResourcesInternal when Kubelet allows podNumber check (See #20263).
    allocatable := info.Status.Allocatable
    allowedPodNumber := allocatable.Pods().Value()
    if int64(len(nodeInfo.Pods()))+1 > allowedPodNumber {
        return false,
            newInsufficientResourceError(podCountResourceName, 1, int64(len(nodeInfo.Pods())), allowedPodNumber)
    }

    return podFitsResourcesInternal(pod, nodeName, nodeInfo, info)
}
func PodFitsHostPorts(pod *api.Pod, nodeName string, nodeInfo *schedulercache.NodeInfo) (bool, error) {
    wantPorts := getUsedPorts(pod)
    if len(wantPorts) == 0 {
        return true, nil
    }
    existingPorts := getUsedPorts(nodeInfo.Pods()...)
    for wport := range wantPorts {
        if wport == 0 {
            continue
        }
        if existingPorts[wport] {
            return false, ErrPodNotFitsHostPorts
        }
    }
    return true, nil
}
func PodFitsResources(pod *api.Pod, meta interface{}, nodeInfo *schedulercache.NodeInfo) (bool, []algorithm.PredicateFailureReason, error) {
    node := nodeInfo.Node()
    if node == nil {
        return false, nil, fmt.Errorf("node not found")
    }

    var predicateFails []algorithm.PredicateFailureReason
    allowedPodNumber := nodeInfo.AllowedPodNumber()
    if len(nodeInfo.Pods())+1 > allowedPodNumber {
        predicateFails = append(predicateFails, NewInsufficientResourceError(api.ResourcePods, 1, int64(len(nodeInfo.Pods())), int64(allowedPodNumber)))
    }

    var podRequest *schedulercache.Resource
    if predicateMeta, ok := meta.(*predicateMetadata); ok {
        podRequest = predicateMeta.podRequest
    } else {
        // We couldn't parse metadata - fallback to computing it.
        podRequest = GetResourceRequest(pod)
    }
    if podRequest.MilliCPU == 0 && podRequest.Memory == 0 && podRequest.NvidiaGPU == 0 && len(podRequest.OpaqueIntResources) == 0 {
        return len(predicateFails) == 0, predicateFails, nil
    }

    allocatable := nodeInfo.AllocatableResource()
    if allocatable.MilliCPU < podRequest.MilliCPU+nodeInfo.RequestedResource().MilliCPU {
        predicateFails = append(predicateFails, NewInsufficientResourceError(api.ResourceCPU, podRequest.MilliCPU, nodeInfo.RequestedResource().MilliCPU, allocatable.MilliCPU))
    }
    if allocatable.Memory < podRequest.Memory+nodeInfo.RequestedResource().Memory {
        predicateFails = append(predicateFails, NewInsufficientResourceError(api.ResourceMemory, podRequest.Memory, nodeInfo.RequestedResource().Memory, allocatable.Memory))
    }
    if allocatable.NvidiaGPU < podRequest.NvidiaGPU+nodeInfo.RequestedResource().NvidiaGPU {
        predicateFails = append(predicateFails, NewInsufficientResourceError(api.ResourceNvidiaGPU, podRequest.NvidiaGPU, nodeInfo.RequestedResource().NvidiaGPU, allocatable.NvidiaGPU))
    }
    for rName, rQuant := range podRequest.OpaqueIntResources {
        if allocatable.OpaqueIntResources[rName] < rQuant+nodeInfo.RequestedResource().OpaqueIntResources[rName] {
            predicateFails = append(predicateFails, NewInsufficientResourceError(rName, podRequest.OpaqueIntResources[rName], nodeInfo.RequestedResource().OpaqueIntResources[rName], allocatable.OpaqueIntResources[rName]))
        }
    }

    if glog.V(10) {
        // We explicitly don't do glog.V(10).Infof() to avoid computing all the parameters if this is
        // not logged. There is visible performance gain from it.
        glog.Infof("Schedule Pod %+v on Node %+v is allowed, Node is running only %v out of %v Pods.",
            podName(pod), node.Name, len(nodeInfo.Pods()), allowedPodNumber)
    }
    return len(predicateFails) == 0, predicateFails, nil
}
func calculateReservationOfResource(node *kube_api.Node, nodeInfo *schedulercache.NodeInfo, resourceName kube_api.ResourceName) (float64, error) {
    nodeCapacity, found := node.Status.Capacity[resourceName]
    if !found {
        return 0, fmt.Errorf("Failed to get %v from %s", resourceName, node.Name)
    }
    if nodeCapacity.MilliValue() == 0 {
        return 0, fmt.Errorf("%v is 0 at %s", resourceName, node.Name)
    }
    podsRequest := resource.MustParse("0")
    for _, pod := range nodeInfo.Pods() {
        for _, container := range pod.Spec.Containers {
            if resourceValue, found := container.Resources.Requests[resourceName]; found {
                podsRequest.Add(resourceValue)
            }
        }
    }
    return float64(podsRequest.MilliValue()) / float64(nodeCapacity.MilliValue()), nil
}
func (c *MaxPDVolumeCountChecker) predicate(pod *api.Pod, meta interface{}, nodeInfo *schedulercache.NodeInfo) (bool, error) {
    // If a pod doesn't have any volume attached to it, the predicate will always be true.
    // Thus we make a fast path for it, to avoid unnecessary computations in this case.
    if len(pod.Spec.Volumes) == 0 {
        return true, nil
    }

    newVolumes := make(map[string]bool)
    if err := c.filterVolumes(pod.Spec.Volumes, pod.Namespace, newVolumes); err != nil {
        return false, err
    }

    // quick return
    if len(newVolumes) == 0 {
        return true, nil
    }

    // count unique volumes
    existingVolumes := make(map[string]bool)
    for _, existingPod := range nodeInfo.Pods() {
        if err := c.filterVolumes(existingPod.Spec.Volumes, existingPod.Namespace, existingVolumes); err != nil {
            return false, err
        }
    }
    numExistingVolumes := len(existingVolumes)

    // filter out already-mounted volumes
    for k := range existingVolumes {
        if _, ok := newVolumes[k]; ok {
            delete(newVolumes, k)
        }
    }

    numNewVolumes := len(newVolumes)

    if numExistingVolumes+numNewVolumes > c.maxVolumes {
        // violates MaxEBSVolumeCount or MaxGCEPDVolumeCount
        return false, ErrMaxVolumeCountExceeded
    }

    return true, nil
}
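Both MaxPDVolumeCountChecker variants depend on a filterVolumes method to turn a volume slice into a set of unique keys, and on a maxVolumes limit; neither the checker struct nor the filter is included in this section. The sketch below is a deliberately simplified stand-in that only handles inline AWS EBS volumes keyed by volume ID; the real checker is presumably configurable per volume type (hence the MaxEBSVolumeCount / MaxGCEPDVolumeCount comment above) and may resolve claims as well, which this sketch ignores.

// filterVolumes records, into filteredVolumes, one unique key for every volume the checker
// counts against its limit. Simplified sketch: only inline AWS EBS volumes are handled and
// the EBS volume ID is used as the key; the namespace parameter is kept only to match the
// call sites in the predicate above.
func (c *MaxPDVolumeCountChecker) filterVolumes(volumes []api.Volume, namespace string, filteredVolumes map[string]bool) error {
    for _, vol := range volumes {
        if ebs := vol.AWSElasticBlockStore; ebs != nil {
            filteredVolumes[ebs.VolumeID] = true
        }
    }
    return nil
}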
func PodFitsHostPorts(pod *api.Pod, meta interface{}, nodeInfo *schedulercache.NodeInfo) (bool, error) {
    var wantPorts map[int]bool
    if predicateMeta, ok := meta.(*predicateMetadata); ok {
        wantPorts = predicateMeta.podPorts
    } else {
        // We couldn't parse metadata - fallback to computing it.
        wantPorts = getUsedPorts(pod)
    }
    if len(wantPorts) == 0 {
        return true, nil
    }

    // TODO: Aggregate it at the NodeInfo level.
    existingPorts := getUsedPorts(nodeInfo.Pods()...)
    for wport := range wantPorts {
        if wport != 0 && existingPorts[wport] {
            return false, ErrPodNotFitsHostPorts
        }
    }
    return true, nil
}
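Both PodFitsHostPorts variants rely on getUsedPorts, which is not reproduced here. A minimal sketch consistent with its call sites (a single pod, or the variadic nodeInfo.Pods()... form): it gathers the HostPort of every container port into a set, leaving the special value 0 for the predicates above to skip.

// getUsedPorts collects the host ports requested by the containers of the given pods.
// Sketch only; the field names follow the api.ContainerPort type.
func getUsedPorts(pods ...*api.Pod) map[int]bool {
    ports := make(map[int]bool)
    for _, pod := range pods {
        for _, container := range pod.Spec.Containers {
            for _, podPort := range container.Ports {
                // HostPort 0 means no host port was requested; the callers above ignore it.
                ports[int(podPort.HostPort)] = true
            }
        }
    }
    return ports
}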
func podFitsResourcesInternal(pod *api.Pod, nodeName string, nodeInfo *schedulercache.NodeInfo, info *api.Node) (bool, error) {
    allocatable := info.Status.Allocatable
    allowedPodNumber := allocatable.Pods().Value()
    if int64(len(nodeInfo.Pods()))+1 > allowedPodNumber {
        return false,
            newInsufficientResourceError(podCountResourceName, 1, int64(len(nodeInfo.Pods())), allowedPodNumber)
    }
    podRequest := getResourceRequest(pod)
    if podRequest.milliCPU == 0 && podRequest.memory == 0 {
        return true, nil
    }

    totalMilliCPU := allocatable.Cpu().MilliValue()
    totalMemory := allocatable.Memory().Value()

    if totalMilliCPU < podRequest.milliCPU+nodeInfo.RequestedResource().MilliCPU {
        return false,
            newInsufficientResourceError(cpuResourceName, podRequest.milliCPU, nodeInfo.RequestedResource().MilliCPU, totalMilliCPU)
    }
    if totalMemory < podRequest.memory+nodeInfo.RequestedResource().Memory {
        return false,
            newInsufficientResourceError(memoryResoureceName, podRequest.memory, nodeInfo.RequestedResource().Memory, totalMemory)
    }
    glog.V(10).Infof("Schedule Pod %+v on Node %+v is allowed, Node is running only %v out of %v Pods.",
        podName(pod), nodeName, len(nodeInfo.Pods()), allowedPodNumber)
    return true, nil
}
func PodFitsResources(pod *api.Pod, meta interface{}, nodeInfo *schedulercache.NodeInfo) (bool, error) {
    node := nodeInfo.Node()
    if node == nil {
        return false, fmt.Errorf("node not found")
    }
    allowedPodNumber := nodeInfo.AllowedPodNumber()
    if len(nodeInfo.Pods())+1 > allowedPodNumber {
        return false,
            newInsufficientResourceError(podCountResourceName, 1, int64(len(nodeInfo.Pods())), int64(allowedPodNumber))
    }

    var podRequest *resourceRequest
    predicateMeta, ok := meta.(*predicateMetadata)
    if ok {
        podRequest = predicateMeta.podRequest
    } else {
        // We couldn't parse metadata - fallback to computing it.
        podRequest = getResourceRequest(pod)
    }
    if podRequest.milliCPU == 0 && podRequest.memory == 0 && podRequest.nvidiaGPU == 0 {
        return true, nil
    }

    allocatable := node.Status.Allocatable
    totalMilliCPU := allocatable.Cpu().MilliValue()
    totalMemory := allocatable.Memory().Value()
    totalNvidiaGPU := allocatable.NvidiaGPU().Value()
    if totalMilliCPU < podRequest.milliCPU+nodeInfo.RequestedResource().MilliCPU {
        return false,
            newInsufficientResourceError(cpuResourceName, podRequest.milliCPU, nodeInfo.RequestedResource().MilliCPU, totalMilliCPU)
    }
    if totalMemory < podRequest.memory+nodeInfo.RequestedResource().Memory {
        return false,
            newInsufficientResourceError(memoryResourceName, podRequest.memory, nodeInfo.RequestedResource().Memory, totalMemory)
    }
    if totalNvidiaGPU < podRequest.nvidiaGPU+nodeInfo.RequestedResource().NvidiaGPU {
        return false,
            newInsufficientResourceError(nvidiaGpuResourceName, podRequest.nvidiaGPU, nodeInfo.RequestedResource().NvidiaGPU, totalNvidiaGPU)
    }
    if glog.V(10) {
        // We explicitly don't do glog.V(10).Infof() to avoid computing all the parameters if this is
        // not logged. There is visible performance gain from it.
        glog.Infof("Schedule Pod %+v on Node %+v is allowed, Node is running only %v out of %v Pods.",
            podName(pod), node.Name, len(nodeInfo.Pods()), allowedPodNumber)
    }
    return true, nil
}
func PodFitsResources(pod *api.Pod, nodeInfo *schedulercache.NodeInfo) (bool, error) {
    node := nodeInfo.Node()
    if node == nil {
        return false, fmt.Errorf("node not found")
    }
    allocatable := node.Status.Allocatable
    allowedPodNumber := allocatable.Pods().Value()
    if int64(len(nodeInfo.Pods()))+1 > allowedPodNumber {
        return false,
            newInsufficientResourceError(podCountResourceName, 1, int64(len(nodeInfo.Pods())), allowedPodNumber)
    }
    podRequest := getResourceRequest(pod)
    if podRequest.milliCPU == 0 && podRequest.memory == 0 && podRequest.nvidiaGPU == 0 {
        return true, nil
    }

    totalMilliCPU := allocatable.Cpu().MilliValue()
    totalMemory := allocatable.Memory().Value()
    totalNvidiaGPU := allocatable.NvidiaGPU().Value()
    if totalMilliCPU < podRequest.milliCPU+nodeInfo.RequestedResource().MilliCPU {
        return false,
            newInsufficientResourceError(cpuResourceName, podRequest.milliCPU, nodeInfo.RequestedResource().MilliCPU, totalMilliCPU)
    }
    if totalMemory < podRequest.memory+nodeInfo.RequestedResource().Memory {
        return false,
            newInsufficientResourceError(memoryResourceName, podRequest.memory, nodeInfo.RequestedResource().Memory, totalMemory)
    }
    if totalNvidiaGPU < podRequest.nvidiaGPU+nodeInfo.RequestedResource().NvidiaGPU {
        return false,
            newInsufficientResourceError(nvidiaGpuResourceName, podRequest.nvidiaGPU, nodeInfo.RequestedResource().NvidiaGPU, totalNvidiaGPU)
    }
    glog.V(10).Infof("Schedule Pod %+v on Node %+v is allowed, Node is running only %v out of %v Pods.",
        podName(pod), node.Name, len(nodeInfo.Pods()), allowedPodNumber)
    return true, nil
}
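The older PodFitsResources variants call getResourceRequest to total the pod's container requests into a resourceRequest value with milliCPU, memory and nvidiaGPU fields (the field names are taken from the call sites above; the struct definition itself is not part of this section). A minimal sketch, summing Requests across containers much as calculateReservationOfResource does:

// getResourceRequest sums the resource requests of all containers in the pod.
// Sketch only; the resourceRequest struct is assumed from how the predicates above use it.
func getResourceRequest(pod *api.Pod) *resourceRequest {
    result := &resourceRequest{}
    for _, container := range pod.Spec.Containers {
        requests := container.Resources.Requests
        result.milliCPU += requests.Cpu().MilliValue()
        result.memory += requests.Memory().Value()
        result.nvidiaGPU += requests.NvidiaGPU().Value()
    }
    return result
}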
func hasNoPodsPredicate(pod *api.Pod, meta interface{}, nodeInfo *schedulercache.NodeInfo) (bool, []algorithm.PredicateFailureReason, error) {
    if len(nodeInfo.Pods()) == 0 {
        return true, nil, nil
    }
    return false, []algorithm.PredicateFailureReason{algorithmpredicates.ErrFakePredicate}, nil
}
func hasNoPodsPredicate(pod *api.Pod, nodeInfo *schedulercache.NodeInfo) (bool, error) {
    if len(nodeInfo.Pods()) == 0 {
        return true, nil
    }
    return false, algorithmpredicates.ErrFakePredicate
}
func hasNoPodsPredicate(pod *api.Pod, nodeName string, nodeInfo *schedulercache.NodeInfo) (bool, error) {
    return len(nodeInfo.Pods()) == 0, nil
}