Example #1
0
// evictPods queues an eviction for the provided node name, and returns false if the node is already
// queued for eviction.
func (nc *NodeController) evictPods(node *api.Node) bool {
	if nc.zoneStates[utilnode.GetZoneKey(node)] == stateFullSegmentation {
		return false
	}
	nc.evictorLock.Lock()
	defer nc.evictorLock.Unlock()
	return nc.podEvictor.Add(node.Name)
}
// Returns list of zones for all Nodes stored in FakeNodeHandler
func getZones(nodeHandler *FakeNodeHandler) []string {
	nodes, _ := nodeHandler.List(api.ListOptions{})
	zones := sets.NewString()
	for _, node := range nodes.Items {
		zones.Insert(utilnode.GetZoneKey(&node))
	}
	return zones.List()
}
Example #3
0
// cancelPodEviction removes any queued evictions, typically because the node is available again. It
// returns true if an eviction was queued.
func (nc *NodeController) cancelPodEviction(node *v1.Node) bool {
	zone := utilnode.GetZoneKey(node)
	nc.evictorLock.Lock()
	defer nc.evictorLock.Unlock()
	wasDeleting := nc.zonePodEvictor[zone].Remove(node.Name)
	if wasDeleting {
		glog.V(2).Infof("Cancelling pod Eviction on Node: %v", node.Name)
		return true
	}
	return false
}
Example #4
0
// Run starts an asynchronous loop that monitors the status of cluster nodes.
func (nc *NodeController) Run() {
	go func() {
		defer utilruntime.HandleCrash()

		if !cache.WaitForCacheSync(wait.NeverStop, nc.nodeInformer.Informer().HasSynced, nc.podInformer.Informer().HasSynced, nc.daemonSetInformer.Informer().HasSynced) {
			utilruntime.HandleError(errors.New("NodeController timed out while waiting for informers to sync..."))
			return
		}

		// Incorporate the results of node status pushed from kubelet to master.
		go wait.Until(func() {
			if err := nc.monitorNodeStatus(); err != nil {
				glog.Errorf("Error monitoring node status: %v", err)
			}
		}, nc.nodeMonitorPeriod, wait.NeverStop)

		// Managing eviction of nodes:
		// When we delete pods off a node, if the node was not empty at the time we then
		// queue an eviction watcher. If we hit an error, retry deletion.
		go wait.Until(func() {
			nc.evictorLock.Lock()
			defer nc.evictorLock.Unlock()
			for k := range nc.zonePodEvictor {
				nc.zonePodEvictor[k].Try(func(value TimedValue) (bool, time.Duration) {
					obj, exists, err := nc.nodeStore.GetByKey(value.Value)
					if err != nil {
						glog.Warningf("Failed to get Node %v from the nodeStore: %v", value.Value, err)
					} else if !exists {
						glog.Warningf("Node %v no longer present in nodeStore!", value.Value)
					} else {
						node, _ := obj.(*v1.Node)
						zone := utilnode.GetZoneKey(node)
						EvictionsNumber.WithLabelValues(zone).Inc()
					}

					nodeUid, _ := value.UID.(string)
					remaining, err := deletePods(nc.kubeClient, nc.recorder, value.Value, nodeUid, nc.daemonSetStore)
					if err != nil {
						utilruntime.HandleError(fmt.Errorf("unable to evict node %q: %v", value.Value, err))
						return false, 0
					}

					if remaining {
						glog.Infof("Pods awaiting deletion due to NodeController eviction")
					}
					return true, 0
				})
			}
		}, nodeEvictionPeriod, wait.NeverStop)
	}()
}
Example #5
0
// cancelPodEviction removes any queued evictions, typically because the node is available again. It
// returns true if an eviction was queued.
func (nc *NodeController) cancelPodEviction(node *api.Node) bool {
	zone := utilnode.GetZoneKey(node)
	nc.evictorLock.Lock()
	defer nc.evictorLock.Unlock()
	wasDeleting := nc.zonePodEvictor[zone].Remove(node.Name)
	wasTerminating := nc.zoneTerminationEvictor[zone].Remove(node.Name)
	if wasDeleting || wasTerminating {
		glog.V(2).Infof("Cancelling pod Eviction on Node: %v", node.Name)
		nc.evictions10Minutes.removeEviction(zone, node.Name)
		nc.evictions1Hour.removeEviction(zone, node.Name)
		return true
	}
	return false
}
Example #6
0
// evictPods queues an eviction for the provided node name, and returns false if the node is already
// queued for eviction.
func (nc *NodeController) evictPods(node *api.Node) bool {
	nc.evictorLock.Lock()
	defer nc.evictorLock.Unlock()
	foundHealty := false
	for _, state := range nc.zoneStates {
		if state != stateFullSegmentation {
			foundHealty = true
			break
		}
	}
	if !foundHealty {
		return false
	}
	zone := utilnode.GetZoneKey(node)
	return nc.zonePodEvictor[zone].Add(node.Name)
}
Example #7
0
// evictPods queues an eviction for the provided node name, and returns false if the node is already
// queued for eviction.
func (nc *NodeController) evictPods(node *v1.Node) bool {
	nc.evictorLock.Lock()
	defer nc.evictorLock.Unlock()
	return nc.zonePodEvictor[utilnode.GetZoneKey(node)].Add(node.Name, string(node.UID))
}
Example #8
0
// monitorNodeStatus verifies node status are constantly updated by kubelet, and if not,
// post "NodeReady==ConditionUnknown". It also evicts all pods if node is not ready or
// not reachable for a long period of time.
func (nc *NodeController) monitorNodeStatus() error {
	// We are listing nodes from local cache as we can tolerate some small delays
	// comparing to state from etcd and there is eventual consistency anyway.
	nodes, err := nc.nodeStore.List()
	if err != nil {
		return err
	}
	added, deleted := nc.checkForNodeAddedDeleted(&nodes)
	for i := range added {
		glog.V(1).Infof("NodeController observed a new Node: %#v", added[i].Name)
		recordNodeEvent(nc.recorder, added[i].Name, string(added[i].UID), v1.EventTypeNormal, "RegisteredNode", fmt.Sprintf("Registered Node %v in NodeController", added[i].Name))
		nc.knownNodeSet[added[i].Name] = added[i]
		// When adding new Nodes we need to check if new zone appeared, and if so add new evictor.
		zone := utilnode.GetZoneKey(added[i])
		if _, found := nc.zonePodEvictor[zone]; !found {
			nc.zonePodEvictor[zone] =
				NewRateLimitedTimedQueue(
					flowcontrol.NewTokenBucketRateLimiter(nc.evictionLimiterQPS, evictionRateLimiterBurst))
			// Init the metric for the new zone.
			glog.Infof("Initializing eviction metric for zone: %v", zone)
			EvictionsNumber.WithLabelValues(zone).Add(0)
		}
		nc.cancelPodEviction(added[i])
	}

	for i := range deleted {
		glog.V(1).Infof("NodeController observed a Node deletion: %v", deleted[i].Name)
		recordNodeEvent(nc.recorder, deleted[i].Name, string(deleted[i].UID), v1.EventTypeNormal, "RemovingNode", fmt.Sprintf("Removing Node %v from NodeController", deleted[i].Name))
		delete(nc.knownNodeSet, deleted[i].Name)
	}

	zoneToNodeConditions := map[string][]*v1.NodeCondition{}
	for i := range nodes.Items {
		var gracePeriod time.Duration
		var observedReadyCondition v1.NodeCondition
		var currentReadyCondition *v1.NodeCondition
		nodeCopy, err := api.Scheme.DeepCopy(&nodes.Items[i])
		if err != nil {
			utilruntime.HandleError(err)
			continue
		}
		node := nodeCopy.(*v1.Node)
		for rep := 0; rep < nodeStatusUpdateRetry; rep++ {
			gracePeriod, observedReadyCondition, currentReadyCondition, err = nc.tryUpdateNodeStatus(node)
			if err == nil {
				break
			}
			name := node.Name
			node, err = nc.kubeClient.Core().Nodes().Get(name, metav1.GetOptions{})
			if err != nil {
				glog.Errorf("Failed while getting a Node to retry updating NodeStatus. Probably Node %s was deleted.", name)
				break
			}
			time.Sleep(retrySleepTime)
		}
		if err != nil {
			glog.Errorf("Update status  of Node %v from NodeController exceeds retry count."+
				"Skipping - no pods will be evicted.", node.Name)
			continue
		}
		// We do not treat a master node as a part of the cluster for network disruption checking.
		if !system.IsMasterNode(node.Name) {
			zoneToNodeConditions[utilnode.GetZoneKey(node)] = append(zoneToNodeConditions[utilnode.GetZoneKey(node)], currentReadyCondition)
		}

		decisionTimestamp := nc.now()
		if currentReadyCondition != nil {
			// Check eviction timeout against decisionTimestamp
			if observedReadyCondition.Status == v1.ConditionFalse &&
				decisionTimestamp.After(nc.nodeStatusMap[node.Name].readyTransitionTimestamp.Add(nc.podEvictionTimeout)) {
				if nc.evictPods(node) {
					glog.V(2).Infof("Evicting pods on node %s: %v is later than %v + %v", node.Name, decisionTimestamp, nc.nodeStatusMap[node.Name].readyTransitionTimestamp, nc.podEvictionTimeout)
				}
			}
			if observedReadyCondition.Status == v1.ConditionUnknown &&
				decisionTimestamp.After(nc.nodeStatusMap[node.Name].probeTimestamp.Add(nc.podEvictionTimeout)) {
				if nc.evictPods(node) {
					glog.V(2).Infof("Evicting pods on node %s: %v is later than %v + %v", node.Name, decisionTimestamp, nc.nodeStatusMap[node.Name].readyTransitionTimestamp, nc.podEvictionTimeout-gracePeriod)
				}
			}
			if observedReadyCondition.Status == v1.ConditionTrue {
				if nc.cancelPodEviction(node) {
					glog.V(2).Infof("Node %s is ready again, cancelled pod eviction", node.Name)
				}
			}

			// Report node event.
			if currentReadyCondition.Status != v1.ConditionTrue && observedReadyCondition.Status == v1.ConditionTrue {
				recordNodeStatusChange(nc.recorder, node, "NodeNotReady")
				if err = markAllPodsNotReady(nc.kubeClient, node); err != nil {
					utilruntime.HandleError(fmt.Errorf("Unable to mark all pods NotReady on node %v: %v", node.Name, err))
				}
			}

			// Check with the cloud provider to see if the node still exists. If it
			// doesn't, delete the node immediately.
			if currentReadyCondition.Status != v1.ConditionTrue && nc.cloud != nil {
				exists, err := nc.nodeExistsInCloudProvider(types.NodeName(node.Name))
				if err != nil {
					glog.Errorf("Error determining if node %v exists in cloud: %v", node.Name, err)
					continue
				}
				if !exists {
					glog.V(2).Infof("Deleting node (no longer present in cloud provider): %s", node.Name)
					recordNodeEvent(nc.recorder, node.Name, string(node.UID), v1.EventTypeNormal, "DeletingNode", fmt.Sprintf("Deleting Node %v because it's not present according to cloud provider", node.Name))
					go func(nodeName string) {
						defer utilruntime.HandleCrash()
						// Kubelet is not reporting and Cloud Provider says node
						// is gone. Delete it without worrying about grace
						// periods.
						if err := forcefullyDeleteNode(nc.kubeClient, nodeName); err != nil {
							glog.Errorf("Unable to forcefully delete node %q: %v", nodeName, err)
						}
					}(node.Name)
				}
			}
		}
	}
	nc.handleDisruption(zoneToNodeConditions, &nodes)

	return nil
}
// Run starts an asynchronous loop that monitors the status of cluster nodes.
func (nc *NodeController) Run() {
	go nc.nodeController.Run(wait.NeverStop)
	go nc.podController.Run(wait.NeverStop)
	go nc.daemonSetController.Run(wait.NeverStop)
	if nc.internalPodInformer != nil {
		go nc.internalPodInformer.Run(wait.NeverStop)
	}

	go func() {
		defer utilruntime.HandleCrash()

		if !cache.WaitForCacheSync(wait.NeverStop, nc.nodeController.HasSynced, nc.podController.HasSynced, nc.daemonSetController.HasSynced) {
			utilruntime.HandleError(errors.New("NodeController timed out while waiting for informers to sync..."))
			return
		}

		// Incorporate the results of node status pushed from kubelet to master.
		go wait.Until(func() {
			if err := nc.monitorNodeStatus(); err != nil {
				glog.Errorf("Error monitoring node status: %v", err)
			}
		}, nc.nodeMonitorPeriod, wait.NeverStop)

		// Managing eviction of nodes:
		// 1. when we delete pods off a node, if the node was not empty at the time we then
		//    queue a termination watcher
		//    a. If we hit an error, retry deletion
		// 2. The terminator loop ensures that pods are eventually cleaned and we never
		//    terminate a pod in a time period less than nc.maximumGracePeriod. AddedAt
		//    is the time from which we measure "has this pod been terminating too long",
		//    after which we will delete the pod with grace period 0 (force delete).
		//    a. If we hit errors, retry instantly
		//    b. If there are no pods left terminating, exit
		//    c. If there are pods still terminating, wait for their estimated completion
		//       before retrying
		go wait.Until(func() {
			nc.evictorLock.Lock()
			defer nc.evictorLock.Unlock()
			for k := range nc.zonePodEvictor {
				nc.zonePodEvictor[k].Try(func(value TimedValue) (bool, time.Duration) {
					obj, exists, err := nc.nodeStore.GetByKey(value.Value)
					if err != nil {
						glog.Warningf("Failed to get Node %v from the nodeStore: %v", value.Value, err)
					} else if !exists {
						glog.Warningf("Node %v no longer present in nodeStore!", value.Value)
					} else {
						node, _ := obj.(*api.Node)
						zone := utilnode.GetZoneKey(node)
						EvictionsNumber.WithLabelValues(zone).Inc()
					}

					nodeUid, _ := value.UID.(string)
					remaining, err := deletePods(nc.kubeClient, nc.recorder, value.Value, nodeUid, nc.daemonSetStore)
					if err != nil {
						utilruntime.HandleError(fmt.Errorf("unable to evict node %q: %v", value.Value, err))
						return false, 0
					}

					if remaining {
						nc.zoneTerminationEvictor[k].Add(value.Value, value.UID)
					}
					return true, 0
				})
			}
		}, nodeEvictionPeriod, wait.NeverStop)

		// TODO: replace with a controller that ensures pods that are terminating complete
		// in a particular time period
		go wait.Until(func() {
			nc.evictorLock.Lock()
			defer nc.evictorLock.Unlock()
			for k := range nc.zoneTerminationEvictor {
				nc.zoneTerminationEvictor[k].Try(func(value TimedValue) (bool, time.Duration) {
					nodeUid, _ := value.UID.(string)
					completed, remaining, err := terminatePods(nc.kubeClient, nc.recorder, value.Value, nodeUid, value.AddedAt, nc.maximumGracePeriod)
					if err != nil {
						utilruntime.HandleError(fmt.Errorf("unable to terminate pods on node %q: %v", value.Value, err))
						return false, 0
					}

					if completed {
						glog.V(2).Infof("All pods terminated on %s", value.Value)
						recordNodeEvent(nc.recorder, value.Value, nodeUid, api.EventTypeNormal, "TerminatedAllPods", fmt.Sprintf("Terminated all Pods on Node %s.", value.Value))
						return true, 0
					}

					glog.V(2).Infof("Pods terminating since %s on %q, estimated completion %s", value.AddedAt, value.Value, remaining)
					// clamp very short intervals
					if remaining < nodeEvictionPeriod {
						remaining = nodeEvictionPeriod
					}
					return false, remaining
				})
			}
		}, nodeEvictionPeriod, wait.NeverStop)

		go wait.Until(func() {
			pods, err := nc.podStore.List(labels.Everything())
			if err != nil {
				utilruntime.HandleError(err)
				return
			}
			cleanupOrphanedPods(pods, nc.nodeStore.Store, nc.forcefullyDeletePod)
		}, 30*time.Second, wait.NeverStop)
	}()
}
// CalculateSpreadPriority spreads pods across hosts and zones, considering pods belonging to the same service or replication controller.
// When a pod is scheduled, it looks for services or RCs that match the pod, then finds existing pods that match those selectors.
// It favors nodes that have fewer existing matching pods.
// i.e. it pushes the scheduler towards a node where there's the smallest number of
// pods which match the same service selectors or RC selectors as the pod being scheduled.
// Where zone information is included on the nodes, it favors nodes in zones with fewer existing matching pods.
func (s *SelectorSpread) CalculateSpreadPriority(pod *api.Pod, nodeNameToInfo map[string]*schedulercache.NodeInfo, nodes []*api.Node) (schedulerapi.HostPriorityList, error) {
	selectors := make([]labels.Selector, 0, 3)
	if services, err := s.serviceLister.GetPodServices(pod); err == nil {
		for _, service := range services {
			selectors = append(selectors, labels.SelectorFromSet(service.Spec.Selector))
		}
	}
	if rcs, err := s.controllerLister.GetPodControllers(pod); err == nil {
		for _, rc := range rcs {
			selectors = append(selectors, labels.SelectorFromSet(rc.Spec.Selector))
		}
	}
	if rss, err := s.replicaSetLister.GetPodReplicaSets(pod); err == nil {
		for _, rs := range rss {
			if selector, err := unversioned.LabelSelectorAsSelector(rs.Spec.Selector); err == nil {
				selectors = append(selectors, selector)
			}
		}
	}

	// Count similar pods by node
	countsByNodeName := make(map[string]float32, len(nodes))
	countsByZone := make(map[string]float32, 10)
	maxCountByNodeName := float32(0)
	countsByNodeNameLock := sync.Mutex{}

	if len(selectors) > 0 {
		processNodeFunc := func(i int) {
			nodeName := nodes[i].Name
			count := float32(0)
			for _, nodePod := range nodeNameToInfo[nodeName].Pods() {
				if pod.Namespace != nodePod.Namespace {
					continue
				}
				// When we are replacing a failed pod, we often see the previous
				// deleted version while scheduling the replacement.
				// Ignore the previous deleted version for spreading purposes
				// (it can still be considered for resource restrictions etc.)
				if nodePod.DeletionTimestamp != nil {
					glog.V(4).Infof("skipping pending-deleted pod: %s/%s", nodePod.Namespace, nodePod.Name)
					continue
				}
				matches := false
				for _, selector := range selectors {
					if selector.Matches(labels.Set(nodePod.ObjectMeta.Labels)) {
						matches = true
						break
					}
				}
				if matches {
					count++
				}
			}
			zoneId := utilnode.GetZoneKey(nodes[i])

			countsByNodeNameLock.Lock()
			defer countsByNodeNameLock.Unlock()
			countsByNodeName[nodeName] = count
			if count > maxCountByNodeName {
				maxCountByNodeName = count
			}
			if zoneId != "" {
				countsByZone[zoneId] += count
			}
		}
		workqueue.Parallelize(16, len(nodes), processNodeFunc)
	}

	// Aggregate by-zone information
	// Compute the maximum number of pods hosted in any zone
	haveZones := len(countsByZone) != 0
	maxCountByZone := float32(0)
	for _, count := range countsByZone {
		if count > maxCountByZone {
			maxCountByZone = count
		}
	}

	result := make(schedulerapi.HostPriorityList, 0, len(nodes))
	//score int - scale of 0-maxPriority
	// 0 being the lowest priority and maxPriority being the highest
	for _, node := range nodes {
		// initializing to the default/max node score of maxPriority
		fScore := maxPriority
		if maxCountByNodeName > 0 {
			fScore = maxPriority * ((maxCountByNodeName - countsByNodeName[node.Name]) / maxCountByNodeName)
		}

		// If there is zone information present, incorporate it
		if haveZones {
			zoneId := utilnode.GetZoneKey(node)
			if zoneId != "" {
				zoneScore := maxPriority * ((maxCountByZone - countsByZone[zoneId]) / maxCountByZone)
				fScore = (fScore * (1.0 - zoneWeighting)) + (zoneWeighting * zoneScore)
			}
		}

		result = append(result, schedulerapi.HostPriority{Host: node.Name, Score: int(fScore)})
		if glog.V(10) {
			// We explicitly don't do glog.V(10).Infof() to avoid computing all the parameters if this is
			// not logged. There is visible performance gain from it.
			glog.V(10).Infof(
				"%v -> %v: SelectorSpreadPriority, Score: (%d)", pod.Name, node.Name, int(fScore),
			)
		}
	}
	return result, nil
}
Example #11
0
// monitorNodeStatus verifies node status are constantly updated by kubelet, and if not,
// post "NodeReady==ConditionUnknown". It also evicts all pods if node is not ready or
// not reachable for a long period of time.
func (nc *NodeController) monitorNodeStatus() error {
	nodes, err := nc.kubeClient.Core().Nodes().List(api.ListOptions{})
	if err != nil {
		return err
	}
	added, deleted := nc.checkForNodeAddedDeleted(nodes)
	for i := range added {
		glog.V(1).Infof("NodeController observed a new Node: %#v", added[i].Name)
		recordNodeEvent(nc.recorder, added[i].Name, api.EventTypeNormal, "RegisteredNode", fmt.Sprintf("Registered Node %v in NodeController", added[i].Name))
		nc.knownNodeSet[added[i].Name] = added[i]
		// When adding new Nodes we need to check if new zone appeared, and if so add new evictor.
		zone := utilnode.GetZoneKey(added[i])
		if _, found := nc.zonePodEvictor[zone]; !found {
			nc.zonePodEvictor[zone] =
				NewRateLimitedTimedQueue(
					flowcontrol.NewTokenBucketRateLimiter(nc.evictionLimiterQPS, evictionRateLimiterBurst))
		}
		if _, found := nc.zoneTerminationEvictor[zone]; !found {
			nc.zoneTerminationEvictor[zone] = NewRateLimitedTimedQueue(
				flowcontrol.NewTokenBucketRateLimiter(nc.evictionLimiterQPS, evictionRateLimiterBurst))
		}
		nc.cancelPodEviction(added[i])
	}

	for i := range deleted {
		glog.V(1).Infof("NodeController observed a Node deletion: %v", deleted[i].Name)
		recordNodeEvent(nc.recorder, deleted[i].Name, api.EventTypeNormal, "RemovingNode", fmt.Sprintf("Removing Node %v from NodeController", deleted[i].Name))
		nc.evictPods(deleted[i])
		delete(nc.knownNodeSet, deleted[i].Name)
	}

	zoneToNodeConditions := map[string][]*api.NodeCondition{}
	for i := range nodes.Items {
		var gracePeriod time.Duration
		var observedReadyCondition api.NodeCondition
		var currentReadyCondition *api.NodeCondition
		node := &nodes.Items[i]
		for rep := 0; rep < nodeStatusUpdateRetry; rep++ {
			gracePeriod, observedReadyCondition, currentReadyCondition, err = nc.tryUpdateNodeStatus(node)
			if err == nil {
				break
			}
			name := node.Name
			node, err = nc.kubeClient.Core().Nodes().Get(name)
			if err != nil {
				glog.Errorf("Failed while getting a Node to retry updating NodeStatus. Probably Node %s was deleted.", name)
				break
			}
		}
		if err != nil {
			glog.Errorf("Update status  of Node %v from NodeController exceeds retry count."+
				"Skipping - no pods will be evicted.", node.Name)
			continue
		}
		// We do not treat a master node as a part of the cluster for network segmentation checking.
		if !system.IsMasterNode(node) {
			zoneToNodeConditions[utilnode.GetZoneKey(node)] = append(zoneToNodeConditions[utilnode.GetZoneKey(node)], currentReadyCondition)
		}

		decisionTimestamp := nc.now()
		if currentReadyCondition != nil {
			// Check eviction timeout against decisionTimestamp
			if observedReadyCondition.Status == api.ConditionFalse &&
				decisionTimestamp.After(nc.nodeStatusMap[node.Name].readyTransitionTimestamp.Add(nc.podEvictionTimeout)) {
				if nc.evictPods(node) {
					glog.V(4).Infof("Evicting pods on node %s: %v is later than %v + %v", node.Name, decisionTimestamp, nc.nodeStatusMap[node.Name].readyTransitionTimestamp, nc.podEvictionTimeout)
				}
			}
			if observedReadyCondition.Status == api.ConditionUnknown &&
				decisionTimestamp.After(nc.nodeStatusMap[node.Name].probeTimestamp.Add(nc.podEvictionTimeout)) {
				if nc.evictPods(node) {
					glog.V(4).Infof("Evicting pods on node %s: %v is later than %v + %v", node.Name, decisionTimestamp, nc.nodeStatusMap[node.Name].readyTransitionTimestamp, nc.podEvictionTimeout-gracePeriod)
				}
			}
			if observedReadyCondition.Status == api.ConditionTrue {
				if nc.cancelPodEviction(node) {
					glog.V(2).Infof("Node %s is ready again, cancelled pod eviction", node.Name)
				}
			}

			// Report node event.
			if currentReadyCondition.Status != api.ConditionTrue && observedReadyCondition.Status == api.ConditionTrue {
				recordNodeStatusChange(nc.recorder, node, "NodeNotReady")
				if err = markAllPodsNotReady(nc.kubeClient, node.Name); err != nil {
					utilruntime.HandleError(fmt.Errorf("Unable to mark all pods NotReady on node %v: %v", node.Name, err))
				}
			}

			// Check with the cloud provider to see if the node still exists. If it
			// doesn't, delete the node immediately.
			if currentReadyCondition.Status != api.ConditionTrue && nc.cloud != nil {
				exists, err := nc.nodeExistsInCloudProvider(node.Name)
				if err != nil {
					glog.Errorf("Error determining if node %v exists in cloud: %v", node.Name, err)
					continue
				}
				if !exists {
					glog.V(2).Infof("Deleting node (no longer present in cloud provider): %s", node.Name)
					recordNodeEvent(nc.recorder, node.Name, api.EventTypeNormal, "DeletingNode", fmt.Sprintf("Deleting Node %v because it's not present according to cloud provider", node.Name))
					go func(nodeName string) {
						defer utilruntime.HandleCrash()
						// Kubelet is not reporting and Cloud Provider says node
						// is gone. Delete it without worrying about grace
						// periods.
						if err := forcefullyDeleteNode(nc.kubeClient, nodeName, nc.forcefullyDeletePod); err != nil {
							glog.Errorf("Unable to forcefully delete node %q: %v", nodeName, err)
						}
					}(node.Name)
					continue
				}
			}
		}
	}

	for k, v := range zoneToNodeConditions {
		newState := nc.computeZoneStateFunc(v)
		if newState == nc.zoneStates[k] {
			continue
		}
		if newState == stateFullSegmentation {
			glog.V(2).Infof("NodeController is entering network segmentation mode in zone %v.", k)
		} else if newState == stateNormal {
			glog.V(2).Infof("NodeController exited network segmentation mode in zone %v.", k)
		}
		for i := range nodes.Items {
			if utilnode.GetZoneKey(&nodes.Items[i]) == k {
				if newState == stateFullSegmentation {
					// When zone is fully segmented we stop the eviction all together.
					nc.cancelPodEviction(&nodes.Items[i])
				}
				if newState == stateNormal && nc.zoneStates[k] == stateFullSegmentation {
					// When exiting segmentation mode update probe timestamps on all Nodes.
					now := nc.now()
					v := nc.nodeStatusMap[nodes.Items[i].Name]
					v.probeTimestamp = now
					v.readyTransitionTimestamp = now
					nc.nodeStatusMap[nodes.Items[i].Name] = v
				}
			}
		}
		nc.zoneStates[k] = newState
	}

	return nil
}
Example #12
0
// CalculateSpreadPriority spreads pods across hosts and zones, considering pods belonging to the same service or replication controller.
// When a pod is scheduled, it looks for services or RCs that match the pod, then finds existing pods that match those selectors.
// It favors nodes that have fewer existing matching pods.
// i.e. it pushes the scheduler towards a node where there's the smallest number of
// pods which match the same service selectors or RC selectors as the pod being scheduled.
// Where zone information is included on the nodes, it favors nodes in zones with fewer existing matching pods.
func (s *SelectorSpread) CalculateSpreadPriority(pod *api.Pod, nodeNameToInfo map[string]*schedulercache.NodeInfo, nodeLister algorithm.NodeLister) (schedulerapi.HostPriorityList, error) {
	selectors := make([]labels.Selector, 0)
	services, err := s.serviceLister.GetPodServices(pod)
	if err == nil {
		for _, service := range services {
			selectors = append(selectors, labels.SelectorFromSet(service.Spec.Selector))
		}
	}
	rcs, err := s.controllerLister.GetPodControllers(pod)
	if err == nil {
		for _, rc := range rcs {
			selectors = append(selectors, labels.SelectorFromSet(rc.Spec.Selector))
		}
	}
	rss, err := s.replicaSetLister.GetPodReplicaSets(pod)
	if err == nil {
		for _, rs := range rss {
			if selector, err := unversioned.LabelSelectorAsSelector(rs.Spec.Selector); err == nil {
				selectors = append(selectors, selector)
			}
		}
	}

	nodes, err := nodeLister.List()
	if err != nil {
		return nil, err
	}

	// Count similar pods by node
	countsByNodeName := map[string]int{}
	countsByNodeNameLock := sync.Mutex{}

	if len(selectors) > 0 {
		// Create a number of go-routines that will be computing number
		// of "similar" pods for given nodes.
		workers := 16
		toProcess := make(chan string, len(nodes))
		for i := range nodes {
			toProcess <- nodes[i].Name
		}
		close(toProcess)

		// TODO: Use Parallelize.
		wg := sync.WaitGroup{}
		wg.Add(workers)
		for i := 0; i < workers; i++ {
			go func() {
				defer utilruntime.HandleCrash()
				defer wg.Done()
				for {
					nodeName, ok := <-toProcess
					if !ok {
						return
					}
					count := 0
					for _, nodePod := range nodeNameToInfo[nodeName].Pods() {
						if pod.Namespace != nodePod.Namespace {
							continue
						}
						// When we are replacing a failed pod, we often see the previous
						// deleted version while scheduling the replacement.
						// Ignore the previous deleted version for spreading purposes
						// (it can still be considered for resource restrictions etc.)
						if nodePod.DeletionTimestamp != nil {
							glog.V(4).Infof("skipping pending-deleted pod: %s/%s", nodePod.Namespace, nodePod.Name)
							continue
						}
						matches := false
						for _, selector := range selectors {
							if selector.Matches(labels.Set(nodePod.ObjectMeta.Labels)) {
								matches = true
								break
							}
						}
						if matches {
							count++
						}
					}

					func() {
						countsByNodeNameLock.Lock()
						defer countsByNodeNameLock.Unlock()
						countsByNodeName[nodeName] = count
					}()
				}
			}()
		}
		wg.Wait()
	}

	// Aggregate by-node information
	// Compute the maximum number of pods hosted on any node
	maxCountByNodeName := 0
	for _, count := range countsByNodeName {
		if count > maxCountByNodeName {
			maxCountByNodeName = count
		}
	}

	// Count similar pods by zone, if zone information is present
	countsByZone := map[string]int{}
	for _, node := range nodes {
		count, found := countsByNodeName[node.Name]
		if !found {
			continue
		}

		zoneId := utilnode.GetZoneKey(node)
		if zoneId == "" {
			continue
		}

		countsByZone[zoneId] += count
	}

	// Aggregate by-zone information
	// Compute the maximum number of pods hosted in any zone
	haveZones := len(countsByZone) != 0
	maxCountByZone := 0
	for _, count := range countsByZone {
		if count > maxCountByZone {
			maxCountByZone = count
		}
	}

	result := make(schedulerapi.HostPriorityList, 0, len(nodes))
	//score int - scale of 0-maxPriority
	// 0 being the lowest priority and maxPriority being the highest
	for _, node := range nodes {
		// initializing to the default/max node score of maxPriority
		fScore := float32(maxPriority)
		if maxCountByNodeName > 0 {
			fScore = maxPriority * (float32(maxCountByNodeName-countsByNodeName[node.Name]) / float32(maxCountByNodeName))
		}

		// If there is zone information present, incorporate it
		if haveZones {
			zoneId := utilnode.GetZoneKey(node)
			if zoneId != "" {
				zoneScore := maxPriority * (float32(maxCountByZone-countsByZone[zoneId]) / float32(maxCountByZone))
				fScore = (fScore * (1.0 - zoneWeighting)) + (zoneWeighting * zoneScore)
			}
		}

		result = append(result, schedulerapi.HostPriority{Host: node.Name, Score: int(fScore)})
		glog.V(10).Infof(
			"%v -> %v: SelectorSpreadPriority, Score: (%d)", pod.Name, node.Name, int(fScore),
		)
	}
	return result, nil
}