Example #1
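// NewMetricsGrabber builds a MetricsGrabber for the given client. It looks for a
// registered master Node and, if none is found, disables grabbing metrics from the
// Scheduler and ControllerManager.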
func NewMetricsGrabber(c *client.Client, kubelets bool, scheduler bool, controllers bool, apiServer bool) (*MetricsGrabber, error) {
	registeredMaster := false
	masterName := ""
	nodeList, err := c.Nodes().List(api.ListOptions{})
	if err != nil {
		return nil, err
	}
	if len(nodeList.Items) < 1 {
		glog.Warning("Can't find any Nodes in the API server to grab metrics from")
	}
	for _, node := range nodeList.Items {
		if system.IsMasterNode(&node) {
			registeredMaster = true
			masterName = node.Name
			break
		}
	}
	if !registeredMaster {
		scheduler = false
		controllers = false
		glog.Warningf("Master node is not registered. Grabbing metrics from Scheduler and ControllerManager is disabled.")
	}

	return &MetricsGrabber{
		client:                    c,
		grabFromApiServer:         apiServer,
		grabFromControllerManager: controllers,
		grabFromKubelets:          kubelets,
		grabFromScheduler:         scheduler,
		masterName:                masterName,
		registeredMaster:          registeredMaster,
	}, nil
}
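A minimal usage sketch, assuming a configured *client.Client named c is already available; the Grab call below is a hypothetical illustration of how the returned grabber might be consumed, not part of the example above:

// Request metrics from every component; the constructor itself disables
// scheduler/controller-manager grabbing when no master Node is registered.
grabber, err := NewMetricsGrabber(c, true, true, true, true)
if err != nil {
	glog.Fatalf("Failed to create MetricsGrabber: %v", err)
}
// Hypothetical consumption; the concrete Grab* methods depend on the
// surrounding test framework.
metrics, err := grabber.Grab()
if err != nil {
	glog.Errorf("Failed to grab metrics: %v", err)
}
_ = metrics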
Example #2
// monitorNodeStatus verifies that node statuses are constantly updated by kubelet, and if not,
// posts "NodeReady==ConditionUnknown". It also evicts all pods if a node is not ready or
// not reachable for a long period of time.
func (nc *NodeController) monitorNodeStatus() error {
	nodes, err := nc.kubeClient.Core().Nodes().List(api.ListOptions{})
	if err != nil {
		return err
	}
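	// Register Nodes we haven't seen before and cancel any eviction that may
	// have been scheduled for them while they were unknown.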
	for _, node := range nodes.Items {
		if !nc.knownNodeSet.Has(node.Name) {
			glog.V(1).Infof("NodeController observed a new Node: %#v", node)
			nc.recordNodeEvent(node.Name, api.EventTypeNormal, "RegisteredNode", fmt.Sprintf("Registered Node %v in NodeController", node.Name))
			nc.cancelPodEviction(node.Name)
			nc.knownNodeSet.Insert(node.Name)
		}
	}
	// If there's a difference between lengths of known Nodes and observed nodes
	// we must have removed some Node.
	if len(nc.knownNodeSet) != len(nodes.Items) {
		observedSet := make(sets.String)
		for _, node := range nodes.Items {
			observedSet.Insert(node.Name)
		}
		deleted := nc.knownNodeSet.Difference(observedSet)
		for nodeName := range deleted {
			glog.V(1).Infof("NodeController observed a Node deletion: %v", nodeName)
			nc.recordNodeEvent(nodeName, api.EventTypeNormal, "RemovingNode", fmt.Sprintf("Removing Node %v from NodeController", nodeName))
			nc.evictPods(nodeName)
			nc.knownNodeSet.Delete(nodeName)
		}
	}

	seenReady := false
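	// For each Node, refresh its status (with a few retries) and decide whether
	// its pods should be evicted or a pending eviction cancelled.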
	for i := range nodes.Items {
		var gracePeriod time.Duration
		var observedReadyCondition api.NodeCondition
		var currentReadyCondition *api.NodeCondition
		node := &nodes.Items[i]
		for rep := 0; rep < nodeStatusUpdateRetry; rep++ {
			gracePeriod, observedReadyCondition, currentReadyCondition, err = nc.tryUpdateNodeStatus(node)
			if err == nil {
				break
			}
			name := node.Name
			node, err = nc.kubeClient.Core().Nodes().Get(name)
			if err != nil {
				glog.Errorf("Failed while getting a Node to retry updating NodeStatus. Probably Node %s was deleted.", name)
				break
			}
		}
		if err != nil {
			glog.Errorf("Update status  of Node %v from NodeController exceeds retry count."+
				"Skipping - no pods will be evicted.", node.Name)
			continue
		}

		decisionTimestamp := nc.now()

		if currentReadyCondition != nil {
			// Check eviction timeout against decisionTimestamp
			if observedReadyCondition.Status == api.ConditionFalse &&
				decisionTimestamp.After(nc.nodeStatusMap[node.Name].readyTransitionTimestamp.Add(nc.podEvictionTimeout)) {
				if nc.evictPods(node.Name) {
					glog.V(4).Infof("Evicting pods on node %s: %v is later than %v + %v", node.Name, decisionTimestamp, nc.nodeStatusMap[node.Name].readyTransitionTimestamp, nc.podEvictionTimeout)
				}
			}
			if observedReadyCondition.Status == api.ConditionUnknown &&
				decisionTimestamp.After(nc.nodeStatusMap[node.Name].probeTimestamp.Add(nc.podEvictionTimeout)) {
				if nc.evictPods(node.Name) {
					glog.V(4).Infof("Evicting pods on node %s: %v is later than %v + %v", node.Name, decisionTimestamp, nc.nodeStatusMap[node.Name].readyTransitionTimestamp, nc.podEvictionTimeout-gracePeriod)
				}
			}
			if observedReadyCondition.Status == api.ConditionTrue {
				// We do not treat a master node as a part of the cluster for network segmentation checking.
				if !system.IsMasterNode(node) {
					seenReady = true
				}
				if nc.cancelPodEviction(node.Name) {
					glog.V(2).Infof("Node %s is ready again, cancelled pod eviction", node.Name)
				}
			}

			// Report node event.
			if currentReadyCondition.Status != api.ConditionTrue && observedReadyCondition.Status == api.ConditionTrue {
				nc.recordNodeStatusChange(node, "NodeNotReady")
				if err = nc.markAllPodsNotReady(node.Name); err != nil {
					utilruntime.HandleError(fmt.Errorf("Unable to mark all pods NotReady on node %v: %v", node.Name, err))
				}
			}

			// Check with the cloud provider to see if the node still exists. If it
			// doesn't, delete the node immediately.
			if currentReadyCondition.Status != api.ConditionTrue && nc.cloud != nil {
				exists, err := nc.nodeExistsInCloudProvider(node.Name)
				if err != nil {
					glog.Errorf("Error determining if node %v exists in cloud: %v", node.Name, err)
					continue
				}
				if !exists {
					glog.V(2).Infof("Deleting node (no longer present in cloud provider): %s", node.Name)
					nc.recordNodeEvent(node.Name, api.EventTypeNormal, "DeletingNode", fmt.Sprintf("Deleting Node %v because it's not present according to cloud provider", node.Name))
					go func(nodeName string) {
						defer utilruntime.HandleCrash()
						// Kubelet is not reporting and Cloud Provider says node
						// is gone. Delete it without worrying about grace
						// periods.
						if err := nc.forcefullyDeleteNode(nodeName); err != nil {
							glog.Errorf("Unable to forcefully delete node %q: %v", nodeName, err)
						}
					}(node.Name)
					continue
				}
			}
		}
	}

	// NC doesn't see any Ready Node. We assume that the network is segmented and Nodes cannot
	// connect to the API server to update their statuses. NC enters network segmentation mode
	// and cancels all evictions in progress.
	if !seenReady {
		nc.networkSegmentationMode = true
		nc.stopAllPodEvictions()
		glog.V(2).Info("NodeController is entering network segmentation mode.")
	} else {
		if nc.networkSegmentationMode {
			nc.forceUpdateAllProbeTimes()
			nc.networkSegmentationMode = false
			glog.V(2).Info("NodeController exited network segmentation mode.")
		}
	}
	return nil
}
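The deleted-Node detection above hinges on diffing the set of previously known names against the names observed in the latest List call. A standalone sketch of the same pattern, using a plain map in place of the k8s.io sets.String type:

package main

import "fmt"

// diffKnown returns the names present in known but missing from observed,
// mirroring knownNodeSet.Difference(observedSet) in monitorNodeStatus.
func diffKnown(known map[string]struct{}, observed []string) []string {
	observedSet := make(map[string]struct{}, len(observed))
	for _, name := range observed {
		observedSet[name] = struct{}{}
	}
	var deleted []string
	for name := range known {
		if _, ok := observedSet[name]; !ok {
			deleted = append(deleted, name)
		}
	}
	return deleted
}

func main() {
	known := map[string]struct{}{"node-a": {}, "node-b": {}, "node-c": {}}
	observed := []string{"node-a", "node-c"} // node-b vanished from the List result
	fmt.Println(diffKnown(known, observed))  // prints [node-b]
}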