Example #1
0
// NewNodeController returns a new node controller to sync instances from cloudprovider.
// This method returns an error if it is unable to initialize the CIDR bitmap with
// podCIDRs it has already allocated to nodes. Since we don't allow podCIDR changes
// currently, this should be handled as a fatal error.
func NewNodeController(
	podInformer informers.PodInformer,
	nodeInformer informers.NodeInformer,
	daemonSetInformer informers.DaemonSetInformer,
	cloud cloudprovider.Interface,
	kubeClient clientset.Interface,
	podEvictionTimeout time.Duration,
	evictionLimiterQPS float32,
	secondaryEvictionLimiterQPS float32,
	largeClusterThreshold int32,
	unhealthyZoneThreshold float32,
	nodeMonitorGracePeriod time.Duration,
	nodeStartupGracePeriod time.Duration,
	nodeMonitorPeriod time.Duration,
	clusterCIDR *net.IPNet,
	serviceCIDR *net.IPNet,
	nodeCIDRMaskSize int,
	allocateNodeCIDRs bool) (*NodeController, error) {
	eventBroadcaster := record.NewBroadcaster()
	recorder := eventBroadcaster.NewRecorder(v1.EventSource{Component: "controllermanager"})
	eventBroadcaster.StartLogging(glog.Infof)
	if kubeClient != nil {
		glog.V(0).Infof("Sending events to api server.")
		eventBroadcaster.StartRecordingToSink(&v1core.EventSinkImpl{Interface: kubeClient.Core().Events("")})
	} else {
		glog.V(0).Infof("No api server defined - no events will be sent to API server.")
	}

	if kubeClient != nil && kubeClient.Core().RESTClient().GetRateLimiter() != nil {
		metrics.RegisterMetricAndTrackRateLimiterUsage("node_controller", kubeClient.Core().RESTClient().GetRateLimiter())
	}

	if allocateNodeCIDRs {
		if clusterCIDR == nil {
			glog.Fatal("NodeController: Must specify clusterCIDR if allocateNodeCIDRs == true.")
		}
		mask := clusterCIDR.Mask
		if maskSize, _ := mask.Size(); maskSize > nodeCIDRMaskSize {
			glog.Fatal("NodeController: Invalid clusterCIDR, mask size of clusterCIDR must be less than nodeCIDRMaskSize.")
		}
	}

	nc := &NodeController{
		cloud:                       cloud,
		knownNodeSet:                make(map[string]*v1.Node),
		kubeClient:                  kubeClient,
		recorder:                    recorder,
		podEvictionTimeout:          podEvictionTimeout,
		maximumGracePeriod:          5 * time.Minute,
		zonePodEvictor:              make(map[string]*RateLimitedTimedQueue),
		nodeStatusMap:               make(map[string]nodeStatusData),
		nodeMonitorGracePeriod:      nodeMonitorGracePeriod,
		nodeMonitorPeriod:           nodeMonitorPeriod,
		nodeStartupGracePeriod:      nodeStartupGracePeriod,
		lookupIP:                    net.LookupIP,
		now:                         metav1.Now,
		clusterCIDR:                 clusterCIDR,
		serviceCIDR:                 serviceCIDR,
		allocateNodeCIDRs:           allocateNodeCIDRs,
		forcefullyDeletePod:         func(p *v1.Pod) error { return forcefullyDeletePod(kubeClient, p) },
		nodeExistsInCloudProvider:   func(nodeName types.NodeName) (bool, error) { return nodeExistsInCloudProvider(cloud, nodeName) },
		evictionLimiterQPS:          evictionLimiterQPS,
		secondaryEvictionLimiterQPS: secondaryEvictionLimiterQPS,
		largeClusterThreshold:       largeClusterThreshold,
		unhealthyZoneThreshold:      unhealthyZoneThreshold,
		zoneStates:                  make(map[string]zoneState),
		podInformer:                 podInformer,
		nodeInformer:                nodeInformer,
		daemonSetInformer:           daemonSetInformer,
	}
	nc.enterPartialDisruptionFunc = nc.ReducedQPSFunc
	nc.enterFullDisruptionFunc = nc.HealthyQPSFunc
	nc.computeZoneStateFunc = nc.ComputeZoneState

	podInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{
		AddFunc:    nc.maybeDeleteTerminatingPod,
		UpdateFunc: func(_, obj interface{}) { nc.maybeDeleteTerminatingPod(obj) },
	})
	nc.podStore = *podInformer.Lister()

	nodeEventHandlerFuncs := cache.ResourceEventHandlerFuncs{}
	if nc.allocateNodeCIDRs {
		var nodeList *v1.NodeList
		var err error
		// We must poll because apiserver might not be up. This error causes
		// controller manager to restart.
		if pollErr := wait.Poll(10*time.Second, apiserverStartupGracePeriod, func() (bool, error) {
			nodeList, err = kubeClient.Core().Nodes().List(v1.ListOptions{
				FieldSelector: fields.Everything().String(),
				LabelSelector: labels.Everything().String(),
			})
			if err != nil {
				glog.Errorf("Failed to list all nodes: %v", err)
				return false, nil
			}
			return true, nil
		}); pollErr != nil {
			return nil, fmt.Errorf("Failed to list all nodes in %v, cannot proceed without updating CIDR map", apiserverStartupGracePeriod)
		}
		nc.cidrAllocator, err = NewCIDRRangeAllocator(kubeClient, clusterCIDR, serviceCIDR, nodeCIDRMaskSize, nodeList)
		if err != nil {
			return nil, err
		}

		nodeEventHandlerFuncs = cache.ResourceEventHandlerFuncs{
			AddFunc: func(originalObj interface{}) {
				obj, err := api.Scheme.DeepCopy(originalObj)
				if err != nil {
					utilruntime.HandleError(err)
					return
				}
				node := obj.(*v1.Node)

				if err := nc.cidrAllocator.AllocateOrOccupyCIDR(node); err != nil {
					utilruntime.HandleError(fmt.Errorf("Error allocating CIDR: %v", err))
				}
			},
			UpdateFunc: func(_, obj interface{}) {
				node := obj.(*v1.Node)
				// If the PodCIDR is not empty we either:
				// - already processed a Node that already had a CIDR after NC restarted
				//   (cidr is marked as used),
				// - already processed a Node successfully and allocated a CIDR for it
				//   (cidr is marked as used),
				// - already processed a Node but we did saw a "timeout" response and
				//   request eventually got through in this case we haven't released
				//   the allocated CIDR (cidr is still marked as used).
				// There's a possible error here:
				// - NC sees a new Node and assigns a CIDR X to it,
				// - Update Node call fails with a timeout,
				// - Node is updated by some other component, NC sees an update and
				//   assigns CIDR Y to the Node,
				// - Both CIDR X and CIDR Y are marked as used in the local cache,
				//   even though Node sees only CIDR Y
				// The problem here is that in in-memory cache we see CIDR X as marked,
				// which prevents it from being assigned to any new node. The cluster
				// state is correct.
				// Restart of NC fixes the issue.
				if node.Spec.PodCIDR == "" {
					nodeCopy, err := api.Scheme.Copy(node)
					if err != nil {
						utilruntime.HandleError(err)
						return
					}

					if err := nc.cidrAllocator.AllocateOrOccupyCIDR(nodeCopy.(*v1.Node)); err != nil {
						utilruntime.HandleError(fmt.Errorf("Error allocating CIDR: %v", err))
					}
				}
			},
			DeleteFunc: func(originalObj interface{}) {
				obj, err := api.Scheme.DeepCopy(originalObj)
				if err != nil {
					utilruntime.HandleError(err)
					return
				}

				node, isNode := obj.(*v1.Node)
				// We can get DeletedFinalStateUnknown instead of *v1.Node here and we need to handle that correctly. #34692
				if !isNode {
					deletedState, ok := obj.(cache.DeletedFinalStateUnknown)
					if !ok {
						glog.Errorf("Received unexpected object: %v", obj)
						return
					}
					node, ok = deletedState.Obj.(*v1.Node)
					if !ok {
						glog.Errorf("DeletedFinalStateUnknown contained non-Node object: %v", deletedState.Obj)
						return
					}
				}
				if err := nc.cidrAllocator.ReleaseCIDR(node); err != nil {
					glog.Errorf("Error releasing CIDR: %v", err)
				}
			},
		}
	}

	nodeInformer.Informer().AddEventHandler(nodeEventHandlerFuncs)
	nc.nodeStore = *nodeInformer.Lister()

	nc.daemonSetStore = *daemonSetInformer.Lister()

	return nc, nil
}
func NewDaemonSetsController(daemonSetInformer informers.DaemonSetInformer, podInformer informers.PodInformer, nodeInformer informers.NodeInformer, kubeClient clientset.Interface, lookupCacheSize int) *DaemonSetsController {
	eventBroadcaster := record.NewBroadcaster()
	eventBroadcaster.StartLogging(glog.Infof)
	// TODO: remove the wrapper when every clients have moved to use the clientset.
	eventBroadcaster.StartRecordingToSink(&v1core.EventSinkImpl{Interface: kubeClient.Core().Events("")})

	if kubeClient != nil && kubeClient.Core().RESTClient().GetRateLimiter() != nil {
		metrics.RegisterMetricAndTrackRateLimiterUsage("daemon_controller", kubeClient.Core().RESTClient().GetRateLimiter())
	}
	dsc := &DaemonSetsController{
		kubeClient:    kubeClient,
		eventRecorder: eventBroadcaster.NewRecorder(v1.EventSource{Component: "daemonset-controller"}),
		podControl: controller.RealPodControl{
			KubeClient: kubeClient,
			Recorder:   eventBroadcaster.NewRecorder(v1.EventSource{Component: "daemon-set"}),
		},
		burstReplicas: BurstReplicas,
		expectations:  controller.NewControllerExpectations(),
		queue:         workqueue.NewNamedRateLimitingQueue(workqueue.DefaultControllerRateLimiter(), "daemonset"),
	}

	daemonSetInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{
		AddFunc: func(obj interface{}) {
			ds := obj.(*extensions.DaemonSet)
			glog.V(4).Infof("Adding daemon set %s", ds.Name)
			dsc.enqueueDaemonSet(ds)
		},
		UpdateFunc: func(old, cur interface{}) {
			oldDS := old.(*extensions.DaemonSet)
			curDS := cur.(*extensions.DaemonSet)
			// We should invalidate the whole lookup cache if a DS's selector has been updated.
			//
			// Imagine that you have two RSs:
			// * old DS1
			// * new DS2
			// You also have a pod that is attached to DS2 (because it doesn't match DS1 selector).
			// Now imagine that you are changing DS1 selector so that it is now matching that pod,
			// in such case we must invalidate the whole cache so that pod could be adopted by DS1
			//
			// This makes the lookup cache less helpful, but selector update does not happen often,
			// so it's not a big problem
			if !reflect.DeepEqual(oldDS.Spec.Selector, curDS.Spec.Selector) {
				dsc.lookupCache.InvalidateAll()
			}

			glog.V(4).Infof("Updating daemon set %s", oldDS.Name)
			dsc.enqueueDaemonSet(curDS)
		},
		DeleteFunc: dsc.deleteDaemonset,
	})
	dsc.dsStore = daemonSetInformer.Lister()
	dsc.dsStoreSynced = daemonSetInformer.Informer().HasSynced

	// Watch for creation/deletion of pods. The reason we watch is that we don't want a daemon set to create/delete
	// more pods until all the effects (expectations) of a daemon set's create/delete have been observed.
	podInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{
		AddFunc:    dsc.addPod,
		UpdateFunc: dsc.updatePod,
		DeleteFunc: dsc.deletePod,
	})
	dsc.podStore = podInformer.Lister()
	dsc.podStoreSynced = podInformer.Informer().HasSynced

	nodeInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{
		AddFunc:    dsc.addNode,
		UpdateFunc: dsc.updateNode,
	},
	)
	dsc.nodeStoreSynced = nodeInformer.Informer().HasSynced
	dsc.nodeStore = nodeInformer.Lister()

	dsc.syncHandler = dsc.syncDaemonSet
	dsc.lookupCache = controller.NewMatchingCache(lookupCacheSize)
	return dsc
}