Example #1
// When a pod is deleted, enqueue the job that manages the pod and update its expectations.
// obj could be an *api.Pod, or a cache.DeletedFinalStateUnknown marker item
// (a tombstone) whose Obj field holds the pod.
// NOTE(review): this excerpt is truncated -- the closing braces and whatever
// follows each error log (presumably an early return) are not visible here,
// nor is the code that actually enqueues the job.
func (jm *JobController) deletePod(obj interface{}) {
	pod, ok := obj.(*api.Pod)

	// When a delete is dropped, the relist will notice a pod in the store not
	// in the list, leading to the insertion of a tombstone object which contains
	// the deleted key/value. Note that this value might be stale. If the pod
	// changed labels the new job will not be woken up till the periodic resync.
	if !ok {
		// Not a pod: the only other accepted shape is a tombstone.
		tombstone, ok := obj.(cache.DeletedFinalStateUnknown)
		if !ok {
			glog.Errorf("Couldn't get object from tombstone %+v", obj)
		// Unwrap the pod carried by the tombstone.
		pod, ok = tombstone.Obj.(*api.Pod)
		if !ok {
			glog.Errorf("Tombstone contained object that is not a pod %+v", obj)
	// Look up the job that manages this pod and compute its key.
	if job := jm.getPodJob(pod); job != nil {
		jobKey, err := controller.KeyFunc(job)
		if err != nil {
			glog.Errorf("Couldn't get key for job %#v: %v", job, err)
Example #2
// deletePod handles a pod deletion notification for the daemon set controller.
// obj is either an *api.Pod or a cache.DeletedFinalStateUnknown tombstone whose
// Obj field holds the pod; any other shape is logged as an error. Once the pod
// is recovered, the daemon set returned by getPodDaemonSet is looked up and its
// key is computed with controller.KeyFunc.
// NOTE(review): this excerpt is truncated -- closing braces and the statements
// after each error log are not visible here.
func (dsc *DaemonSetsController) deletePod(obj interface{}) {
	pod, ok := obj.(*api.Pod)
	// When a delete is dropped, the relist will notice a pod in the store not
	// in the list, leading to the insertion of a tombstone object which contains
	// the deleted key/value. Note that this value might be stale. If the pod
	// changed labels the new daemonset will not be woken up till the periodic
	// resync.
	if !ok {
		// Not a pod: the only other accepted shape is a tombstone.
		tombstone, ok := obj.(cache.DeletedFinalStateUnknown)
		if !ok {
			glog.Errorf("Couldn't get object from tombstone %+v", obj)
		// Unwrap the pod carried by the tombstone.
		pod, ok = tombstone.Obj.(*api.Pod)
		if !ok {
			glog.Errorf("Tombstone contained object that is not a pod %+v", obj)
	glog.V(4).Infof("Pod %s deleted.", pod.Name)
	// Find the daemon set associated with the pod and compute its key.
	if ds := dsc.getPodDaemonSet(pod); ds != nil {
		dsKey, err := controller.KeyFunc(ds)
		if err != nil {
			glog.Errorf("Couldn't get key for object %+v: %v", ds, err)
Example #3
// When a pod is deleted, enqueue the replica set that manages the pod and update its expectations.
// obj could be an *api.Pod, or a cache.DeletedFinalStateUnknown marker item
// (a tombstone) whose Obj field holds the pod.
// NOTE(review): this excerpt is truncated -- closing braces, the statements
// after each error log, and the enqueue call itself are not visible here.
func (rsc *ReplicaSetController) deletePod(obj interface{}) {
	pod, ok := obj.(*api.Pod)

	// When a delete is dropped, the relist will notice a pod in the store not
	// in the list, leading to the insertion of a tombstone object which contains
	// the deleted key/value. Note that this value might be stale. If the pod
	// changed labels the new ReplicaSet will not be woken up till the periodic resync.
	if !ok {
		// Not a pod: the only other accepted shape is a tombstone.
		tombstone, ok := obj.(cache.DeletedFinalStateUnknown)
		if !ok {
			glog.Errorf("Couldn't get object from tombstone %+v", obj)
		// Unwrap the pod carried by the tombstone.
		pod, ok = tombstone.Obj.(*api.Pod)
		if !ok {
			glog.Errorf("Tombstone contained object that is not a pod %+v", obj)
	glog.V(4).Infof("Pod %s/%s deleted through %v, timestamp %+v: %+v.", pod.Namespace, pod.Name, utilruntime.GetCaller(), pod.DeletionTimestamp, pod)
	if rs := rsc.getPodReplicaSet(pod); rs != nil {
		rsKey, err := controller.KeyFunc(rs)
		if err != nil {
			glog.Errorf("Couldn't get key for ReplicaSet %#v: %v", rs, err)
		// Record that the deletion of this specific pod (identified by its
		// pod key) has been observed for the owning ReplicaSet.
		rsc.expectations.DeletionObserved(rsKey, controller.PodKey(pod))
func getKey(d *exp.Deployment, t *testing.T) string {
	if key, err := controller.KeyFunc(d); err != nil {
		t.Errorf("Unexpected error getting key for deployment %v: %v", d.Name, err)
		return ""
	} else {
		return key
Example #5
// enqueuePetSet enqueues the given petset in the work queue.
// The queue key is derived from obj with controller.KeyFunc; a failure to
// compute it is logged.
// NOTE(review): the log message below misspells "Couldn't" as "Cound't".
// NOTE(review): this excerpt is truncated -- the statements after the error
// log (including the actual queue insertion) are not visible here.
func (psc *PetSetController) enqueuePetSet(obj interface{}) {
	key, err := controller.KeyFunc(obj)
	if err != nil {
		glog.Errorf("Cound't get key for object %+v: %v", obj, err)
// enqueueResourceQuota computes the key for obj with controller.KeyFunc and
// logs an error if that fails.
// obj could be an *api.ResourceQuota, or a DeletionFinalStateUnknown marker item.
// NOTE(review): this excerpt is truncated -- the statements after the error
// log (presumably a return and the queue insertion) are not visible here.
func (rq *ResourceQuotaController) enqueueResourceQuota(obj interface{}) {
	key, err := controller.KeyFunc(obj)
	if err != nil {
		glog.Errorf("Couldn't get key for object %+v: %v", obj, err)
Example #7
// enqueueDaemonSet computes the key for ds with controller.KeyFunc and logs an
// error if that fails.
// NOTE(review): this excerpt is truncated -- the statements after the error
// log and after the TODO (presumably the queue insertion) are not visible here.
func (dsc *DaemonSetsController) enqueueDaemonSet(ds *extensions.DaemonSet) {
	key, err := controller.KeyFunc(ds)
	if err != nil {
		glog.Errorf("Couldn't get key for object %+v: %v", ds, err)

	// TODO: Handle overlapping controllers better. See comment in ReplicationManager.
Example #8
// addPod handles a pod creation notification for the daemon set controller.
// It logs the pod name, looks up the daemon set associated with the pod via
// getPodDaemonSet and computes that daemon set's key.
// NOTE(review): the type assertion below is unchecked, so an obj that is not
// an *api.Pod panics.
// NOTE(review): this excerpt is truncated -- the statements after the error
// log are not visible here.
func (dsc *DaemonSetsController) addPod(obj interface{}) {
	pod := obj.(*api.Pod)
	glog.V(4).Infof("Pod %s added.", pod.Name)
	if ds := dsc.getPodDaemonSet(pod); ds != nil {
		dsKey, err := controller.KeyFunc(ds)
		if err != nil {
			glog.Errorf("Couldn't get key for object %+v: %v", ds, err)
// pcbKeyFunc computes the key for a given pcb.
// If it's given a key, it simply returns it.
func pcbKeyFunc(obj interface{}) (string, error) {
	if key, ok := obj.(string); ok {
		return key, nil
	p, ok := obj.(*pcb)
	if !ok {
		return "", fmt.Errorf("not a valid pet control block %+v", p)
	if p.parent == nil {
		return "", fmt.Errorf("cannot compute pet control block key without parent pointer %+v", p)
	return controller.KeyFunc(p.parent)
Example #10
// Get returns a previously recorded blocking pet for the given petset.
//
// The store is queried by the petset's key. If nothing is recorded, knownPets
// is scanned and a new pcb (parented to ps) is returned for a pet that is not
// both healthy and not dying; if none is found the result is nil, nil. If a
// blocking pet is recorded, it is re-fetched through u.pc.Get: a pet that no
// longer exists, or that is now healthy and not dying, yields nil; otherwise
// the current pcb is returned. Errors from key computation, the store, or
// u.pc.Get are returned as-is.
// NOTE(review): this excerpt is truncated -- closing braces are not visible.
// NOTE(review): Unlock is deferred below but no matching u.storeLock.Lock()
// call is visible; confirm the lock is taken first.
func (u *unhealthyPetTracker) Get(ps *apps.PetSet, knownPets []*api.Pod) (*pcb, error) {
	defer u.storeLock.Unlock()

	// We "Get" by key but "Add" by object because the store interface doesn't
	// allow us to Get/Add a related obj (eg petset: blocking pet).
	key, err := controller.KeyFunc(ps)
	if err != nil {
		return nil, err
	obj, exists, err := u.store.GetByKey(key)
	if err != nil {
		return nil, err

	hc := defaultPetHealthChecker{}
	// There's no unhealthy pet blocking a scale event, but this might be
	// a controller manager restart. If it is, knownPets can be trusted.
	if !exists {
		for _, p := range knownPets {
			// NOTE(review): as visible here, nothing skips to the next pet
			// after this log line (presumably a `continue` is missing), so a
			// healthy pet would fall through to the return below.
			if hc.isHealthy(p) && !hc.isDying(p) {
				glog.V(4).Infof("Ignoring healthy pet %v for PetSet %v", p.Name, ps.Name)
			glog.Infof("No recorded blocking pet, but found unhealty pet %v for PetSet %v", p.Name, ps.Name)
			return &pcb{pod: p, parent: ps}, nil
		return nil, nil

	// This is a pet that's blocking further creates/deletes of a petset. If it
	// disappears, it's no longer blocking. If it exists, it continues to block
	// till it turns healthy or disappears.
	bp := obj.(*pcb)
	// Re-read the pet's current state rather than trusting the stored copy.
	blockingPet, exists, err := u.pc.Get(bp)
	if err != nil {
		return nil, err
	if !exists {
		glog.V(4).Infof("Clearing blocking pet %v for PetSet %v because it's been deleted", bp.pod.Name, ps.Name)
		return nil, nil
	blockingPetPod := blockingPet.pod
	if hc.isHealthy(blockingPetPod) && !hc.isDying(blockingPetPod) {
		glog.V(4).Infof("Clearing blocking pet %v for PetSet %v because it's healthy", bp.pod.Name, ps.Name)
		blockingPet = nil
	return blockingPet, nil
Example #11
// syncDaemonSet processes the daemon set identified by key.
//
// It returns nil without doing work when the pod store has not synced yet,
// when the daemon set is no longer in the store, or when the daemon set's
// selector is empty (in which case a "SelectingAll" warning event is
// recorded). Errors from the store lookup or from computing the daemon set key
// are logged and returned. The time taken is logged at verbosity 4 on exit.
// NOTE(review): this excerpt is truncated -- closing braces and the body of
// the `if dsNeedsSync` branch are not visible here.
func (dsc *DaemonSetsController) syncDaemonSet(key string) error {
	startTime := time.Now()
	defer func() {
		glog.V(4).Infof("Finished syncing daemon set %q (%v)", key, time.Now().Sub(startTime))

	if !dsc.podStoreSynced() {
		// Sleep so we give the pod reflector goroutine a chance to run.
		// NOTE(review): no sleep or requeue call is visible in this excerpt.
		glog.Infof("Waiting for pods controller to sync, requeuing ds %v", key)
		return nil

	obj, exists, err := dsc.dsStore.Store.GetByKey(key)
	if err != nil {
		glog.Infof("Unable to retrieve ds %v from store: %v", key, err)
		return err
	if !exists {
		glog.V(3).Infof("daemon set has been deleted %v", key)
		return nil
	ds := obj.(*extensions.DaemonSet)

	// Refuse to act on a daemon set whose selector equals the zero-value
	// LabelSelector; only a warning event is emitted for it.
	everything := unversioned.LabelSelector{}
	if reflect.DeepEqual(ds.Spec.Selector, &everything) {
		dsc.eventRecorder.Eventf(ds, api.EventTypeWarning, "SelectingAll", "This daemon set is selecting all pods. A non-empty selector is required.")
		return nil

	// Don't process a daemon set until all its creations and deletions have been processed.
	// For example if daemon set foo asked for 3 new daemon pods in the previous call to manage,
	// then we do not want to call manage on foo until the daemon pods have been created.
	dsKey, err := controller.KeyFunc(ds)
	if err != nil {
		glog.Errorf("Couldn't get key for object %+v: %v", ds, err)
		return err
	dsNeedsSync := dsc.expectations.SatisfiedExpectations(dsKey)
	if dsNeedsSync {

	return nil
Example #12
// enqueueController computes the key for obj with controller.KeyFunc and logs
// an error if that fails.
// obj could be a *batch.Job, or a DeletionFinalStateUnknown marker item.
// NOTE(review): this excerpt is truncated -- the statements after the error
// log and after the TODO (presumably the queue insertion) are not visible here.
func (jm *JobController) enqueueController(obj interface{}) {
	key, err := controller.KeyFunc(obj)
	if err != nil {
		glog.Errorf("Couldn't get key for object %+v: %v", obj, err)

	// TODO: Handle overlapping controllers better. Either disallow them at admission time or
	// deterministically avoid syncing controllers that fight over pods. Currently, we only
	// ensure that the same controller is synced for a given pod. When we periodically relist
	// all controllers there will still be some replica instability. One way to handle this is
	// by querying the store for all controllers that this rc overlaps, as well as all
	// controllers that overlap this rc, and sorting them.
Example #13
// When a pod is created, enqueue the controller that manages it and update its expectations.
// NOTE(review): the type assertion below is unchecked, so an obj that is not
// an *api.Pod panics.
// NOTE(review): this excerpt is truncated -- the body of the DeletionTimestamp
// branch, the statements after the error log, and the enqueue call are not
// visible here.
func (jm *JobController) addPod(obj interface{}) {
	pod := obj.(*api.Pod)
	if pod.DeletionTimestamp != nil {
		// on a restart of the controller manager, it's possible a new pod shows up in a state that
		// is already pending deletion. Prevent the pod from being a creation observation.
	// Look up the job that manages this pod and compute its key.
	if job := jm.getPodJob(pod); job != nil {
		jobKey, err := controller.KeyFunc(job)
		if err != nil {
			glog.Errorf("Couldn't get key for job %#v: %v", job, err)
Example #14
// When a pod is created, enqueue the replica set that manages it and update its expectations.
// NOTE(review): the type assertion below is unchecked, so an obj that is not
// an *api.Pod panics.
// NOTE(review): this excerpt is truncated -- the bodies of the `rs == nil` and
// DeletionTimestamp branches, the statements after the error log, and the
// enqueue call are not visible here.
func (rsc *ReplicaSetController) addPod(obj interface{}) {
	pod := obj.(*api.Pod)
	glog.V(4).Infof("Pod %s created: %+v.", pod.Name, pod)

	// Find the replica set associated with the pod, then compute its key.
	rs := rsc.getPodReplicaSet(pod)
	if rs == nil {
	rsKey, err := controller.KeyFunc(rs)
	if err != nil {
		glog.Errorf("Couldn't get key for replica set %#v: %v", rs, err)
	if pod.DeletionTimestamp != nil {
		// on a restart of the controller manager, it's possible a new pod shows up in a state that
		// is already pending deletion. Prevent the pod from being a creation observation.
Example #15
// manageJob is the core method responsible for managing the number of running
// pods according to what is specified in the job.Spec.
//
// activePods are the job's currently active pods, succeeded is the number of
// pods that have already succeeded, and job is the job being managed. The
// return value is the number of active pods the function accounts for after
// scheduling deletions or creations; it is 0 if the job key cannot be computed.
//
// When there are more active pods than job.Spec.Parallelism, the surplus is
// registered with ExpectDeletions and one goroutine per surplus pod calls
// DeletePod. When there are fewer, the wanted count is derived from
// Parallelism, Completions and succeeded, registered with ExpectCreations, and
// one goroutine per missing pod calls CreatePods.
// NOTE(review): this excerpt is truncated -- closing braces, the pod sort, the
// wait.Add/wait.Wait calls, the goroutine invocations and the expectation
// decrements on failure are not visible here; activeLock is declared but its
// use is not visible either.
func (jm *JobController) manageJob(activePods []*api.Pod, succeeded int32, job *batch.Job) int32 {
	var activeLock sync.Mutex
	active := int32(len(activePods))
	// NOTE(review): dereferences Spec.Parallelism without a nil check;
	// presumably defaulting guarantees it is set -- confirm.
	parallelism := *job.Spec.Parallelism
	jobKey, err := controller.KeyFunc(job)
	if err != nil {
		glog.Errorf("Couldn't get key for job %#v: %v", job, err)
		return 0

	if active > parallelism {
		diff := active - parallelism
		// Expectations are recorded before any pod is deleted.
		jm.expectations.ExpectDeletions(jobKey, int(diff))
		glog.V(4).Infof("Too many pods running job %q, need %d, deleting %d", jobKey, parallelism, diff)
		// Sort the pods in the order such that not-ready < ready, unscheduled
		// < scheduled, and pending < running. This ensures that we delete pods
		// in the earlier stages whenever possible.

		active -= diff
		wait := sync.WaitGroup{}
		for i := int32(0); i < diff; i++ {
			// The index is passed as an argument so each goroutine deletes
			// its own pod.
			go func(ix int32) {
				defer wait.Done()
				if err := jm.podControl.DeletePod(job.Namespace, activePods[ix].Name, job); err != nil {
					defer utilruntime.HandleError(err)
					// Decrement the expected number of deletes because the informer won't observe this deletion

	} else if active < parallelism {
		wantActive := int32(0)
		if job.Spec.Completions == nil {
			// Job does not specify a number of completions.  Therefore, number active
			// should be equal to parallelism, unless the job has seen at least
			// one success, in which case leave whatever is running, running.
			if succeeded > 0 {
				wantActive = active
			} else {
				wantActive = parallelism
		} else {
			// Job specifies a specific number of completions.  Therefore, number
			// active should not ever exceed number of remaining completions.
			wantActive = *job.Spec.Completions - succeeded
			if wantActive > parallelism {
				wantActive = parallelism
		diff := wantActive - active
		// A negative diff means more pods are active than wanted; it is
		// logged and clamped to zero so nothing is created.
		if diff < 0 {
			glog.Errorf("More active than wanted: job %q, want %d, have %d", jobKey, wantActive, active)
			diff = 0
		// Expectations are recorded before any pod is created.
		jm.expectations.ExpectCreations(jobKey, int(diff))
		glog.V(4).Infof("Too few pods running job %q, need %d, creating %d", jobKey, wantActive, diff)

		active += diff
		wait := sync.WaitGroup{}
		for i := int32(0); i < diff; i++ {
			go func() {
				defer wait.Done()
				if err := jm.podControl.CreatePods(job.Namespace, &job.Spec.Template, job); err != nil {
					defer utilruntime.HandleError(err)
					// Decrement the expected number of creates because the informer won't observe this pod

	return active
Example #16
// syncJob will sync the job with the given key if it has had its expectations fulfilled, meaning
// it did not expect to see any more of its pods created or deleted. This function is not meant to be invoked
// concurrently with the same key.
//
// It returns nil without doing work when the pod store has not synced, when
// the job is no longer in the store, or when the job is already finished.
// Errors from the store lookup, key computation and pod listing are logged and
// returned. Otherwise it derives the active/succeeded/failed counts from the
// job's pods, handles an exceeded active deadline or calls manageJob, decides
// whether the job is complete, and calls updateHandler if the status changed.
// NOTE(review): this excerpt is truncated -- closing braces, the wait.Add /
// wait.Wait calls, goroutine invocations and any requeue after a failed update
// are not visible here.
func (jm *JobController) syncJob(key string) error {
	startTime := time.Now()
	defer func() {
		glog.V(4).Infof("Finished syncing job %q (%v)", key, time.Now().Sub(startTime))

	if !jm.podStoreSynced() {
		// Sleep so we give the pod reflector goroutine a chance to run.
		// NOTE(review): no sleep or requeue call is visible in this excerpt.
		glog.V(4).Infof("Waiting for pods controller to sync, requeuing job %v", key)
		return nil

	obj, exists, err := jm.jobStore.Store.GetByKey(key)
	// NOTE(review): exists is tested before err, so a lookup that fails with
	// exists == false is reported as "deleted" and the error is dropped.
	if !exists {
		glog.V(4).Infof("Job has been deleted: %v", key)
		return nil
	if err != nil {
		glog.Errorf("Unable to retrieve job %v from store: %v", key, err)
		return err
	// Work on a copy of the stored object; status fields are mutated below.
	job := *obj.(*batch.Job)

	// Check the expectations of the job before counting active pods, otherwise a new pod can sneak in
	// and update the expectations after we've retrieved active pods from the store. If a new pod enters
	// the store after we've checked the expectation, the job sync is just deferred till the next relist.
	jobKey, err := controller.KeyFunc(&job)
	if err != nil {
		glog.Errorf("Couldn't get key for job %#v: %v", job, err)
		return err
	jobNeedsSync := jm.expectations.SatisfiedExpectations(jobKey)
	// NOTE(review): the conversion error is discarded here.
	selector, _ := unversioned.LabelSelectorAsSelector(job.Spec.Selector)
	podList, err := jm.podStore.Pods(job.Namespace).List(selector)
	if err != nil {
		glog.Errorf("Error getting pods for job %q: %v", key, err)
		return err

	activePods := controller.FilterActivePods(podList.Items)
	active := int32(len(activePods))
	succeeded, failed := getStatus(podList.Items)
	// Remember the condition count so a newly appended condition is detected
	// as a status change further down.
	conditions := len(job.Status.Conditions)
	if job.Status.StartTime == nil {
		now := unversioned.Now()
		job.Status.StartTime = &now
	// if job was finished previously, we don't want to redo the termination
	if isJobFinished(&job) {
		return nil
	if pastActiveDeadline(&job) {
		// TODO: below code should be replaced with pod termination resulting in
		// pod failures, rather than killing pods. Unfortunately none such solution
		// exists ATM. There's an open discussion in the topic in
		// https://github.com/kubernetes/kubernetes/issues/14602 which might give
		// some sort of solution to above problem.
		// kill remaining active pods
		wait := sync.WaitGroup{}
		for i := int32(0); i < active; i++ {
			go func(ix int32) {
				defer wait.Done()
				if err := jm.podControl.DeletePod(job.Namespace, activePods[ix].Name, &job); err != nil {
					defer utilruntime.HandleError(err)
		// update status values accordingly
		failed += active
		active = 0
		job.Status.Conditions = append(job.Status.Conditions, newCondition(batch.JobFailed, "DeadlineExceeded", "Job was active longer than specified deadline"))
		jm.recorder.Event(&job, api.EventTypeNormal, "DeadlineExceeded", "Job was active longer than specified deadline")
	} else {
		if jobNeedsSync {
			active = jm.manageJob(activePods, succeeded, &job)
		completions := succeeded
		complete := false
		if job.Spec.Completions == nil {
			// This type of job is complete when any pod exits with success.
			// Each pod is capable of
			// determining whether or not the entire Job is done.  Subsequent pods are
			// not expected to fail, but if they do, the failure is ignored.  Once any
			// pod succeeds, the controller waits for remaining pods to finish, and
			// then the job is complete.
			if succeeded > 0 && active == 0 {
				complete = true
		} else {
			// Job specifies a number of completions.  This type of job signals
			// success by having that number of successes.  Since we do not
			// start more pods than there are remaining completions, there should
			// not be any remaining active pods once this count is reached.
			if completions >= *job.Spec.Completions {
				complete = true
				if active > 0 {
					jm.recorder.Event(&job, api.EventTypeWarning, "TooManyActivePods", "Too many active pods running after completion count reached")
				if completions > *job.Spec.Completions {
					jm.recorder.Event(&job, api.EventTypeWarning, "TooManySucceededPods", "Too many succeeded pods running after completion count reached")
		if complete {
			job.Status.Conditions = append(job.Status.Conditions, newCondition(batch.JobComplete, "", ""))
			now := unversioned.Now()
			job.Status.CompletionTime = &now

	// no need to update the job if the status hasn't changed since last time
	if job.Status.Active != active || job.Status.Succeeded != succeeded || job.Status.Failed != failed || len(job.Status.Conditions) != conditions {
		job.Status.Active = active
		job.Status.Succeeded = succeeded
		job.Status.Failed = failed

		if err := jm.updateHandler(&job); err != nil {
			glog.Errorf("Failed to update job %v, requeuing.  Error: %v", job.Name, err)
	return nil
Example #17
// manageReplicas checks and updates replicas for the given ReplicaSet.
//
// filteredPods are the pods currently counted for rs. The difference between
// len(filteredPods) and rs.Spec.Replicas decides the action: a negative
// difference creates pods, a positive one deletes pods, and in both directions
// the amount handled per call is capped at rsc.burstReplicas. Expectations are
// registered (ExpectCreations / ExpectDeletions) before the create or delete
// goroutines are started.
// NOTE(review): this excerpt is truncated -- closing braces, the return after
// the key error, the pod sort, the wait.Add/wait.Wait calls, the goroutine
// invocations and the creation-expectation decrement are not visible here.
func (rsc *ReplicaSetController) manageReplicas(filteredPods []*api.Pod, rs *extensions.ReplicaSet) {
	diff := len(filteredPods) - int(rs.Spec.Replicas)
	rsKey, err := controller.KeyFunc(rs)
	if err != nil {
		glog.Errorf("Couldn't get key for ReplicaSet %#v: %v", rs, err)
	if diff < 0 {
		// Too few pods: work with the positive number of pods to create.
		diff *= -1
		if diff > rsc.burstReplicas {
			diff = rsc.burstReplicas
		// TODO: Track UIDs of creates just like deletes. The problem currently
		// is we'd need to wait on the result of a create to record the pod's
		// UID, which would require locking *across* the create, which will turn
		// into a performance bottleneck. We should generate a UID for the pod
		// beforehand and store it via ExpectCreations.
		rsc.expectations.ExpectCreations(rsKey, diff)
		wait := sync.WaitGroup{}
		glog.V(2).Infof("Too few %q/%q replicas, need %d, creating %d", rs.Namespace, rs.Name, rs.Spec.Replicas, diff)
		for i := 0; i < diff; i++ {
			go func() {
				defer wait.Done()
				if err := rsc.podControl.CreatePods(rs.Namespace, &rs.Spec.Template, rs); err != nil {
					// Decrement the expected number of creates because the informer won't observe this pod
					glog.V(2).Infof("Failed creation, decrementing expectations for replica set %q/%q", rs.Namespace, rs.Name)
	} else if diff > 0 {
		if diff > rsc.burstReplicas {
			diff = rsc.burstReplicas
		glog.V(2).Infof("Too many %q/%q replicas, need %d, deleting %d", rs.Namespace, rs.Name, rs.Spec.Replicas, diff)
		// No need to sort pods if we are about to delete all of them
		if rs.Spec.Replicas != 0 {
			// Sort the pods in the order such that not-ready < ready, unscheduled
			// < scheduled, and pending < running. This ensures that we delete pods
			// in the earlier stages whenever possible.
		// Snapshot the UIDs (ns/name) of the pods we're expecting to see
		// deleted, so we know to record their expectations exactly once either
		// when we see it as an update of the deletion timestamp, or as a delete.
		// Note that if the labels on a pod/rs change in a way that the pod gets
		// orphaned, the rs will only wake up after the expectations have
		// expired even if other pods are deleted.
		deletedPodKeys := []string{}
		for i := 0; i < diff; i++ {
			deletedPodKeys = append(deletedPodKeys, controller.PodKey(filteredPods[i]))
		rsc.expectations.ExpectDeletions(rsKey, deletedPodKeys)
		wait := sync.WaitGroup{}
		for i := 0; i < diff; i++ {
			// The index is passed as an argument so each goroutine deletes
			// its own pod.
			go func(ix int) {
				defer wait.Done()
				if err := rsc.podControl.DeletePod(rs.Namespace, filteredPods[ix].Name, rs); err != nil {
					// Decrement the expected number of deletes because the informer won't observe this deletion
					podKey := controller.PodKey(filteredPods[ix])
					glog.V(2).Infof("Failed to delete %v, decrementing expectations for controller %q/%q", podKey, rs.Namespace, rs.Name)
					rsc.expectations.DeletionObserved(rsKey, podKey)
Example #18
// syncReplicaSet will sync the ReplicaSet with the given key if it has had its expectations fulfilled,
// meaning it did not expect to see any more of its pods created or deleted. This function is not meant to be
// invoked concurrently with the same key.
//
// It returns nil without doing work when the pod store has not synced or the
// ReplicaSet is no longer in the store. Errors from the store lookup, key
// computation, selector conversion and pod listing are logged and returned.
// Otherwise it filters the active pods, calls manageReplicas when expectations
// are satisfied, and updates the replica count via updateReplicaCount.
// NOTE(review): this excerpt is truncated -- closing braces, the increment of
// fullyLabeledReplicasCount and any requeue after a failed status update are
// not visible here.
func (rsc *ReplicaSetController) syncReplicaSet(key string) error {
	startTime := time.Now()
	defer func() {
		glog.V(4).Infof("Finished syncing replica set %q (%v)", key, time.Now().Sub(startTime))

	if !rsc.podStoreSynced() {
		// Sleep so we give the pod reflector goroutine a chance to run.
		// NOTE(review): no sleep or requeue call is visible in this excerpt.
		glog.Infof("Waiting for pods controller to sync, requeuing ReplicaSet %v", key)
		return nil

	obj, exists, err := rsc.rsStore.Store.GetByKey(key)
	// NOTE(review): exists is tested before err, so a lookup that fails with
	// exists == false is reported as "deleted" and the error is dropped.
	if !exists {
		glog.Infof("ReplicaSet has been deleted %v", key)
		return nil
	if err != nil {
		glog.Infof("Unable to retrieve ReplicaSet %v from store: %v", key, err)
		return err
	// Work on a copy of the stored object.
	rs := *obj.(*extensions.ReplicaSet)

	// Check the expectations of the ReplicaSet before counting active pods, otherwise a new pod can sneak
	// in and update the expectations after we've retrieved active pods from the store. If a new pod enters
	// the store after we've checked the expectation, the ReplicaSet sync is just deferred till the next
	// relist.
	rsKey, err := controller.KeyFunc(&rs)
	if err != nil {
		glog.Errorf("Couldn't get key for ReplicaSet %#v: %v", rs, err)
		return err
	rsNeedsSync := rsc.expectations.SatisfiedExpectations(rsKey)
	selector, err := unversioned.LabelSelectorAsSelector(rs.Spec.Selector)
	if err != nil {
		glog.Errorf("Error converting pod selector to selector: %v", err)
		return err
	podList, err := rsc.podStore.Pods(rs.Namespace).List(selector)
	if err != nil {
		glog.Errorf("Error getting pods for ReplicaSet %q: %v", key, err)
		return err

	// TODO: Do this in a single pass, or use an index.
	filteredPods := controller.FilterActivePods(podList.Items)
	if rsNeedsSync {
		rsc.manageReplicas(filteredPods, &rs)

	// Count the number of pods that have labels matching the labels of the pod
	// template of the replicaSet, the matching pods may have more labels than
	// are in the template. Because the label of podTemplateSpec is a superset
	// of the selector of the replicaset, so the possible matching pods must be
	// part of the filteredPods.
	fullyLabeledReplicasCount := 0
	templateLabel := labels.Set(rs.Spec.Template.Labels).AsSelector()
	for _, pod := range filteredPods {
		if templateLabel.Matches(labels.Set(pod.Labels)) {

	// Always updates status as pods come up or die.
	if err := updateReplicaCount(rsc.kubeClient.Extensions().ReplicaSets(rs.Namespace), rs, len(filteredPods), fullyLabeledReplicasCount); err != nil {
		// Multiple things could lead to this update failing. Requeuing the replica set ensures
		// we retry with some fairness.
		glog.V(2).Infof("Failed to update replica count for controller %v/%v; requeuing; error: %v", rs.Namespace, rs.Name, err)
	return nil
Example #19
// manage reconciles the daemon pods of ds against the nodes in the node store.
//
// For every node it decides, via nodeShouldRunDaemonPod and the node-to-pods
// mapping from getNodesToDaemonPods, whether a daemon pod must be created or
// which pods must be deleted. The number of creations and deletions handled in
// one call is capped at dsc.burstReplicas each, the resulting counts are
// registered with SetExpectations, and one goroutine per pod then calls
// CreatePodsOnNode or DeletePod.
// NOTE(review): this excerpt is truncated -- closing braces, the statements
// after each error log, the daemon pod sort, the WaitGroup Add/Wait calls, the
// goroutine invocations and the expectation decrements on failure are not
// visible here.
func (dsc *DaemonSetsController) manage(ds *extensions.DaemonSet) {
	// Find out which nodes are running the daemon pods selected by ds.
	nodeToDaemonPods, err := dsc.getNodesToDaemonPods(ds)
	if err != nil {
		glog.Errorf("Error getting node to daemon pod mapping for daemon set %+v: %v", ds, err)

	// For each node, if the node is running the daemon pod but isn't supposed to, kill the daemon
	// pod. If the node is supposed to run the daemon pod, but isn't, create the daemon pod on the node.
	nodeList, err := dsc.nodeStore.List()
	if err != nil {
		glog.Errorf("Couldn't get list of nodes when syncing daemon set %+v: %v", ds, err)
	var nodesNeedingDaemonPods, podsToDelete []string
	for _, node := range nodeList.Items {
		shouldRun := dsc.nodeShouldRunDaemonPod(&node, ds)

		// isRunning is true when the mapping has an entry for this node.
		daemonPods, isRunning := nodeToDaemonPods[node.Name]

		switch {
		case shouldRun && !isRunning:
			// If daemon pod is supposed to be running on node, but isn't, create daemon pod.
			nodesNeedingDaemonPods = append(nodesNeedingDaemonPods, node.Name)
		case shouldRun && len(daemonPods) > 1:
			// If daemon pod is supposed to be running on node, but more than 1 daemon pod is running, delete the excess daemon pods.
			// Sort the daemon pods by creation time, so the oldest is preserved.
			// The loop starts at index 1, so daemonPods[0] is the one kept.
			for i := 1; i < len(daemonPods); i++ {
				podsToDelete = append(podsToDelete, daemonPods[i].Name)
		case !shouldRun && isRunning:
			// If daemon pod isn't supposed to run on node, but it is, delete all daemon pods on node.
			for i := range daemonPods {
				podsToDelete = append(podsToDelete, daemonPods[i].Name)

	// We need to set expectations before creating/deleting pods to avoid race conditions.
	dsKey, err := controller.KeyFunc(ds)
	if err != nil {
		glog.Errorf("Couldn't get key for object %+v: %v", ds, err)

	createDiff := len(nodesNeedingDaemonPods)
	deleteDiff := len(podsToDelete)

	// Cap the amount of work done in a single call.
	if createDiff > dsc.burstReplicas {
		createDiff = dsc.burstReplicas
	if deleteDiff > dsc.burstReplicas {
		deleteDiff = dsc.burstReplicas

	dsc.expectations.SetExpectations(dsKey, createDiff, deleteDiff)

	glog.V(4).Infof("Nodes needing daemon pods for daemon set %s: %+v, creating %d", ds.Name, nodesNeedingDaemonPods, createDiff)
	createWait := sync.WaitGroup{}
	for i := 0; i < createDiff; i++ {
		// The index is passed as an argument so each goroutine targets its
		// own node.
		go func(ix int) {
			defer createWait.Done()
			if err := dsc.podControl.CreatePodsOnNode(nodesNeedingDaemonPods[ix], ds.Namespace, &ds.Spec.Template, ds); err != nil {
				glog.V(2).Infof("Failed creation, decrementing expectations for set %q/%q", ds.Namespace, ds.Name)

	glog.V(4).Infof("Pods to delete for daemon set %s: %+v, deleting %d", ds.Name, podsToDelete, deleteDiff)
	deleteWait := sync.WaitGroup{}
	for i := 0; i < deleteDiff; i++ {
		// The index is passed as an argument so each goroutine deletes its
		// own pod.
		go func(ix int) {
			defer deleteWait.Done()
			if err := dsc.podControl.DeletePod(ds.Namespace, podsToDelete[ix], ds); err != nil {
				glog.V(2).Infof("Failed deletion, decrementing expectations for set %q/%q", ds.Namespace, ds.Name)