// When a pod is deleted, enqueue the job that manages the pod and update its expectations.
// obj could be an *api.Pod, or a cache.DeletedFinalStateUnknown marker item.
func (jm *JobController) deletePod(obj interface{}) {
	pod, ok := obj.(*api.Pod)

	// When a delete is dropped, the relist will notice a pod in the store not
	// in the list, leading to the insertion of a tombstone object which contains
	// the deleted key/value. Note that this value might be stale. If the pod
	// changed labels the new job will not be woken up till the periodic resync.
	if !ok {
		tombstone, ok := obj.(cache.DeletedFinalStateUnknown)
		if !ok {
			glog.Errorf("Couldn't get object from tombstone %+v", obj)
			return
		}
		pod, ok = tombstone.Obj.(*api.Pod)
		if !ok {
			glog.Errorf("Tombstone contained object that is not a pod %+v", obj)
			return
		}
	}
	if job := jm.getPodJob(pod); job != nil {
		jobKey, err := controller.KeyFunc(job)
		if err != nil {
			glog.Errorf("Couldn't get key for job %#v: %v", job, err)
			return
		}
		jm.expectations.DeletionObserved(jobKey)
		jm.enqueueController(job)
	}
}
func (dsc *DaemonSetsController) deletePod(obj interface{}) {
	pod, ok := obj.(*api.Pod)
	// When a delete is dropped, the relist will notice a pod in the store not
	// in the list, leading to the insertion of a tombstone object which contains
	// the deleted key/value. Note that this value might be stale. If the pod
	// changed labels the new daemonset will not be woken up till the periodic
	// resync.
	if !ok {
		tombstone, ok := obj.(cache.DeletedFinalStateUnknown)
		if !ok {
			glog.Errorf("Couldn't get object from tombstone %+v", obj)
			return
		}
		pod, ok = tombstone.Obj.(*api.Pod)
		if !ok {
			glog.Errorf("Tombstone contained object that is not a pod %+v", obj)
			return
		}
	}
	glog.V(4).Infof("Pod %s deleted.", pod.Name)
	if ds := dsc.getPodDaemonSet(pod); ds != nil {
		dsKey, err := controller.KeyFunc(ds)
		if err != nil {
			glog.Errorf("Couldn't get key for object %+v: %v", ds, err)
			return
		}
		dsc.expectations.DeletionObserved(dsKey)
		dsc.enqueueDaemonSet(ds)
	}
}
// When a pod is deleted, enqueue the replica set that manages the pod and update its expectations.
// obj could be an *api.Pod, or a cache.DeletedFinalStateUnknown marker item.
func (rsc *ReplicaSetController) deletePod(obj interface{}) {
	pod, ok := obj.(*api.Pod)
	// When a delete is dropped, the relist will notice a pod in the store not
	// in the list, leading to the insertion of a tombstone object which contains
	// the deleted key/value. Note that this value might be stale. If the pod
	// changed labels the new ReplicaSet will not be woken up till the periodic resync.
	if !ok {
		tombstone, ok := obj.(cache.DeletedFinalStateUnknown)
		if !ok {
			glog.Errorf("Couldn't get object from tombstone %+v", obj)
			return
		}
		pod, ok = tombstone.Obj.(*api.Pod)
		if !ok {
			glog.Errorf("Tombstone contained object that is not a pod %+v", obj)
			return
		}
	}
	glog.V(4).Infof("Pod %s/%s deleted through %v, timestamp %+v: %+v.",
		pod.Namespace, pod.Name, utilruntime.GetCaller(), pod.DeletionTimestamp, pod)
	if rs := rsc.getPodReplicaSet(pod); rs != nil {
		rsKey, err := controller.KeyFunc(rs)
		if err != nil {
			glog.Errorf("Couldn't get key for ReplicaSet %#v: %v", rs, err)
			return
		}
		rsc.expectations.DeletionObserved(rsKey, controller.PodKey(pod))
		rsc.enqueueReplicaSet(rs)
	}
}
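All three deletePod handlers above unwrap the same tombstone before looking up their owner. As a minimal sketch, that shared pattern could be factored into a hypothetical helper (podFromDeleteEvent is not part of the original code; it reuses the api, cache, and glog identifiers the handlers already depend on):

// podFromDeleteEvent is a hypothetical helper, shown only to illustrate the
// tombstone-unwrapping pattern repeated in the deletePod handlers above.
func podFromDeleteEvent(obj interface{}) (*api.Pod, bool) {
	if pod, ok := obj.(*api.Pod); ok {
		return pod, true
	}
	tombstone, ok := obj.(cache.DeletedFinalStateUnknown)
	if !ok {
		glog.Errorf("Couldn't get object from tombstone %+v", obj)
		return nil, false
	}
	pod, ok := tombstone.Obj.(*api.Pod)
	if !ok {
		glog.Errorf("Tombstone contained object that is not a pod %+v", obj)
		return nil, false
	}
	return pod, true
}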
func getKey(d *exp.Deployment, t *testing.T) string {
	if key, err := controller.KeyFunc(d); err != nil {
		t.Errorf("Unexpected error getting key for deployment %v: %v", d.Name, err)
		return ""
	} else {
		return key
	}
}
// enqueuePetSet enqueues the given petset in the work queue.
func (psc *PetSetController) enqueuePetSet(obj interface{}) {
	key, err := controller.KeyFunc(obj)
	if err != nil {
		glog.Errorf("Couldn't get key for object %+v: %v", obj, err)
		return
	}
	psc.queue.Add(key)
}
// obj could be an *api.ResourceQuota, or a cache.DeletedFinalStateUnknown marker item.
func (rq *ResourceQuotaController) enqueueResourceQuota(obj interface{}) {
	key, err := controller.KeyFunc(obj)
	if err != nil {
		glog.Errorf("Couldn't get key for object %+v: %v", obj, err)
		return
	}
	rq.queue.Add(key)
}
func (dsc *DaemonSetsController) enqueueDaemonSet(ds *extensions.DaemonSet) {
	key, err := controller.KeyFunc(ds)
	if err != nil {
		glog.Errorf("Couldn't get key for object %+v: %v", ds, err)
		return
	}

	// TODO: Handle overlapping controllers better. See comment in ReplicationManager.
	dsc.queue.Add(key)
}
func (dsc *DaemonSetsController) addPod(obj interface{}) {
	pod := obj.(*api.Pod)
	glog.V(4).Infof("Pod %s added.", pod.Name)
	if ds := dsc.getPodDaemonSet(pod); ds != nil {
		dsKey, err := controller.KeyFunc(ds)
		if err != nil {
			glog.Errorf("Couldn't get key for object %+v: %v", ds, err)
			return
		}
		dsc.expectations.CreationObserved(dsKey)
		dsc.enqueueDaemonSet(ds)
	}
}
// pcbKeyFunc computes the key for a given pcb.
// If it's given a key, it simply returns it.
func pcbKeyFunc(obj interface{}) (string, error) {
	if key, ok := obj.(string); ok {
		return key, nil
	}
	p, ok := obj.(*pcb)
	if !ok {
		return "", fmt.Errorf("not a valid pet control block %+v", obj)
	}
	if p.parent == nil {
		return "", fmt.Errorf("cannot compute pet control block key without parent pointer %+v", p)
	}
	return controller.KeyFunc(p.parent)
}
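For orientation, controller.KeyFunc is what every snippet here uses to turn an object into the string key that the stores and work queues operate on. A simplified, self-contained stand-in (not the real implementation) showing the "namespace/name" shape of those keys:

// exampleKey is a hypothetical stand-in for controller.KeyFunc, shown only to
// illustrate the key format the queues and expectation stores use.
func exampleKey(namespace, name string) string {
	if namespace == "" {
		// cluster-scoped objects are keyed by name alone
		return name
	}
	return namespace + "/" + name
}

// exampleKey("default", "frontend") == "default/frontend"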
// Get returns a previously recorded blocking pet for the given petset.
func (u *unhealthyPetTracker) Get(ps *apps.PetSet, knownPets []*api.Pod) (*pcb, error) {
	u.storeLock.Lock()
	defer u.storeLock.Unlock()

	// We "Get" by key but "Add" by object because the store interface doesn't
	// allow us to Get/Add a related obj (eg petset: blocking pet).
	key, err := controller.KeyFunc(ps)
	if err != nil {
		return nil, err
	}
	obj, exists, err := u.store.GetByKey(key)
	if err != nil {
		return nil, err
	}

	hc := defaultPetHealthChecker{}
	// There's no unhealthy pet blocking a scale event, but this might be
	// a controller manager restart. If it is, knownPets can be trusted.
	if !exists {
		for _, p := range knownPets {
			if hc.isHealthy(p) && !hc.isDying(p) {
				glog.V(4).Infof("Ignoring healthy pet %v for PetSet %v", p.Name, ps.Name)
				continue
			}
			glog.Infof("No recorded blocking pet, but found unhealthy pet %v for PetSet %v", p.Name, ps.Name)
			return &pcb{pod: p, parent: ps}, nil
		}
		return nil, nil
	}

	// This is a pet that's blocking further creates/deletes of a petset. If it
	// disappears, it's no longer blocking. If it exists, it continues to block
	// till it turns healthy or disappears.
	bp := obj.(*pcb)
	blockingPet, exists, err := u.pc.Get(bp)
	if err != nil {
		return nil, err
	}
	if !exists {
		glog.V(4).Infof("Clearing blocking pet %v for PetSet %v because it's been deleted", bp.pod.Name, ps.Name)
		return nil, nil
	}
	blockingPetPod := blockingPet.pod
	if hc.isHealthy(blockingPetPod) && !hc.isDying(blockingPetPod) {
		glog.V(4).Infof("Clearing blocking pet %v for PetSet %v because it's healthy", bp.pod.Name, ps.Name)
		u.store.Delete(blockingPet)
		blockingPet = nil
	}
	return blockingPet, nil
}
func (dsc *DaemonSetsController) syncDaemonSet(key string) error {
	startTime := time.Now()
	defer func() {
		glog.V(4).Infof("Finished syncing daemon set %q (%v)", key, time.Now().Sub(startTime))
	}()

	if !dsc.podStoreSynced() {
		// Sleep so we give the pod reflector goroutine a chance to run.
		time.Sleep(PodStoreSyncedPollPeriod)
		glog.Infof("Waiting for pods controller to sync, requeuing ds %v", key)
		dsc.queue.Add(key)
		return nil
	}

	obj, exists, err := dsc.dsStore.Store.GetByKey(key)
	if err != nil {
		glog.Infof("Unable to retrieve ds %v from store: %v", key, err)
		dsc.queue.Add(key)
		return err
	}
	if !exists {
		glog.V(3).Infof("daemon set has been deleted %v", key)
		dsc.expectations.DeleteExpectations(key)
		return nil
	}
	ds := obj.(*extensions.DaemonSet)

	everything := unversioned.LabelSelector{}
	if reflect.DeepEqual(ds.Spec.Selector, &everything) {
		dsc.eventRecorder.Eventf(ds, api.EventTypeWarning, "SelectingAll", "This daemon set is selecting all pods. A non-empty selector is required.")
		return nil
	}

	// Don't process a daemon set until all its creations and deletions have been processed.
	// For example if daemon set foo asked for 3 new daemon pods in the previous call to manage,
	// then we do not want to call manage on foo until the daemon pods have been created.
	dsKey, err := controller.KeyFunc(ds)
	if err != nil {
		glog.Errorf("Couldn't get key for object %+v: %v", ds, err)
		return err
	}
	dsNeedsSync := dsc.expectations.SatisfiedExpectations(dsKey)
	if dsNeedsSync {
		dsc.manage(ds)
	}

	dsc.updateDaemonSetStatus(ds)
	return nil
}
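The SatisfiedExpectations, CreationObserved, and DeletionObserved calls scattered through these controllers all talk to the same bookkeeping structure. Below is a deliberately reduced, hypothetical sketch of that idea, assuming only the behavior visible in these snippets; the real controller.ControllerExpectations also handles expectation TTLs and, for ReplicaSets, per-pod UID tracking. It needs only sync.Mutex from the standard library.

// simpleExpectations is a reduced stand-in for the expectations store used
// above: counters of creates/deletes the controller still expects to observe.
type simpleExpectations struct {
	mu   sync.Mutex
	adds map[string]int
	dels map[string]int
}

func newSimpleExpectations() *simpleExpectations {
	return &simpleExpectations{adds: map[string]int{}, dels: map[string]int{}}
}

func (e *simpleExpectations) bump(m map[string]int, key string, delta int) {
	e.mu.Lock()
	defer e.mu.Unlock()
	m[key] += delta
}

// ExpectCreations/ExpectDeletions record how many informer events the
// controller is about to cause and therefore expects to observe.
func (e *simpleExpectations) ExpectCreations(key string, n int) { e.bump(e.adds, key, n) }
func (e *simpleExpectations) ExpectDeletions(key string, n int) { e.bump(e.dels, key, n) }

// CreationObserved/DeletionObserved are called from the event handlers (and
// from the error paths above) to drain those counts.
func (e *simpleExpectations) CreationObserved(key string) { e.bump(e.adds, key, -1) }
func (e *simpleExpectations) DeletionObserved(key string) { e.bump(e.dels, key, -1) }

// SatisfiedExpectations reports whether a sync may proceed: every create and
// delete the controller issued has been observed back from the informer.
func (e *simpleExpectations) SatisfiedExpectations(key string) bool {
	e.mu.Lock()
	defer e.mu.Unlock()
	return e.adds[key] <= 0 && e.dels[key] <= 0
}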
// obj could be a *batch.Job, or a cache.DeletedFinalStateUnknown marker item.
func (jm *JobController) enqueueController(obj interface{}) {
	key, err := controller.KeyFunc(obj)
	if err != nil {
		glog.Errorf("Couldn't get key for object %+v: %v", obj, err)
		return
	}

	// TODO: Handle overlapping controllers better. Either disallow them at admission time or
	// deterministically avoid syncing controllers that fight over pods. Currently, we only
	// ensure that the same controller is synced for a given pod. When we periodically relist
	// all controllers there will still be some replica instability. One way to handle this is
	// by querying the store for all controllers that this rc overlaps, as well as all
	// controllers that overlap this rc, and sorting them.
	jm.queue.Add(key)
}
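The various enqueue helpers (enqueuePetSet, enqueueResourceQuota, enqueueDaemonSet, enqueueController) all reduce to "compute the key, add it to the queue". The work queue they feed deduplicates keys, so repeated enqueues of the same object before it is processed collapse into a single sync. A much-simplified, hypothetical sketch of that dedup behavior (dedupQueue is illustrative, not the real workqueue package; it uses only the standard library sync package):

// dedupQueue collapses duplicate Adds of the same key into one pending item.
type dedupQueue struct {
	mu      sync.Mutex
	pending map[string]bool
	order   []string
}

func newDedupQueue() *dedupQueue {
	return &dedupQueue{pending: map[string]bool{}}
}

func (q *dedupQueue) Add(key string) {
	q.mu.Lock()
	defer q.mu.Unlock()
	if q.pending[key] {
		return // already queued; enqueueing repeatedly is cheap
	}
	q.pending[key] = true
	q.order = append(q.order, key)
}

func (q *dedupQueue) Get() (string, bool) {
	q.mu.Lock()
	defer q.mu.Unlock()
	if len(q.order) == 0 {
		return "", false
	}
	key := q.order[0]
	q.order = q.order[1:]
	delete(q.pending, key)
	return key, true
}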
// When a pod is created, enqueue the controller that manages it and update its expectations.
func (jm *JobController) addPod(obj interface{}) {
	pod := obj.(*api.Pod)
	if pod.DeletionTimestamp != nil {
		// On a restart of the controller manager, it's possible a new pod shows up in a state that
		// is already pending deletion. Prevent the pod from being a creation observation.
		jm.deletePod(pod)
		return
	}
	if job := jm.getPodJob(pod); job != nil {
		jobKey, err := controller.KeyFunc(job)
		if err != nil {
			glog.Errorf("Couldn't get key for job %#v: %v", job, err)
			return
		}
		jm.expectations.CreationObserved(jobKey)
		jm.enqueueController(job)
	}
}
// When a pod is created, enqueue the replica set that manages it and update its expectations.
func (rsc *ReplicaSetController) addPod(obj interface{}) {
	pod := obj.(*api.Pod)
	glog.V(4).Infof("Pod %s created: %+v.", pod.Name, pod)

	rs := rsc.getPodReplicaSet(pod)
	if rs == nil {
		return
	}
	rsKey, err := controller.KeyFunc(rs)
	if err != nil {
		glog.Errorf("Couldn't get key for replica set %#v: %v", rs, err)
		return
	}
	if pod.DeletionTimestamp != nil {
		// On a restart of the controller manager, it's possible a new pod shows up in a state that
		// is already pending deletion. Prevent the pod from being a creation observation.
		rsc.deletePod(pod)
		return
	}
	rsc.expectations.CreationObserved(rsKey)
	rsc.enqueueReplicaSet(rs)
}
// manageJob is the core method responsible for managing the number of running
// pods according to what is specified in the job.Spec.
func (jm *JobController) manageJob(activePods []*api.Pod, succeeded int32, job *batch.Job) int32 {
	var activeLock sync.Mutex
	active := int32(len(activePods))
	parallelism := *job.Spec.Parallelism
	jobKey, err := controller.KeyFunc(job)
	if err != nil {
		glog.Errorf("Couldn't get key for job %#v: %v", job, err)
		return 0
	}

	if active > parallelism {
		diff := active - parallelism
		jm.expectations.ExpectDeletions(jobKey, int(diff))
		glog.V(4).Infof("Too many pods running job %q, need %d, deleting %d", jobKey, parallelism, diff)
		// Sort the pods in the order such that not-ready < ready, unscheduled
		// < scheduled, and pending < running. This ensures that we delete pods
		// in the earlier stages whenever possible.
		sort.Sort(controller.ActivePods(activePods))

		active -= diff
		wait := sync.WaitGroup{}
		wait.Add(int(diff))
		for i := int32(0); i < diff; i++ {
			go func(ix int32) {
				defer wait.Done()
				if err := jm.podControl.DeletePod(job.Namespace, activePods[ix].Name, job); err != nil {
					defer utilruntime.HandleError(err)
					// Decrement the expected number of deletes because the informer won't observe this deletion
					jm.expectations.DeletionObserved(jobKey)
					activeLock.Lock()
					active++
					activeLock.Unlock()
				}
			}(i)
		}
		wait.Wait()
	} else if active < parallelism {
		wantActive := int32(0)
		if job.Spec.Completions == nil {
			// Job does not specify a number of completions. Therefore, the number
			// active should be equal to parallelism, unless the job has seen at
			// least one success, in which case leave whatever is running, running.
			if succeeded > 0 {
				wantActive = active
			} else {
				wantActive = parallelism
			}
		} else {
			// Job specifies a specific number of completions. Therefore, the number
			// active should not ever exceed the number of remaining completions.
			wantActive = *job.Spec.Completions - succeeded
			if wantActive > parallelism {
				wantActive = parallelism
			}
		}
		diff := wantActive - active
		if diff < 0 {
			glog.Errorf("More active than wanted: job %q, want %d, have %d", jobKey, wantActive, active)
			diff = 0
		}
		jm.expectations.ExpectCreations(jobKey, int(diff))
		glog.V(4).Infof("Too few pods running job %q, need %d, creating %d", jobKey, wantActive, diff)

		active += diff
		wait := sync.WaitGroup{}
		wait.Add(int(diff))
		for i := int32(0); i < diff; i++ {
			go func() {
				defer wait.Done()
				if err := jm.podControl.CreatePods(job.Namespace, &job.Spec.Template, job); err != nil {
					defer utilruntime.HandleError(err)
					// Decrement the expected number of creates because the informer won't observe this pod
					jm.expectations.CreationObserved(jobKey)
					activeLock.Lock()
					active--
					activeLock.Unlock()
				}
			}()
		}
		wait.Wait()
	}

	return active
}
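A small worked example of the scale-up arithmetic in manageJob when Completions is set, using hypothetical numbers in a standalone program:

package main

import "fmt"

func main() {
	// Hypothetical numbers for the scale-up branch of manageJob:
	// Completions=5, Parallelism=3, 4 pods already succeeded, 0 currently active.
	var completions, parallelism, succeeded, active int32 = 5, 3, 4, 0

	wantActive := completions - succeeded // 1 completion still needed
	if wantActive > parallelism {
		wantActive = parallelism
	}
	diff := wantActive - active
	fmt.Printf("wantActive=%d diff=%d\n", wantActive, diff) // wantActive=1 diff=1
}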
// syncJob will sync the job with the given key if it has had its expectations fulfilled, meaning
// it did not expect to see any more of its pods created or deleted. This function is not meant to be invoked
// concurrently with the same key.
func (jm *JobController) syncJob(key string) error {
	startTime := time.Now()
	defer func() {
		glog.V(4).Infof("Finished syncing job %q (%v)", key, time.Now().Sub(startTime))
	}()

	if !jm.podStoreSynced() {
		// Sleep so we give the pod reflector goroutine a chance to run.
		time.Sleep(replicationcontroller.PodStoreSyncedPollPeriod)
		glog.V(4).Infof("Waiting for pods controller to sync, requeuing job %v", key)
		jm.queue.Add(key)
		return nil
	}

	obj, exists, err := jm.jobStore.Store.GetByKey(key)
	if !exists {
		glog.V(4).Infof("Job has been deleted: %v", key)
		jm.expectations.DeleteExpectations(key)
		return nil
	}
	if err != nil {
		glog.Errorf("Unable to retrieve job %v from store: %v", key, err)
		jm.queue.Add(key)
		return err
	}
	job := *obj.(*batch.Job)

	// Check the expectations of the job before counting active pods, otherwise a new pod can sneak in
	// and update the expectations after we've retrieved active pods from the store. If a new pod enters
	// the store after we've checked the expectation, the job sync is just deferred till the next relist.
	jobKey, err := controller.KeyFunc(&job)
	if err != nil {
		glog.Errorf("Couldn't get key for job %#v: %v", job, err)
		return err
	}
	jobNeedsSync := jm.expectations.SatisfiedExpectations(jobKey)
	selector, _ := unversioned.LabelSelectorAsSelector(job.Spec.Selector)
	podList, err := jm.podStore.Pods(job.Namespace).List(selector)
	if err != nil {
		glog.Errorf("Error getting pods for job %q: %v", key, err)
		jm.queue.Add(key)
		return err
	}

	activePods := controller.FilterActivePods(podList.Items)
	active := int32(len(activePods))
	succeeded, failed := getStatus(podList.Items)
	conditions := len(job.Status.Conditions)
	if job.Status.StartTime == nil {
		now := unversioned.Now()
		job.Status.StartTime = &now
	}
	// if job was finished previously, we don't want to redo the termination
	if isJobFinished(&job) {
		return nil
	}
	if pastActiveDeadline(&job) {
		// TODO: below code should be replaced with pod termination resulting in
		// pod failures, rather than killing pods. Unfortunately no such solution
		// exists at the moment. There's an open discussion on the topic in
		// https://github.com/kubernetes/kubernetes/issues/14602 which might give
		// some sort of solution to the above problem.
		// kill remaining active pods
		wait := sync.WaitGroup{}
		wait.Add(int(active))
		for i := int32(0); i < active; i++ {
			go func(ix int32) {
				defer wait.Done()
				if err := jm.podControl.DeletePod(job.Namespace, activePods[ix].Name, &job); err != nil {
					defer utilruntime.HandleError(err)
				}
			}(i)
		}
		wait.Wait()
		// update status values accordingly
		failed += active
		active = 0
		job.Status.Conditions = append(job.Status.Conditions, newCondition(batch.JobFailed, "DeadlineExceeded", "Job was active longer than specified deadline"))
		jm.recorder.Event(&job, api.EventTypeNormal, "DeadlineExceeded", "Job was active longer than specified deadline")
	} else {
		if jobNeedsSync {
			active = jm.manageJob(activePods, succeeded, &job)
		}
		completions := succeeded
		complete := false
		if job.Spec.Completions == nil {
			// This type of job is complete when any pod exits with success.
			// Each pod is capable of determining whether or not the entire Job
			// is done. Subsequent pods are not expected to fail, but if they do,
			// the failure is ignored. Once any pod succeeds, the controller waits
			// for remaining pods to finish, and then the job is complete.
			if succeeded > 0 && active == 0 {
				complete = true
			}
		} else {
			// Job specifies a number of completions. This type of job signals
			// success by having that number of successes. Since we do not
			// start more pods than there are remaining completions, there should
			// not be any remaining active pods once this count is reached.
			if completions >= *job.Spec.Completions {
				complete = true
				if active > 0 {
					jm.recorder.Event(&job, api.EventTypeWarning, "TooManyActivePods", "Too many active pods running after completion count reached")
				}
				if completions > *job.Spec.Completions {
					jm.recorder.Event(&job, api.EventTypeWarning, "TooManySucceededPods", "Too many succeeded pods running after completion count reached")
				}
			}
		}
		if complete {
			job.Status.Conditions = append(job.Status.Conditions, newCondition(batch.JobComplete, "", ""))
			now := unversioned.Now()
			job.Status.CompletionTime = &now
		}
	}

	// no need to update the job if the status hasn't changed since last time
	if job.Status.Active != active || job.Status.Succeeded != succeeded || job.Status.Failed != failed || len(job.Status.Conditions) != conditions {
		job.Status.Active = active
		job.Status.Succeeded = succeeded
		job.Status.Failed = failed

		if err := jm.updateHandler(&job); err != nil {
			glog.Errorf("Failed to update job %v, requeuing. Error: %v", job.Name, err)
			jm.enqueueController(&job)
		}
	}
	return nil
}
// manageReplicas checks and updates replicas for the given ReplicaSet.
func (rsc *ReplicaSetController) manageReplicas(filteredPods []*api.Pod, rs *extensions.ReplicaSet) {
	diff := len(filteredPods) - int(rs.Spec.Replicas)
	rsKey, err := controller.KeyFunc(rs)
	if err != nil {
		glog.Errorf("Couldn't get key for ReplicaSet %#v: %v", rs, err)
		return
	}
	if diff < 0 {
		diff *= -1
		if diff > rsc.burstReplicas {
			diff = rsc.burstReplicas
		}
		// TODO: Track UIDs of creates just like deletes. The problem currently
		// is we'd need to wait on the result of a create to record the pod's
		// UID, which would require locking *across* the create, which will turn
		// into a performance bottleneck. We should generate a UID for the pod
		// beforehand and store it via ExpectCreations.
		rsc.expectations.ExpectCreations(rsKey, diff)
		wait := sync.WaitGroup{}
		wait.Add(diff)
		glog.V(2).Infof("Too few %q/%q replicas, need %d, creating %d", rs.Namespace, rs.Name, rs.Spec.Replicas, diff)
		for i := 0; i < diff; i++ {
			go func() {
				defer wait.Done()
				if err := rsc.podControl.CreatePods(rs.Namespace, &rs.Spec.Template, rs); err != nil {
					// Decrement the expected number of creates because the informer won't observe this pod
					glog.V(2).Infof("Failed creation, decrementing expectations for replica set %q/%q", rs.Namespace, rs.Name)
					rsc.expectations.CreationObserved(rsKey)
					utilruntime.HandleError(err)
				}
			}()
		}
		wait.Wait()
	} else if diff > 0 {
		if diff > rsc.burstReplicas {
			diff = rsc.burstReplicas
		}
		glog.V(2).Infof("Too many %q/%q replicas, need %d, deleting %d", rs.Namespace, rs.Name, rs.Spec.Replicas, diff)
		// No need to sort pods if we are about to delete all of them
		if rs.Spec.Replicas != 0 {
			// Sort the pods in the order such that not-ready < ready, unscheduled
			// < scheduled, and pending < running. This ensures that we delete pods
			// in the earlier stages whenever possible.
			sort.Sort(controller.ActivePods(filteredPods))
		}
		// Snapshot the UIDs (ns/name) of the pods we're expecting to see
		// deleted, so we know to record their expectations exactly once either
		// when we see it as an update of the deletion timestamp, or as a delete.
		// Note that if the labels on a pod/rs change in a way that the pod gets
		// orphaned, the rs will only wake up after the expectations have
		// expired even if other pods are deleted.
		deletedPodKeys := []string{}
		for i := 0; i < diff; i++ {
			deletedPodKeys = append(deletedPodKeys, controller.PodKey(filteredPods[i]))
		}
		rsc.expectations.ExpectDeletions(rsKey, deletedPodKeys)
		wait := sync.WaitGroup{}
		wait.Add(diff)
		for i := 0; i < diff; i++ {
			go func(ix int) {
				defer wait.Done()
				if err := rsc.podControl.DeletePod(rs.Namespace, filteredPods[ix].Name, rs); err != nil {
					// Decrement the expected number of deletes because the informer won't observe this deletion
					podKey := controller.PodKey(filteredPods[ix])
					glog.V(2).Infof("Failed to delete %v, decrementing expectations for controller %q/%q", podKey, rs.Namespace, rs.Name)
					rsc.expectations.DeletionObserved(rsKey, podKey)
					utilruntime.HandleError(err)
				}
			}(i)
		}
		wait.Wait()
	}
}
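A worked example, with hypothetical numbers in a standalone program, of the diff computation and burstReplicas clamp at the top of manageReplicas:

package main

import "fmt"

func main() {
	// Hypothetical numbers: the ReplicaSet wants 10 replicas, 2 matching pods
	// exist, and burstReplicas caps the work done in one sync at 5.
	specReplicas, filteredPods, burstReplicas := 10, 2, 5

	diff := filteredPods - specReplicas // -8: too few pods
	if diff < 0 {
		diff *= -1
		if diff > burstReplicas {
			diff = burstReplicas
		}
	}
	fmt.Println(diff) // 5 pods are created this sync; the rest on later syncs
}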
// syncReplicaSet will sync the ReplicaSet with the given key if it has had its expectations fulfilled,
// meaning it did not expect to see any more of its pods created or deleted. This function is not meant to be
// invoked concurrently with the same key.
func (rsc *ReplicaSetController) syncReplicaSet(key string) error {
	startTime := time.Now()
	defer func() {
		glog.V(4).Infof("Finished syncing replica set %q (%v)", key, time.Now().Sub(startTime))
	}()

	if !rsc.podStoreSynced() {
		// Sleep so we give the pod reflector goroutine a chance to run.
		time.Sleep(PodStoreSyncedPollPeriod)
		glog.Infof("Waiting for pods controller to sync, requeuing ReplicaSet %v", key)
		rsc.queue.Add(key)
		return nil
	}

	obj, exists, err := rsc.rsStore.Store.GetByKey(key)
	if !exists {
		glog.Infof("ReplicaSet has been deleted %v", key)
		rsc.expectations.DeleteExpectations(key)
		return nil
	}
	if err != nil {
		glog.Infof("Unable to retrieve ReplicaSet %v from store: %v", key, err)
		rsc.queue.Add(key)
		return err
	}
	rs := *obj.(*extensions.ReplicaSet)

	// Check the expectations of the ReplicaSet before counting active pods, otherwise a new pod can sneak
	// in and update the expectations after we've retrieved active pods from the store. If a new pod enters
	// the store after we've checked the expectation, the ReplicaSet sync is just deferred till the next
	// relist.
	rsKey, err := controller.KeyFunc(&rs)
	if err != nil {
		glog.Errorf("Couldn't get key for ReplicaSet %#v: %v", rs, err)
		return err
	}
	rsNeedsSync := rsc.expectations.SatisfiedExpectations(rsKey)
	selector, err := unversioned.LabelSelectorAsSelector(rs.Spec.Selector)
	if err != nil {
		glog.Errorf("Error converting pod selector to selector: %v", err)
		return err
	}
	podList, err := rsc.podStore.Pods(rs.Namespace).List(selector)
	if err != nil {
		glog.Errorf("Error getting pods for ReplicaSet %q: %v", key, err)
		rsc.queue.Add(key)
		return err
	}

	// TODO: Do this in a single pass, or use an index.
	filteredPods := controller.FilterActivePods(podList.Items)
	if rsNeedsSync {
		rsc.manageReplicas(filteredPods, &rs)
	}

	// Count the number of pods that have labels matching the labels of the pod
	// template of the ReplicaSet; the matching pods may have more labels than
	// are in the template. Because the labels of the podTemplateSpec are a
	// superset of the selector of the ReplicaSet, the matching pods must be
	// part of the filteredPods.
	fullyLabeledReplicasCount := 0
	templateLabel := labels.Set(rs.Spec.Template.Labels).AsSelector()
	for _, pod := range filteredPods {
		if templateLabel.Matches(labels.Set(pod.Labels)) {
			fullyLabeledReplicasCount++
		}
	}

	// Always updates status as pods come up or die.
	if err := updateReplicaCount(rsc.kubeClient.Extensions().ReplicaSets(rs.Namespace), rs, len(filteredPods), fullyLabeledReplicasCount); err != nil {
		// Multiple things could lead to this update failing. Requeuing the replica set ensures
		// we retry with some fairness.
		glog.V(2).Infof("Failed to update replica count for controller %v/%v; requeuing; error: %v", rs.Namespace, rs.Name, err)
		rsc.enqueueReplicaSet(&rs)
	}
	return nil
}
func (dsc *DaemonSetsController) manage(ds *extensions.DaemonSet) {
	// Find out which nodes are running the daemon pods selected by ds.
	nodeToDaemonPods, err := dsc.getNodesToDaemonPods(ds)
	if err != nil {
		glog.Errorf("Error getting node to daemon pod mapping for daemon set %+v: %v", ds, err)
	}

	// For each node, if the node is running the daemon pod but isn't supposed to, kill the daemon
	// pod. If the node is supposed to run the daemon pod, but isn't, create the daemon pod on the node.
	nodeList, err := dsc.nodeStore.List()
	if err != nil {
		glog.Errorf("Couldn't get list of nodes when syncing daemon set %+v: %v", ds, err)
	}
	var nodesNeedingDaemonPods, podsToDelete []string
	for _, node := range nodeList.Items {
		shouldRun := dsc.nodeShouldRunDaemonPod(&node, ds)

		daemonPods, isRunning := nodeToDaemonPods[node.Name]

		switch {
		case shouldRun && !isRunning:
			// If daemon pod is supposed to be running on node, but isn't, create daemon pod.
			nodesNeedingDaemonPods = append(nodesNeedingDaemonPods, node.Name)
		case shouldRun && len(daemonPods) > 1:
			// If daemon pod is supposed to be running on node, but more than 1 daemon pod is running, delete the excess daemon pods.
			// Sort the daemon pods by creation time, so that the oldest is preserved.
			sort.Sort(podByCreationTimestamp(daemonPods))
			for i := 1; i < len(daemonPods); i++ {
				podsToDelete = append(podsToDelete, daemonPods[i].Name)
			}
		case !shouldRun && isRunning:
			// If daemon pod isn't supposed to run on node, but it is, delete all daemon pods on node.
			for i := range daemonPods {
				podsToDelete = append(podsToDelete, daemonPods[i].Name)
			}
		}
	}

	// We need to set expectations before creating/deleting pods to avoid race conditions.
	dsKey, err := controller.KeyFunc(ds)
	if err != nil {
		glog.Errorf("Couldn't get key for object %+v: %v", ds, err)
		return
	}

	createDiff := len(nodesNeedingDaemonPods)
	deleteDiff := len(podsToDelete)

	if createDiff > dsc.burstReplicas {
		createDiff = dsc.burstReplicas
	}
	if deleteDiff > dsc.burstReplicas {
		deleteDiff = dsc.burstReplicas
	}

	dsc.expectations.SetExpectations(dsKey, createDiff, deleteDiff)

	glog.V(4).Infof("Nodes needing daemon pods for daemon set %s: %+v, creating %d", ds.Name, nodesNeedingDaemonPods, createDiff)
	createWait := sync.WaitGroup{}
	createWait.Add(createDiff)
	for i := 0; i < createDiff; i++ {
		go func(ix int) {
			defer createWait.Done()
			if err := dsc.podControl.CreatePodsOnNode(nodesNeedingDaemonPods[ix], ds.Namespace, &ds.Spec.Template, ds); err != nil {
				glog.V(2).Infof("Failed creation, decrementing expectations for set %q/%q", ds.Namespace, ds.Name)
				dsc.expectations.CreationObserved(dsKey)
				utilruntime.HandleError(err)
			}
		}(i)
	}
	createWait.Wait()

	glog.V(4).Infof("Pods to delete for daemon set %s: %+v, deleting %d", ds.Name, podsToDelete, deleteDiff)
	deleteWait := sync.WaitGroup{}
	deleteWait.Add(deleteDiff)
	for i := 0; i < deleteDiff; i++ {
		go func(ix int) {
			defer deleteWait.Done()
			if err := dsc.podControl.DeletePod(ds.Namespace, podsToDelete[ix], ds); err != nil {
				glog.V(2).Infof("Failed deletion, decrementing expectations for set %q/%q", ds.Namespace, ds.Name)
				dsc.expectations.DeletionObserved(dsKey)
				utilruntime.HandleError(err)
			}
		}(i)
	}
	deleteWait.Wait()
}
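To make the per-node decision table in manage easier to see in isolation, here is a hypothetical, self-contained sketch that mirrors the switch above using plain strings instead of Node and Pod objects (nodeState and classify are illustrative names, not part of the original code):

type nodeState struct {
	name       string
	shouldRun  bool
	daemonPods []string // names of daemon pods currently observed on the node
}

func classify(nodes []nodeState) (toCreate, toDelete []string) {
	for _, n := range nodes {
		isRunning := len(n.daemonPods) > 0
		switch {
		case n.shouldRun && !isRunning:
			// node should run the daemon pod but has none: create one
			toCreate = append(toCreate, n.name)
		case n.shouldRun && len(n.daemonPods) > 1:
			// duplicates: keep one, delete the rest (the real code sorts by
			// creation time first so the oldest is the one preserved)
			toDelete = append(toDelete, n.daemonPods[1:]...)
		case !n.shouldRun && isRunning:
			// node should not run the daemon pod at all: delete everything
			toDelete = append(toDelete, n.daemonPods...)
		}
	}
	return toCreate, toDelete
}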