Ejemplo n.º 1
0
// syncJob will sync the job with the given key if it has had its expectations fulfilled, meaning
// it did not expect to see any more of its pods created or deleted. This function is not meant to be invoked
// concurrently with the same key.
func (jm *JobController) syncJob(key string) error {
	startTime := time.Now()
	defer func() {
		glog.V(4).Infof("Finished syncing job %q (%v)", key, time.Now().Sub(startTime))
	}()

	if !jm.podStoreSynced() {
		// Sleep so we give the pod reflector goroutine a chance to run.
		time.Sleep(replicationcontroller.PodStoreSyncedPollPeriod)
		glog.V(4).Infof("Waiting for pods controller to sync, requeuing job %v", key)
		jm.queue.Add(key)
		return nil
	}

	obj, exists, err := jm.jobStore.Store.GetByKey(key)
	if !exists {
		glog.V(4).Infof("Job has been deleted: %v", key)
		jm.expectations.DeleteExpectations(key)
		return nil
	}
	if err != nil {
		glog.Errorf("Unable to retrieve job %v from store: %v", key, err)
		jm.queue.Add(key)
		return err
	}
	job := *obj.(*batch.Job)

	// Check the expectations of the job before counting active pods, otherwise a new pod can sneak in
	// and update the expectations after we've retrieved active pods from the store. If a new pod enters
	// the store after we've checked the expectation, the job sync is just deferred till the next relist.
	jobKey, err := controller.KeyFunc(&job)
	if err != nil {
		glog.Errorf("Couldn't get key for job %#v: %v", job, err)
		return err
	}
	jobNeedsSync := jm.expectations.SatisfiedExpectations(jobKey)
	selector, _ := unversioned.LabelSelectorAsSelector(job.Spec.Selector)
	podList, err := jm.podStore.Pods(job.Namespace).List(selector)
	if err != nil {
		glog.Errorf("Error getting pods for job %q: %v", key, err)
		jm.queue.Add(key)
		return err
	}

	activePods := controller.FilterActivePods(podList.Items)
	active := int32(len(activePods))
	succeeded, failed := getStatus(podList.Items)
	conditions := len(job.Status.Conditions)
	if job.Status.StartTime == nil {
		now := unversioned.Now()
		job.Status.StartTime = &now
	}
	// if job was finished previously, we don't want to redo the termination
	if isJobFinished(&job) {
		return nil
	}
	if pastActiveDeadline(&job) {
		// TODO: below code should be replaced with pod termination resulting in
		// pod failures, rather than killing pods. Unfortunately none such solution
		// exists ATM. There's an open discussion in the topic in
		// https://github.com/kubernetes/kubernetes/issues/14602 which might give
		// some sort of solution to above problem.
		// kill remaining active pods
		wait := sync.WaitGroup{}
		wait.Add(int(active))
		for i := int32(0); i < active; i++ {
			go func(ix int32) {
				defer wait.Done()
				if err := jm.podControl.DeletePod(job.Namespace, activePods[ix].Name, &job); err != nil {
					defer utilruntime.HandleError(err)
				}
			}(i)
		}
		wait.Wait()
		// update status values accordingly
		failed += active
		active = 0
		job.Status.Conditions = append(job.Status.Conditions, newCondition(batch.JobFailed, "DeadlineExceeded", "Job was active longer than specified deadline"))
		jm.recorder.Event(&job, api.EventTypeNormal, "DeadlineExceeded", "Job was active longer than specified deadline")
	} else {
		if jobNeedsSync {
			active = jm.manageJob(activePods, succeeded, &job)
		}
		completions := succeeded
		complete := false
		if job.Spec.Completions == nil {
			// This type of job is complete when any pod exits with success.
			// Each pod is capable of
			// determining whether or not the entire Job is done.  Subsequent pods are
			// not expected to fail, but if they do, the failure is ignored.  Once any
			// pod succeeds, the controller waits for remaining pods to finish, and
			// then the job is complete.
			if succeeded > 0 && active == 0 {
				complete = true
			}
		} else {
			// Job specifies a number of completions.  This type of job signals
			// success by having that number of successes.  Since we do not
			// start more pods than there are remaining completions, there should
			// not be any remaining active pods once this count is reached.
			if completions >= *job.Spec.Completions {
				complete = true
				if active > 0 {
					jm.recorder.Event(&job, api.EventTypeWarning, "TooManyActivePods", "Too many active pods running after completion count reached")
				}
				if completions > *job.Spec.Completions {
					jm.recorder.Event(&job, api.EventTypeWarning, "TooManySucceededPods", "Too many succeeded pods running after completion count reached")
				}
			}
		}
		if complete {
			job.Status.Conditions = append(job.Status.Conditions, newCondition(batch.JobComplete, "", ""))
			now := unversioned.Now()
			job.Status.CompletionTime = &now
		}
	}

	// no need to update the job if the status hasn't changed since last time
	if job.Status.Active != active || job.Status.Succeeded != succeeded || job.Status.Failed != failed || len(job.Status.Conditions) != conditions {
		job.Status.Active = active
		job.Status.Succeeded = succeeded
		job.Status.Failed = failed

		if err := jm.updateHandler(&job); err != nil {
			glog.Errorf("Failed to update job %v, requeuing.  Error: %v", job.Name, err)
			jm.enqueueController(&job)
		}
	}
	return nil
}
Ejemplo n.º 2
0
// syncReplicaSet will sync the ReplicaSet with the given key if it has had its expectations fulfilled,
// meaning it did not expect to see any more of its pods created or deleted. This function is not meant to be
// invoked concurrently with the same key.
func (rsc *ReplicaSetController) syncReplicaSet(key string) error {
	startTime := time.Now()
	defer func() {
		glog.V(4).Infof("Finished syncing replica set %q (%v)", key, time.Now().Sub(startTime))
	}()

	if !rsc.podStoreSynced() {
		// Sleep so we give the pod reflector goroutine a chance to run.
		time.Sleep(PodStoreSyncedPollPeriod)
		glog.Infof("Waiting for pods controller to sync, requeuing ReplicaSet %v", key)
		rsc.queue.Add(key)
		return nil
	}

	obj, exists, err := rsc.rsStore.Store.GetByKey(key)
	if !exists {
		glog.Infof("ReplicaSet has been deleted %v", key)
		rsc.expectations.DeleteExpectations(key)
		return nil
	}
	if err != nil {
		glog.Infof("Unable to retrieve ReplicaSet %v from store: %v", key, err)
		rsc.queue.Add(key)
		return err
	}
	rs := *obj.(*extensions.ReplicaSet)

	// Check the expectations of the ReplicaSet before counting active pods, otherwise a new pod can sneak
	// in and update the expectations after we've retrieved active pods from the store. If a new pod enters
	// the store after we've checked the expectation, the ReplicaSet sync is just deferred till the next
	// relist.
	rsKey, err := controller.KeyFunc(&rs)
	if err != nil {
		glog.Errorf("Couldn't get key for ReplicaSet %#v: %v", rs, err)
		return err
	}
	rsNeedsSync := rsc.expectations.SatisfiedExpectations(rsKey)
	selector, err := unversioned.LabelSelectorAsSelector(rs.Spec.Selector)
	if err != nil {
		glog.Errorf("Error converting pod selector to selector: %v", err)
		return err
	}
	podList, err := rsc.podStore.Pods(rs.Namespace).List(selector)
	if err != nil {
		glog.Errorf("Error getting pods for ReplicaSet %q: %v", key, err)
		rsc.queue.Add(key)
		return err
	}

	// TODO: Do this in a single pass, or use an index.
	filteredPods := controller.FilterActivePods(podList.Items)
	if rsNeedsSync {
		rsc.manageReplicas(filteredPods, &rs)
	}

	// Count the number of pods that have labels matching the labels of the pod
	// template of the replicaSet, the matching pods may have more labels than
	// are in the template. Because the label of podTemplateSpec is a superset
	// of the selector of the replicaset, so the possible matching pods must be
	// part of the filteredPods.
	fullyLabeledReplicasCount := 0
	templateLabel := labels.Set(rs.Spec.Template.Labels).AsSelector()
	for _, pod := range filteredPods {
		if templateLabel.Matches(labels.Set(pod.Labels)) {
			fullyLabeledReplicasCount++
		}
	}

	// Always updates status as pods come up or die.
	if err := updateReplicaCount(rsc.kubeClient.Extensions().ReplicaSets(rs.Namespace), rs, len(filteredPods), fullyLabeledReplicasCount); err != nil {
		// Multiple things could lead to this update failing. Requeuing the replica set ensures
		// we retry with some fairness.
		glog.V(2).Infof("Failed to update replica count for controller %v/%v; requeuing; error: %v", rs.Namespace, rs.Name, err)
		rsc.enqueueReplicaSet(&rs)
	}
	return nil
}