func filterActiveJobs(jobs *batchv1.JobList) (active []*batchv1.Job) {
	for i := range jobs.Items {
		j := jobs.Items[i]
		if !job.IsJobFinished(&j) {
			active = append(active, &j)
		}
	}
	return
}
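// A hypothetical companion helper (not from the original file) showing how
// filterActiveJobs might be used: count the namespace's unfinished jobs.
// It reuses the same client calls seen in the helpers above.
func countActiveJobs(c clientset.Interface, ns string) (int, error) {
	jobs, err := c.Batch().Jobs(ns).List(v1.ListOptions{})
	if err != nil {
		return 0, err
	}
	return len(filterActiveJobs(jobs)), nil
}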
// waitForAnyFinishedJob waits for any completed job to appear.
func waitForAnyFinishedJob(c clientset.Interface, ns string) error {
	return wait.Poll(framework.Poll, cronJobTimeout, func() (bool, error) {
		jobs, err := c.Batch().Jobs(ns).List(v1.ListOptions{})
		if err != nil {
			return false, err
		}
		for i := range jobs.Items {
			if job.IsJobFinished(&jobs.Items[i]) {
				return true, nil
			}
		}
		return false, nil
	})
}
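// A sketch of a sibling helper built on the same wait.Poll pattern
// (hypothetical; this exact function is not shown in the original): block
// until at least `count` unfinished jobs exist in the namespace.
func waitForActiveJobs(c clientset.Interface, ns string, count int) error {
	return wait.Poll(framework.Poll, cronJobTimeout, func() (bool, error) {
		jobs, err := c.Batch().Jobs(ns).List(v1.ListOptions{})
		if err != nil {
			return false, err
		}
		return len(filterActiveJobs(jobs)) >= count, nil
	})
}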
// SyncOne reconciles a CronJob with a list of any Jobs that it created.
// All known jobs created by "sj" should be included in "js".
// The current time is passed in to facilitate testing.
// It has no receiver, to facilitate testing.
func SyncOne(sj batch.CronJob, js []batch.Job, now time.Time, jc jobControlInterface, sjc sjControlInterface, pc podControlInterface, recorder record.EventRecorder) {
	nameForLog := fmt.Sprintf("%s/%s", sj.Namespace, sj.Name)

	for i := range js {
		j := js[i]
		found := inActiveList(sj, j.ObjectMeta.UID)
		if !found && !job.IsJobFinished(&j) {
			recorder.Eventf(&sj, api.EventTypeWarning, "UnexpectedJob", "Saw a job that the controller did not create or forgot: %v", j.Name)
			// We found an unfinished job that has us as the parent, but it is not in our Active list.
			// This could happen if we crashed right after creating the Job and before updating the status,
			// or if our jobs list is newer than our sj status after a relist, or if someone intentionally
			// created a job that they wanted us to adopt.

			// TODO: maybe handle the adoption case? Concurrency/suspend rules will not apply in that case,
			// obviously, since we can't stop users from creating jobs if they have permission. It is assumed
			// that if a user has permission to create a job within a namespace, then they have permission to
			// make any CronJob in the same namespace "adopt" that job. ReplicaSets and their Pods work the
			// same way.
			// TBD: how to update sj.Status.LastScheduleTime if the adopted job is newer than any we knew about?
		} else if found && job.IsJobFinished(&j) {
			deleteFromActiveList(&sj, j.ObjectMeta.UID)
			// TODO: event to call out failure vs success.
			recorder.Eventf(&sj, api.EventTypeNormal, "SawCompletedJob", "Saw completed job: %v", j.Name)
		}
	}

	updatedSJ, err := sjc.UpdateStatus(&sj)
	if err != nil {
		glog.Errorf("Unable to update status for %s (rv = %s): %v", nameForLog, sj.ResourceVersion, err)
		// Returning here avoids dereferencing a nil updatedSJ below.
		return
	}
	sj = *updatedSJ

	if sj.Spec.Suspend != nil && *sj.Spec.Suspend {
		glog.V(4).Infof("Not starting job for %s because it is suspended", nameForLog)
		return
	}

	times, err := getRecentUnmetScheduleTimes(sj, now)
	if err != nil {
		glog.Errorf("Cannot determine if %s needs to be started: %v", nameForLog, err)
		return
	}
	// TODO: handle multiple unmet start times, from oldest to newest, updating status as needed.
	if len(times) == 0 {
		glog.V(4).Infof("No unmet start times for %s", nameForLog)
		return
	}
	if len(times) > 1 {
		glog.V(4).Infof("Multiple unmet start times for %s so only starting last one", nameForLog)
	}
	scheduledTime := times[len(times)-1]
	tooLate := false
	if sj.Spec.StartingDeadlineSeconds != nil {
		tooLate = scheduledTime.Add(time.Second * time.Duration(*sj.Spec.StartingDeadlineSeconds)).Before(now)
	}
	if tooLate {
		glog.V(4).Infof("Missed starting window for %s", nameForLog)
		// TODO: generate an event for a miss. Use a warning-level event because it indicates a
		// problem with the controller (restart or long queue), and is not expected by the user either.
		// Since we don't set LastScheduleTime when not scheduling, we are going to keep noticing
		// the miss every cycle. In order to avoid sending multiple events, and to avoid processing
		// the sj again and again, we could set a Status.LastMissedTime when we notice a miss.
		// Then, when we call getRecentUnmetScheduleTimes, we can take max(creationTimestamp,
		// Status.LastScheduleTime, Status.LastMissedTime), so we won't generate an event the next
		// time we process it, and so a user looking at the status can easily see that there was a
		// missed execution.
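		// Worked example of the deadline math above (illustrative values, not
		// from the original): with StartingDeadlineSeconds=100 and a
		// scheduledTime of 10:00:00, the start window closes at 10:01:40;
		// if now is 10:02:00, tooLate is true and this run is skipped.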
		return
	}

	if sj.Spec.ConcurrencyPolicy == batch.ForbidConcurrent && len(sj.Status.Active) > 0 {
		// Regardless of which source of information we use for the set of active jobs,
		// there is some risk that we won't see an active job when there is one
		// (because we haven't seen the status update to the SJ or the created pod).
		// So it is theoretically possible to have concurrency with Forbid.
		// As long as the invocations are "far enough apart in time", this usually won't happen.
		//
		// TODO: for Forbid, we could use the same name for every execution, as a lock.
		// With Replace, we could use a name that is deterministic per execution time.
		// But that would mean that you could not inspect prior successes or failures of Forbid jobs.
		glog.V(4).Infof("Not starting job for %s because a prior execution is still running and concurrency policy is Forbid", nameForLog)
		return
	}
	if sj.Spec.ConcurrencyPolicy == batch.ReplaceConcurrent {
		for _, j := range sj.Status.Active {
			// TODO: this should be replaced with server-side job deletion;
			// currently this mimics JobReaper from pkg/kubectl/stop.go.
			glog.V(4).Infof("Deleting job %s of %s that was still running at next scheduled start time", j.Name, nameForLog)
			job, err := jc.GetJob(j.Namespace, j.Name)
			if err != nil {
				recorder.Eventf(&sj, api.EventTypeWarning, "FailedGet", "Get job: %v", err)
				return
			}
			// Scale the job down to zero parallelism so no new pods are created.
			// The nil check guards against a panic if Parallelism was never defaulted.
			if job.Spec.Parallelism != nil && *job.Spec.Parallelism != 0 {
				zero := int32(0)
				job.Spec.Parallelism = &zero
				job, err = jc.UpdateJob(job.Namespace, job)
				if err != nil {
					recorder.Eventf(&sj, api.EventTypeWarning, "FailedUpdate", "Update job: %v", err)
					return
				}
			}
			// Remove all pods...
			selector, _ := unversioned.LabelSelectorAsSelector(job.Spec.Selector)
			options := api.ListOptions{LabelSelector: selector}
			podList, err := pc.ListPods(job.Namespace, options)
			if err != nil {
				recorder.Eventf(&sj, api.EventTypeWarning, "FailedList", "List job-pods: %v", err)
				// Returning here avoids ranging over a nil podList below.
				return
			}
			errList := []error{}
			for _, pod := range podList.Items {
				glog.V(2).Infof("CronJob controller is deleting Pod %v/%v", pod.Namespace, pod.Name)
				if err := pc.DeletePod(pod.Namespace, pod.Name); err != nil {
					// Ignore the error when the pod isn't found.
					if !errors.IsNotFound(err) {
						errList = append(errList, err)
					}
				}
			}
			if len(errList) != 0 {
				recorder.Eventf(&sj, api.EventTypeWarning, "FailedDelete", "Deleted job-pods: %v", utilerrors.NewAggregate(errList))
				return
			}
			// ... the job itself...
			if err := jc.DeleteJob(job.Namespace, job.Name); err != nil {
				recorder.Eventf(&sj, api.EventTypeWarning, "FailedDelete", "Deleted job: %v", err)
				glog.Errorf("Error deleting job %s from %s: %v", job.Name, nameForLog, err)
				return
			}
			// ... and its reference from the active list.
			deleteFromActiveList(&sj, job.ObjectMeta.UID)
			recorder.Eventf(&sj, api.EventTypeNormal, "SuccessfulDelete", "Deleted job %v", j.Name)
		}
	}

	jobReq, err := getJobFromTemplate(&sj, scheduledTime)
	if err != nil {
		glog.Errorf("Unable to make Job from template in %s: %v", nameForLog, err)
		return
	}
	jobResp, err := jc.CreateJob(sj.Namespace, jobReq)
	if err != nil {
		recorder.Eventf(&sj, api.EventTypeWarning, "FailedCreate", "Error creating job: %v", err)
		return
	}
	glog.V(4).Infof("Created Job %s for %s", jobResp.Name, nameForLog)
	recorder.Eventf(&sj, api.EventTypeNormal, "SuccessfulCreate", "Created job %v", jobResp.Name)

	// ------------------------------------------------------------------ //

	// If this process restarts at this point (after posting a job, but
	// before updating the status), then we might try to start the job
	// again the next time.
	// Actually, if we relist the SJs and Jobs on the next iteration of
	// SyncAll, we might not see our own status update, and then post one
	// again. So, we need to use the job name as a lock to prevent us from
	// making the job twice (name the job with a hash of its scheduled time).

	// Add the just-started job to the status list.
	ref, err := getRef(jobResp)
	if err != nil {
		glog.V(2).Infof("Unable to make object reference for job for %s", nameForLog)
	} else {
		sj.Status.Active = append(sj.Status.Active, *ref)
	}
	sj.Status.LastScheduleTime = &unversioned.Time{Time: scheduledTime}
	if _, err := sjc.UpdateStatus(&sj); err != nil {
		glog.Infof("Unable to update status for %s (rv = %s): %v", nameForLog, sj.ResourceVersion, err)
	}

	return
}
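// The bookkeeping helpers inActiveList and deleteFromActiveList referenced by
// SyncOne are not shown above. Minimal sketches, with signatures inferred from
// the call sites rather than copied from the real implementation (assuming
// sj.Status.Active is a []api.ObjectReference keyed by job UID):

func inActiveList(sj batch.CronJob, uid types.UID) bool {
	for _, j := range sj.Status.Active {
		if j.UID == uid {
			return true
		}
	}
	return false
}

func deleteFromActiveList(sj *batch.CronJob, uid types.UID) {
	if sj == nil {
		return
	}
	// Rebuild the active list without the reference matching uid.
	newActive := []api.ObjectReference{}
	for _, j := range sj.Status.Active {
		if j.UID != uid {
			newActive = append(newActive, j)
		}
	}
	sj.Status.Active = newActive
}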