// recoverablePullError wraps the error gotten when trying to pull and image if // the error is recoverable. func (d *DockerDriver) recoverablePullError(err error, image string) error { recoverable := true if imageNotFoundMatcher.MatchString(err.Error()) { recoverable = false } return dstructs.NewRecoverableError(fmt.Errorf("Failed to pull `%s`: %s", image, err), recoverable) }
func TestClient_RestartTracker_StartError_Recoverable_Fail(t *testing.T) { t.Parallel() p := testPolicy(true, structs.RestartPolicyModeFail) rt := newRestartTracker(p, structs.JobTypeSystem) recErr := cstructs.NewRecoverableError(fmt.Errorf("foo"), true) for i := 0; i < p.Attempts; i++ { state, when := rt.SetStartError(recErr).GetState() if state != structs.TaskRestarting { t.Fatalf("NextRestart() returned %v, want %v", state, structs.TaskRestarting) } if !withinJitter(p.Delay, when) { t.Fatalf("NextRestart() returned %v; want %v+jitter", when, p.Delay) } } // Next restart should cause fail if state, _ := rt.SetStartError(recErr).GetState(); state != structs.TaskNotRestarting { t.Fatalf("NextRestart() returned %v; want %v", state, structs.TaskNotRestarting) } }
func (r *TaskRunner) run() { // Predeclare things so we an jump to the RESTART var handleEmpty bool for { // Download the task's artifacts if !r.artifactsDownloaded && len(r.task.Artifacts) > 0 { r.setState(structs.TaskStatePending, structs.NewTaskEvent(structs.TaskDownloadingArtifacts)) taskDir, ok := r.ctx.AllocDir.TaskDirs[r.task.Name] if !ok { err := fmt.Errorf("task directory couldn't be found") r.setState(structs.TaskStateDead, structs.NewTaskEvent(structs.TaskDriverFailure).SetDriverError(err)) r.logger.Printf("[ERR] client: task directory for alloc %q task %q couldn't be found", r.alloc.ID, r.task.Name) r.restartTracker.SetStartError(err) goto RESTART } for _, artifact := range r.task.Artifacts { if err := getter.GetArtifact(r.taskEnv, artifact, taskDir, r.logger); err != nil { r.setState(structs.TaskStateDead, structs.NewTaskEvent(structs.TaskArtifactDownloadFailed).SetDownloadError(err)) r.restartTracker.SetStartError(cstructs.NewRecoverableError(err, true)) goto RESTART } } r.artifactsDownloaded = true } // Start the task if not yet started or it is being forced. This logic // is necessary because in the case of a restore the handle already // exists. r.handleLock.Lock() handleEmpty = r.handle == nil r.handleLock.Unlock() if handleEmpty { startErr := r.startTask() r.restartTracker.SetStartError(startErr) if startErr != nil { r.setState(structs.TaskStateDead, structs.NewTaskEvent(structs.TaskDriverFailure).SetDriverError(startErr)) goto RESTART } } // Mark the task as started r.setState(structs.TaskStateRunning, structs.NewTaskEvent(structs.TaskStarted)) // Wait for updates WAIT: for { select { case waitRes := <-r.handle.WaitCh(): if waitRes == nil { panic("nil wait") } // Log whether the task was successful or not. r.restartTracker.SetWaitResult(waitRes) r.setState(structs.TaskStateDead, r.waitErrorToEvent(waitRes)) if !waitRes.Successful() { r.logger.Printf("[INFO] client: task %q for alloc %q failed: %v", r.task.Name, r.alloc.ID, waitRes) } else { r.logger.Printf("[INFO] client: task %q for alloc %q completed successfully", r.task.Name, r.alloc.ID) } break WAIT case update := <-r.updateCh: if err := r.handleUpdate(update); err != nil { r.logger.Printf("[ERR] client: update to task %q failed: %v", r.task.Name, err) } case <-r.destroyCh: // Kill the task using an exponential backoff in-case of failures. destroySuccess, err := r.handleDestroy() if !destroySuccess { // We couldn't successfully destroy the resource created. r.logger.Printf("[ERR] client: failed to kill task %q. Resources may have been leaked: %v", r.task.Name, err) } // Store that the task has been destroyed and any associated error. r.setState(structs.TaskStateDead, structs.NewTaskEvent(structs.TaskKilled).SetKillError(err)) return } } RESTART: state, when := r.restartTracker.GetState() r.restartTracker.SetStartError(nil).SetWaitResult(nil) reason := r.restartTracker.GetReason() switch state { case structs.TaskNotRestarting, structs.TaskTerminated: r.logger.Printf("[INFO] client: Not restarting task: %v for alloc: %v ", r.task.Name, r.alloc.ID) if state == structs.TaskNotRestarting { r.setState(structs.TaskStateDead, structs.NewTaskEvent(structs.TaskNotRestarting). SetRestartReason(reason)) } return case structs.TaskRestarting: r.logger.Printf("[INFO] client: Restarting task %q for alloc %q in %v", r.task.Name, r.alloc.ID, when) r.setState(structs.TaskStatePending, structs.NewTaskEvent(structs.TaskRestarting). SetRestartDelay(when). SetRestartReason(reason)) default: r.logger.Printf("[ERR] client: restart tracker returned unknown state: %q", state) return } // Sleep but watch for destroy events. select { case <-time.After(when): case <-r.destroyCh: } // Destroyed while we were waiting to restart, so abort. r.destroyLock.Lock() destroyed := r.destroy r.destroyLock.Unlock() if destroyed { r.logger.Printf("[DEBUG] client: Not restarting task: %v because it's destroyed by user", r.task.Name) r.setState(structs.TaskStateDead, structs.NewTaskEvent(structs.TaskKilled)) return } // Clear the handle so a new driver will be created. r.handleLock.Lock() r.handle = nil r.handleLock.Unlock() } }