// startTask is used to start the task if there is no handle
func (r *TaskRunner) startTask() error {
	// Create a driver
	driver, err := r.createDriver()
	if err != nil {
		e := structs.NewTaskEvent(structs.TaskDriverFailure).SetDriverError(err)
		r.setState(structs.TaskStateDead, e)
		return err
	}

	// Start the job
	handle, err := driver.Start(r.ctx, r.task)
	if err != nil {
		r.logger.Printf("[ERR] client: failed to start task '%s' for alloc '%s': %v",
			r.task.Name, r.alloc.ID, err)
		e := structs.NewTaskEvent(structs.TaskDriverFailure).
			SetDriverError(fmt.Errorf("failed to start: %v", err))
		r.setState(structs.TaskStateDead, e)
		return err
	}

	r.handleLock.Lock()
	r.handle = handle
	r.handleLock.Unlock()

	r.setState(structs.TaskStateRunning, structs.NewTaskEvent(structs.TaskStarted))
	return nil
}
// updatedTokenHandler is called when a new Vault token is retrieved. Things
// that rely on the token should be updated here.
func (r *TaskRunner) updatedTokenHandler() {
	// Update the task's environment
	if err := r.setTaskEnv(); err != nil {
		r.setState(
			structs.TaskStateDead,
			structs.NewTaskEvent(structs.TaskSetupFailure).SetSetupError(err).SetFailsTask())
		return
	}

	if r.templateManager != nil {
		r.templateManager.Stop()

		// Create a new templateManager
		var err error
		r.templateManager, err = NewTaskTemplateManager(r, r.task.Templates,
			r.config, r.vaultFuture.Get(), r.taskDir, r.getTaskEnv())
		if err != nil {
			err := fmt.Errorf("failed to build task's template manager: %v", err)
			r.setState(structs.TaskStateDead,
				structs.NewTaskEvent(structs.TaskSetupFailure).SetSetupError(err).SetFailsTask())
			r.logger.Printf("[ERR] client: alloc %q, task %q %v", r.alloc.ID, r.task.Name, err)
			r.Kill("vault", err.Error(), true)
			return
		}
	}
}
func TestTaskRunner_SaveRestoreState(t *testing.T) {
	ctestutil.ExecCompatible(t)
	upd, tr := testTaskRunner(false)

	// Change command to ensure we run for a bit
	tr.task.Config["command"] = "/bin/sleep"
	tr.task.Config["args"] = []string{"10"}
	go tr.Run()
	defer tr.Destroy(structs.NewTaskEvent(structs.TaskKilled))

	// Snapshot state
	time.Sleep(2 * time.Second)
	if err := tr.SaveState(); err != nil {
		t.Fatalf("err: %v", err)
	}

	// Create a new task runner
	tr2 := NewTaskRunner(tr.logger, tr.config, upd.Update,
		tr.ctx, tr.alloc, &structs.Task{Name: tr.task.Name})
	if err := tr2.RestoreState(); err != nil {
		t.Fatalf("err: %v", err)
	}
	go tr2.Run()
	defer tr2.Destroy(structs.NewTaskEvent(structs.TaskKilled))

	// Wait for the restored task runner to re-open the handle
	testutil.WaitForResult(func() (bool, error) {
		return tr2.handle != nil, fmt.Errorf("RestoreState() didn't open handle")
	}, func(err error) {
		t.Fatalf("err: %v", err)
	})
}
// Signal will send a signal to the task
func (r *TaskRunner) Signal(source, reason string, s os.Signal) error {
	reasonStr := fmt.Sprintf("%s: %s", source, reason)
	event := structs.NewTaskEvent(structs.TaskSignaling).SetTaskSignal(s).SetTaskSignalReason(reasonStr)

	r.logger.Printf("[DEBUG] client: sending signal %v to task %v for alloc %q", s, r.task.Name, r.alloc.ID)

	r.runningLock.Lock()
	running := r.running
	r.runningLock.Unlock()

	// Drop the signal if the task isn't running
	if !running {
		r.logger.Printf("[DEBUG] client: skipping signal since task isn't running")
		return nil
	}

	resCh := make(chan error)
	se := SignalEvent{
		s:      s,
		e:      event,
		result: resCh,
	}
	select {
	case r.signalCh <- se:
	case <-r.waitCh:
	}

	return <-resCh
}
// NewTaskRunner is used to create a new task context
func NewTaskRunner(logger *log.Logger, config *config.Config,
	updater TaskStateUpdater, ctx *driver.ExecContext,
	alloc *structs.Allocation, task *structs.Task,
	consulService *ConsulService) *TaskRunner {

	// Merge in the task resources
	task.Resources = alloc.TaskResources[task.Name]

	// Build the restart tracker.
	tg := alloc.Job.LookupTaskGroup(alloc.TaskGroup)
	if tg == nil {
		logger.Printf("[ERR] client: alloc '%s' for missing task group '%s'", alloc.ID, alloc.TaskGroup)
		return nil
	}
	restartTracker := newRestartTracker(tg.RestartPolicy, alloc.Job.Type)

	tc := &TaskRunner{
		config:         config,
		updater:        updater,
		logger:         logger,
		restartTracker: restartTracker,
		consulService:  consulService,
		ctx:            ctx,
		alloc:          alloc,
		task:           task,
		updateCh:       make(chan *structs.Allocation, 8),
		destroyCh:      make(chan struct{}),
		waitCh:         make(chan struct{}),
	}

	// Set the state to pending.
	tc.updater(task.Name, structs.TaskStatePending, structs.NewTaskEvent(structs.TaskReceived))
	return tc
}
// setTaskState is used to set the status of a task
func (r *AllocRunner) setTaskState(taskName, state string, event *structs.TaskEvent) {
	r.taskStatusLock.Lock()
	defer r.taskStatusLock.Unlock()
	taskState, ok := r.taskStates[taskName]
	if !ok {
		taskState = &structs.TaskState{}
		r.taskStates[taskName] = taskState
	}

	// Set the task's state.
	taskState.State = state
	r.appendTaskEvent(taskState, event)

	if state == structs.TaskStateDead {
		// If the task failed, we should kill all the other tasks in the task group.
		if taskState.Failed() {
			var destroyingTasks []string
			for task, tr := range r.tasks {
				if task != taskName {
					destroyingTasks = append(destroyingTasks, task)
					tr.Destroy(structs.NewTaskEvent(structs.TaskSiblingFailed).SetFailedSibling(taskName))
				}
			}
			if len(destroyingTasks) > 0 {
				r.logger.Printf("[DEBUG] client: task %q failed, destroying other tasks in task group: %v", taskName, destroyingTasks)
			}
		}
	}

	select {
	case r.dirtyCh <- struct{}{}:
	default:
	}
}
func TestTaskRunner_SimpleRun(t *testing.T) {
	ctestutil.ExecCompatible(t)
	upd, tr := testTaskRunner(false)
	tr.MarkReceived()
	go tr.Run()
	defer tr.Destroy(structs.NewTaskEvent(structs.TaskKilled))
	defer tr.ctx.AllocDir.Destroy()

	select {
	case <-tr.WaitCh():
	case <-time.After(time.Duration(testutil.TestMultiplier()*15) * time.Second):
		t.Fatalf("timeout")
	}

	if len(upd.events) != 3 {
		t.Fatalf("should have 3 updates: %#v", upd.events)
	}

	if upd.state != structs.TaskStateDead {
		t.Fatalf("TaskState %v; want %v", upd.state, structs.TaskStateDead)
	}

	if upd.events[0].Type != structs.TaskReceived {
		t.Fatalf("First Event was %v; want %v", upd.events[0].Type, structs.TaskReceived)
	}

	if upd.events[1].Type != structs.TaskStarted {
		t.Fatalf("Second Event was %v; want %v", upd.events[1].Type, structs.TaskStarted)
	}

	if upd.events[2].Type != structs.TaskTerminated {
		t.Fatalf("Third Event was %v; want %v", upd.events[2].Type, structs.TaskTerminated)
	}
}
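// The tests above inspect upd.state, upd.failed, and upd.events on the value
// returned by the testTaskRunner/testTaskRunnerFromAlloc helpers, which are not
// shown in this section. The sketch below is an assumption of a minimal mock
// updater that would satisfy the TaskStateUpdater signature used by
// NewTaskRunner and record what the runner reports; the name
// MockTaskStateUpdater and its fields are illustrative, not the actual helper.
type MockTaskStateUpdater struct {
	state  string
	failed bool
	events []*structs.TaskEvent
}

// Update records the most recent state and appends each event, mirroring how
// the assertions above read upd.state and upd.events after the runner exits.
func (m *MockTaskStateUpdater) Update(name, state string, event *structs.TaskEvent) {
	if state != "" {
		m.state = state
	}
	if event != nil {
		if event.FailsTask {
			m.failed = true
		}
		m.events = append(m.events, event)
	}
}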
func TestTaskRunner_Validate_UserEnforcement(t *testing.T) {
	_, tr := testTaskRunner(false)
	defer tr.Destroy(structs.NewTaskEvent(structs.TaskKilled))
	defer tr.ctx.AllocDir.Destroy()

	if err := tr.setTaskEnv(); err != nil {
		t.Fatalf("bad: %v", err)
	}

	// Try to run as root with exec.
	tr.task.Driver = "exec"
	tr.task.User = "******"
	if err := tr.validateTask(); err == nil {
		t.Fatalf("expected error running as root with exec")
	}

	// Try to run a non-blacklisted user with exec.
	tr.task.Driver = "exec"
	tr.task.User = "******"
	if err := tr.validateTask(); err != nil {
		t.Fatalf("unexpected error: %v", err)
	}

	// Try to run as root with docker.
	tr.task.Driver = "docker"
	tr.task.User = "******"
	if err := tr.validateTask(); err != nil {
		t.Fatalf("unexpected error: %v", err)
	}
}
// checkResources monitors and enforces alloc resource usage. It returns an
// appropriate task event describing why the allocation had to be killed.
func (r *AllocRunner) checkResources() (*structs.TaskEvent, string) {
	diskSize := r.ctx.AllocDir.GetSize()
	diskLimit := r.Alloc().Resources.DiskInBytes()
	if diskSize > diskLimit {
		return structs.NewTaskEvent(structs.TaskDiskExceeded).SetDiskLimit(diskLimit).SetDiskSize(diskSize),
			"shared allocation directory exceeded the allowed disk space"
	}
	return nil, ""
}
func TestTaskRunner_Update(t *testing.T) {
	ctestutil.ExecCompatible(t)
	_, tr := testTaskRunner(false)

	// Change command to ensure we run for a bit
	tr.task.Config["command"] = "/bin/sleep"
	tr.task.Config["args"] = []string{"100"}
	go tr.Run()
	defer tr.Destroy(structs.NewTaskEvent(structs.TaskKilled))
	defer tr.ctx.AllocDir.Destroy()

	// Update the task definition
	updateAlloc := tr.alloc.Copy()

	// Update the restart policy
	newTG := updateAlloc.Job.TaskGroups[0]
	newMode := "foo"
	newTG.RestartPolicy.Mode = newMode

	newTask := updateAlloc.Job.TaskGroups[0].Tasks[0]
	newTask.Driver = "foobar"

	// Wait for the task to start, then update the kill timeout
	testutil.WaitForResult(func() (bool, error) {
		if tr.handle == nil {
			return false, fmt.Errorf("task not started")
		}
		return true, nil
	}, func(err error) {
		t.Fatalf("err: %v", err)
	})

	oldHandle := tr.handle.ID()
	newTask.KillTimeout = time.Hour

	tr.Update(updateAlloc)

	// Wait for update to take place
	testutil.WaitForResult(func() (bool, error) {
		if tr.task == newTask {
			return false, fmt.Errorf("We copied the pointer! This would be very bad")
		}
		if tr.task.Driver != newTask.Driver {
			return false, fmt.Errorf("Task not copied")
		}
		if tr.restartTracker.policy.Mode != newMode {
			return false, fmt.Errorf("restart policy not updated")
		}
		if tr.handle.ID() == oldHandle {
			return false, fmt.Errorf("handle not updated")
		}
		return true, nil
	}, func(err error) {
		t.Fatalf("err: %v", err)
	})
}
// Kill will kill a task and store the error, no longer restarting the task. If
// fail is set, the task is marked as having failed.
func (r *TaskRunner) Kill(source, reason string, fail bool) {
	reasonStr := fmt.Sprintf("%s: %s", source, reason)
	event := structs.NewTaskEvent(structs.TaskKilling).SetKillReason(reasonStr)
	if fail {
		event.SetFailsTask()
	}

	r.logger.Printf("[DEBUG] client: killing task %v for alloc %q: %v", r.task.Name, r.alloc.ID, reasonStr)
	r.Destroy(event)
}
func TestTaskRunner_KillTask(t *testing.T) {
	alloc := mock.Alloc()
	task := alloc.Job.TaskGroups[0].Tasks[0]
	task.Driver = "mock_driver"
	task.Config = map[string]interface{}{
		"exit_code": "0",
		"run_for":   "10s",
	}

	upd, tr := testTaskRunnerFromAlloc(false, alloc)
	tr.MarkReceived()
	go tr.Run()
	defer tr.Destroy(structs.NewTaskEvent(structs.TaskKilled))
	defer tr.ctx.AllocDir.Destroy()

	go func() {
		time.Sleep(100 * time.Millisecond)
		tr.Kill("test", "kill", true)
	}()

	select {
	case <-tr.WaitCh():
	case <-time.After(time.Duration(testutil.TestMultiplier()*15) * time.Second):
		t.Fatalf("timeout")
	}

	if len(upd.events) != 4 {
		t.Fatalf("should have 4 updates: %#v", upd.events)
	}

	if upd.state != structs.TaskStateDead {
		t.Fatalf("TaskState %v; want %v", upd.state, structs.TaskStateDead)
	}

	if !upd.failed {
		t.Fatalf("TaskState should be failed: %+v", upd)
	}

	if upd.events[0].Type != structs.TaskReceived {
		t.Fatalf("First Event was %v; want %v", upd.events[0].Type, structs.TaskReceived)
	}

	if upd.events[1].Type != structs.TaskStarted {
		t.Fatalf("Second Event was %v; want %v", upd.events[1].Type, structs.TaskStarted)
	}

	if upd.events[2].Type != structs.TaskKilling {
		t.Fatalf("Third Event was %v; want %v", upd.events[2].Type, structs.TaskKilling)
	}

	if upd.events[3].Type != structs.TaskKilled {
		t.Fatalf("Fourth Event was %v; want %v", upd.events[3].Type, structs.TaskKilled)
	}
}
// shouldRestart returns if the task should restart. If the return value is
// true, the task's restart policy has already been considered and any wait time
// between restarts has been applied.
func (r *TaskRunner) shouldRestart() bool {
	state, when := r.restartTracker.GetState()
	reason := r.restartTracker.GetReason()
	switch state {
	case structs.TaskNotRestarting, structs.TaskTerminated:
		r.logger.Printf("[INFO] client: Not restarting task: %v for alloc: %v ", r.task.Name, r.alloc.ID)
		if state == structs.TaskNotRestarting {
			r.setState(structs.TaskStateDead,
				structs.NewTaskEvent(structs.TaskNotRestarting).
					SetRestartReason(reason).SetFailsTask())
		}
		return false
	case structs.TaskRestarting:
		r.logger.Printf("[INFO] client: Restarting task %q for alloc %q in %v", r.task.Name, r.alloc.ID, when)
		r.setState(structs.TaskStatePending,
			structs.NewTaskEvent(structs.TaskRestarting).
				SetRestartDelay(when).
				SetRestartReason(reason))
	default:
		r.logger.Printf("[ERR] client: restart tracker returned unknown state: %q", state)
		return false
	}

	// Sleep but watch for destroy events.
	select {
	case <-time.After(when):
	case <-r.destroyCh:
	}

	// Destroyed while we were waiting to restart, so abort.
	r.destroyLock.Lock()
	destroyed := r.destroy
	r.destroyLock.Unlock()
	if destroyed {
		r.logger.Printf("[DEBUG] client: Not restarting task: %v because it has been destroyed", r.task.Name)
		r.setState(structs.TaskStateDead, r.destroyEvent)
		return false
	}

	return true
}
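// A minimal sketch of how a run loop could consume shouldRestart. This caller
// is assumed for illustration only and is not part of this section; it follows
// the pattern in run() below, where the handle is cleared so the next pass
// through the loop starts a fresh driver invocation.
func (r *TaskRunner) restartOrExit() bool {
	if !r.shouldRestart() {
		// The tracker decided not to restart, or the runner was destroyed
		// while waiting; the task state has already been set.
		return false
	}

	// Clear the handle so a new driver handle is created on the restart.
	r.handleLock.Lock()
	r.handle = nil
	r.handleLock.Unlock()
	return true
}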
// Run is a long running routine used to manage the task
func (r *TaskRunner) Run() {
	defer close(r.waitCh)
	r.logger.Printf("[DEBUG] client: starting task context for '%s' (alloc '%s')",
		r.task.Name, r.alloc.ID)

	if err := r.validateTask(); err != nil {
		r.setState(
			structs.TaskStateDead,
			structs.NewTaskEvent(structs.TaskFailedValidation).SetValidationError(err))
		return
	}

	if err := r.setTaskEnv(); err != nil {
		r.setState(
			structs.TaskStateDead,
			structs.NewTaskEvent(structs.TaskDriverFailure).SetDriverError(err))
		return
	}

	r.run()
	return
}
// killTask kills the running task. A killing event can optionally be passed and
// this event is used to mark the task as being killed. It provides a means to
// store extra information.
func (r *TaskRunner) killTask(killingEvent *structs.TaskEvent) {
	r.runningLock.Lock()
	running := r.running
	r.runningLock.Unlock()
	if !running {
		return
	}

	// Get the kill timeout
	timeout := driver.GetKillTimeout(r.task.KillTimeout, r.config.MaxKillTimeout)

	// Build the event
	var event *structs.TaskEvent
	if killingEvent != nil {
		event = killingEvent
		event.Type = structs.TaskKilling
	} else {
		event = structs.NewTaskEvent(structs.TaskKilling)
	}
	event.SetKillTimeout(timeout)

	// Mark that we received the kill event
	r.setState(structs.TaskStateRunning, event)

	// Kill the task using an exponential backoff in case of failures.
	destroySuccess, err := r.handleDestroy()
	if !destroySuccess {
		// We couldn't successfully destroy the resource created.
		r.logger.Printf("[ERR] client: failed to kill task %q. Resources may have been leaked: %v", r.task.Name, err)
	}

	r.runningLock.Lock()
	r.running = false
	r.runningLock.Unlock()

	// Store that the task has been destroyed and any associated error.
	r.setState("", structs.NewTaskEvent(structs.TaskKilled).SetKillError(err))
}
// Run is a long running routine used to manage the task
func (r *TaskRunner) Run() {
	defer close(r.waitCh)
	r.logger.Printf("[DEBUG] client: starting task context for '%s' (alloc '%s')",
		r.task.Name, r.alloc.ID)

	// Create the initial environment, this will be recreated if a Vault token
	// is needed
	if err := r.setTaskEnv(); err != nil {
		r.setState(
			structs.TaskStateDead,
			structs.NewTaskEvent(structs.TaskSetupFailure).SetSetupError(err))
		return
	}

	if err := r.validateTask(); err != nil {
		r.setState(
			structs.TaskStateDead,
			structs.NewTaskEvent(structs.TaskFailedValidation).SetValidationError(err).SetFailsTask())
		return
	}

	// If there is no Vault policy leave the static future created in
	// NewTaskRunner
	if r.task.Vault != nil {
		// Start the go-routine to get a Vault token
		r.vaultFuture.Clear()
		go r.vaultManager(r.recoveredVaultToken)
	}

	// Start the run loop
	r.run()

	// Do any cleanup necessary
	r.postrun()
	return
}
func TestTaskRunner_DeriveToken_Unrecoverable(t *testing.T) {
	alloc := mock.Alloc()
	task := alloc.Job.TaskGroups[0].Tasks[0]
	task.Driver = "mock_driver"
	task.Config = map[string]interface{}{
		"exit_code": "0",
		"run_for":   "10s",
	}
	task.Vault = &structs.Vault{
		Policies:   []string{"default"},
		ChangeMode: structs.VaultChangeModeRestart,
	}

	upd, tr := testTaskRunnerFromAlloc(false, alloc)
	tr.MarkReceived()
	defer tr.Destroy(structs.NewTaskEvent(structs.TaskKilled))
	defer tr.ctx.AllocDir.Destroy()

	// Error the token derivation
	vc := tr.vaultClient.(*vaultclient.MockVaultClient)
	vc.SetDeriveTokenError(alloc.ID, []string{task.Name}, fmt.Errorf("Non recoverable"))
	go tr.Run()

	// Wait for the task to be killed due to the unrecoverable token error
	testutil.WaitForResult(func() (bool, error) {
		if l := len(upd.events); l != 2 {
			return false, fmt.Errorf("Expect two events; got %v", l)
		}

		if upd.events[0].Type != structs.TaskReceived {
			return false, fmt.Errorf("First Event was %v; want %v", upd.events[0].Type, structs.TaskReceived)
		}

		if upd.events[1].Type != structs.TaskKilling {
			return false, fmt.Errorf("Second Event was %v; want %v", upd.events[1].Type, structs.TaskKilling)
		}

		return true, nil
	}, func(err error) {
		t.Fatalf("err: %v", err)
	})
}
func TestTaskRunner_SignalFailure(t *testing.T) {
	alloc := mock.Alloc()
	task := alloc.Job.TaskGroups[0].Tasks[0]
	task.Driver = "mock_driver"
	task.Config = map[string]interface{}{
		"exit_code":    "0",
		"run_for":      "10s",
		"signal_error": "test forcing failure",
	}

	_, tr := testTaskRunnerFromAlloc(false, alloc)
	tr.MarkReceived()
	go tr.Run()
	defer tr.Destroy(structs.NewTaskEvent(structs.TaskKilled))
	defer tr.ctx.AllocDir.Destroy()

	time.Sleep(100 * time.Millisecond)

	if err := tr.Signal("test", "test", syscall.SIGINT); err == nil {
		t.Fatalf("Didn't receive error")
	}
}
func TestTaskRunner_Run_RecoverableStartError(t *testing.T) {
	alloc := mock.Alloc()
	task := alloc.Job.TaskGroups[0].Tasks[0]
	task.Driver = "mock_driver"
	task.Config = map[string]interface{}{
		"exit_code":               0,
		"start_error":             "driver failure",
		"start_error_recoverable": true,
	}

	upd, tr := testTaskRunnerFromAlloc(true, alloc)
	tr.MarkReceived()
	go tr.Run()
	defer tr.Destroy(structs.NewTaskEvent(structs.TaskKilled))
	defer tr.ctx.AllocDir.Destroy()

	testutil.WaitForResult(func() (bool, error) {
		if l := len(upd.events); l < 3 {
			return false, fmt.Errorf("Expect at least three events; got %v", l)
		}

		if upd.events[0].Type != structs.TaskReceived {
			return false, fmt.Errorf("First Event was %v; want %v", upd.events[0].Type, structs.TaskReceived)
		}

		if upd.events[1].Type != structs.TaskDriverFailure {
			return false, fmt.Errorf("Second Event was %v; want %v", upd.events[1].Type, structs.TaskDriverFailure)
		}

		if upd.events[2].Type != structs.TaskRestarting {
			return false, fmt.Errorf("Third Event was %v; want %v", upd.events[2].Type, structs.TaskRestarting)
		}

		return true, nil
	}, func(err error) {
		t.Fatalf("err: %v", err)
	})
}
// Restart will restart the task
func (r *TaskRunner) Restart(source, reason string) {
	reasonStr := fmt.Sprintf("%s: %s", source, reason)
	event := structs.NewTaskEvent(structs.TaskRestartSignal).SetRestartReason(reasonStr)

	r.logger.Printf("[DEBUG] client: restarting task %v for alloc %q: %v", r.task.Name, r.alloc.ID, reasonStr)

	r.runningLock.Lock()
	running := r.running
	r.runningLock.Unlock()

	// Drop the restart event
	if !running {
		r.logger.Printf("[DEBUG] client: skipping restart since task isn't running")
		return
	}

	select {
	case r.restartCh <- event:
	case <-r.waitCh:
	}
}
func TestTaskRunner_Download_Retries(t *testing.T) {
	ctestutil.ExecCompatible(t)

	// Create an allocation that has a task with bad artifacts.
	alloc := mock.Alloc()
	task := alloc.Job.TaskGroups[0].Tasks[0]
	artifact := structs.TaskArtifact{
		GetterSource: "http://127.1.1.111:12315/foo/bar/baz",
	}
	task.Artifacts = []*structs.TaskArtifact{&artifact}

	// Make the restart policy allow a single restart
	alloc.Job.TaskGroups[0].RestartPolicy = &structs.RestartPolicy{
		Attempts: 1,
		Interval: 10 * time.Minute,
		Delay:    1 * time.Second,
		Mode:     structs.RestartPolicyModeFail,
	}

	upd, tr := testTaskRunnerFromAlloc(true, alloc)
	tr.MarkReceived()
	go tr.Run()
	defer tr.Destroy(structs.NewTaskEvent(structs.TaskKilled))
	defer tr.ctx.AllocDir.Destroy()

	select {
	case <-tr.WaitCh():
	case <-time.After(time.Duration(testutil.TestMultiplier()*15) * time.Second):
		t.Fatalf("timeout")
	}

	if len(upd.events) != 7 {
		t.Fatalf("should have 7 updates: %#v", upd.events)
	}

	if upd.state != structs.TaskStateDead {
		t.Fatalf("TaskState %v; want %v", upd.state, structs.TaskStateDead)
	}

	if upd.events[0].Type != structs.TaskReceived {
		t.Fatalf("First Event was %v; want %v", upd.events[0].Type, structs.TaskReceived)
	}

	if upd.events[1].Type != structs.TaskDownloadingArtifacts {
		t.Fatalf("Second Event was %v; want %v", upd.events[1].Type, structs.TaskDownloadingArtifacts)
	}

	if upd.events[2].Type != structs.TaskArtifactDownloadFailed {
		t.Fatalf("Third Event was %v; want %v", upd.events[2].Type, structs.TaskArtifactDownloadFailed)
	}

	if upd.events[3].Type != structs.TaskRestarting {
		t.Fatalf("Fourth Event was %v; want %v", upd.events[3].Type, structs.TaskRestarting)
	}

	if upd.events[4].Type != structs.TaskDownloadingArtifacts {
		t.Fatalf("Fifth Event was %v; want %v", upd.events[4].Type, structs.TaskDownloadingArtifacts)
	}

	if upd.events[5].Type != structs.TaskArtifactDownloadFailed {
		t.Fatalf("Sixth Event was %v; want %v", upd.events[5].Type, structs.TaskArtifactDownloadFailed)
	}

	if upd.events[6].Type != structs.TaskNotRestarting {
		t.Fatalf("Seventh Event was %v; want %v", upd.events[6].Type, structs.TaskNotRestarting)
	}
}
// Run is a long running goroutine used to manage an allocation
func (r *AllocRunner) Run() {
	defer close(r.waitCh)
	go r.dirtySyncState()

	// Find the task group to run in the allocation
	alloc := r.alloc
	tg := alloc.Job.LookupTaskGroup(alloc.TaskGroup)
	if tg == nil {
		r.logger.Printf("[ERR] client: alloc '%s' for missing task group '%s'", alloc.ID, alloc.TaskGroup)
		r.setStatus(structs.AllocClientStatusFailed, fmt.Sprintf("missing task group '%s'", alloc.TaskGroup))
		return
	}

	// Create the execution context
	r.ctxLock.Lock()
	if r.ctx == nil {
		allocDir := allocdir.NewAllocDir(filepath.Join(r.config.AllocDir, r.alloc.ID), r.Alloc().Resources.DiskMB)
		if err := allocDir.Build(tg.Tasks); err != nil {
			r.logger.Printf("[WARN] client: failed to build task directories: %v", err)
			r.setStatus(structs.AllocClientStatusFailed, fmt.Sprintf("failed to build task dirs for '%s'", alloc.TaskGroup))
			r.ctxLock.Unlock()
			return
		}
		r.ctx = driver.NewExecContext(allocDir, r.alloc.ID)
		if r.otherAllocDir != nil {
			if err := allocDir.Move(r.otherAllocDir, tg.Tasks); err != nil {
				r.logger.Printf("[ERROR] client: failed to move alloc dir into alloc %q: %v", r.alloc.ID, err)
			}
			if err := r.otherAllocDir.Destroy(); err != nil {
				r.logger.Printf("[ERROR] client: error destroying allocdir %v: %v", r.otherAllocDir.AllocDir, err)
			}
		}
	}
	r.ctxLock.Unlock()

	// Check if the allocation is in a terminal status. In this case, we don't
	// start any of the task runners and directly wait for the destroy signal to
	// clean up the allocation.
	if alloc.TerminalStatus() {
		r.logger.Printf("[DEBUG] client: alloc %q in terminal status, waiting for destroy", r.alloc.ID)
		r.handleDestroy()
		r.logger.Printf("[DEBUG] client: terminating runner for alloc '%s'", r.alloc.ID)
		return
	}

	// Start the task runners
	r.logger.Printf("[DEBUG] client: starting task runners for alloc '%s'", r.alloc.ID)
	r.taskLock.Lock()
	for _, task := range tg.Tasks {
		if _, ok := r.restored[task.Name]; ok {
			continue
		}

		tr := NewTaskRunner(r.logger, r.config, r.setTaskState, r.ctx, r.Alloc(), task.Copy(), r.vaultClient)
		r.tasks[task.Name] = tr
		tr.MarkReceived()
		go tr.Run()
	}
	r.taskLock.Unlock()

	// Start watching the shared allocation directory for disk usage
	go r.ctx.AllocDir.StartDiskWatcher()

	watchdog := time.NewTicker(watchdogInterval)
	defer watchdog.Stop()

	// taskDestroyEvent contains an event that caused the destruction of a task
	// in the allocation.
	var taskDestroyEvent *structs.TaskEvent

OUTER:
	// Wait for updates
	for {
		select {
		case update := <-r.updateCh:
			// Store the updated allocation.
			r.allocLock.Lock()
			r.alloc = update
			r.allocLock.Unlock()

			// Check if we're in a terminal status
			if update.TerminalStatus() {
				taskDestroyEvent = structs.NewTaskEvent(structs.TaskKilled)
				break OUTER
			}

			// Update the task groups
			runners := r.getTaskRunners()
			for _, tr := range runners {
				tr.Update(update)
			}
		case <-watchdog.C:
			if event, desc := r.checkResources(); event != nil {
				r.setStatus(structs.AllocClientStatusFailed, desc)
				taskDestroyEvent = event
				break OUTER
			}
		case <-r.destroyCh:
			taskDestroyEvent = structs.NewTaskEvent(structs.TaskKilled)
			break OUTER
		}
	}

	// Kill the task runners
	r.destroyTaskRunners(taskDestroyEvent)

	// Stop watching the shared allocation directory
	r.ctx.AllocDir.StopDiskWatcher()

	// Block until we should destroy the state of the alloc
	r.handleDestroy()
	r.logger.Printf("[DEBUG] client: terminating runner for alloc '%s'", r.alloc.ID)
}
func TestTaskRunner_RestartTask(t *testing.T) {
	alloc := mock.Alloc()
	task := alloc.Job.TaskGroups[0].Tasks[0]
	task.Driver = "mock_driver"
	task.Config = map[string]interface{}{
		"exit_code": "0",
		"run_for":   "10s",
	}

	upd, tr := testTaskRunnerFromAlloc(true, alloc)
	tr.MarkReceived()
	go tr.Run()
	defer tr.Destroy(structs.NewTaskEvent(structs.TaskKilled))
	defer tr.ctx.AllocDir.Destroy()

	go func() {
		time.Sleep(time.Duration(testutil.TestMultiplier()*300) * time.Millisecond)
		tr.Restart("test", "restart")
		time.Sleep(time.Duration(testutil.TestMultiplier()*300) * time.Millisecond)
		tr.Kill("test", "restart", false)
	}()

	select {
	case <-tr.WaitCh():
	case <-time.After(time.Duration(testutil.TestMultiplier()*15) * time.Second):
		t.Fatalf("timeout")
	}

	if len(upd.events) != 9 {
		t.Fatalf("should have 9 updates: %#v", upd.events)
	}

	if upd.state != structs.TaskStateDead {
		t.Fatalf("TaskState %v; want %v", upd.state, structs.TaskStateDead)
	}

	if upd.events[0].Type != structs.TaskReceived {
		t.Fatalf("First Event was %v; want %v", upd.events[0].Type, structs.TaskReceived)
	}

	if upd.events[1].Type != structs.TaskStarted {
		t.Fatalf("Second Event was %v; want %v", upd.events[1].Type, structs.TaskStarted)
	}

	if upd.events[2].Type != structs.TaskRestartSignal {
		t.Fatalf("Third Event was %v; want %v", upd.events[2].Type, structs.TaskRestartSignal)
	}

	if upd.events[3].Type != structs.TaskKilling {
		t.Fatalf("Fourth Event was %v; want %v", upd.events[3].Type, structs.TaskKilling)
	}

	if upd.events[4].Type != structs.TaskKilled {
		t.Fatalf("Fifth Event was %v; want %v", upd.events[4].Type, structs.TaskKilled)
	}

	t.Logf("%+v", upd.events[5])
	if upd.events[5].Type != structs.TaskRestarting {
		t.Fatalf("Sixth Event was %v; want %v", upd.events[5].Type, structs.TaskRestarting)
	}

	if upd.events[6].Type != structs.TaskStarted {
		t.Fatalf("Seventh Event was %v; want %v", upd.events[6].Type, structs.TaskStarted)
	}

	if upd.events[7].Type != structs.TaskKilling {
		t.Fatalf("Eighth Event was %v; want %v", upd.events[7].Type, structs.TaskKilling)
	}

	if upd.events[8].Type != structs.TaskKilled {
		t.Fatalf("Ninth Event was %v; want %v", upd.events[8].Type, structs.TaskKilled)
	}
}
func (r *TaskRunner) run() {
	// Predeclare things so we can jump to the RESTART
	var handleEmpty bool

	for {
		// Download the task's artifacts
		if !r.artifactsDownloaded && len(r.task.Artifacts) > 0 {
			r.setState(structs.TaskStatePending, structs.NewTaskEvent(structs.TaskDownloadingArtifacts))
			taskDir, ok := r.ctx.AllocDir.TaskDirs[r.task.Name]
			if !ok {
				err := fmt.Errorf("task directory couldn't be found")
				r.setState(structs.TaskStateDead, structs.NewTaskEvent(structs.TaskDriverFailure).SetDriverError(err))
				r.logger.Printf("[ERR] client: task directory for alloc %q task %q couldn't be found", r.alloc.ID, r.task.Name)
				r.restartTracker.SetStartError(err)
				goto RESTART
			}

			for _, artifact := range r.task.Artifacts {
				if err := getter.GetArtifact(r.taskEnv, artifact, taskDir, r.logger); err != nil {
					r.setState(structs.TaskStateDead,
						structs.NewTaskEvent(structs.TaskArtifactDownloadFailed).SetDownloadError(err))
					r.restartTracker.SetStartError(cstructs.NewRecoverableError(err, true))
					goto RESTART
				}
			}

			r.artifactsDownloaded = true
		}

		// Start the task if not yet started or it is being forced. This logic
		// is necessary because in the case of a restore the handle already
		// exists.
		r.handleLock.Lock()
		handleEmpty = r.handle == nil
		r.handleLock.Unlock()

		if handleEmpty {
			startErr := r.startTask()
			r.restartTracker.SetStartError(startErr)
			if startErr != nil {
				r.setState(structs.TaskStateDead, structs.NewTaskEvent(structs.TaskDriverFailure).SetDriverError(startErr))
				goto RESTART
			}
		}

		// Mark the task as started
		r.setState(structs.TaskStateRunning, structs.NewTaskEvent(structs.TaskStarted))

		// Wait for updates
	WAIT:
		for {
			select {
			case waitRes := <-r.handle.WaitCh():
				if waitRes == nil {
					panic("nil wait")
				}

				// Log whether the task was successful or not.
				r.restartTracker.SetWaitResult(waitRes)
				r.setState(structs.TaskStateDead, r.waitErrorToEvent(waitRes))
				if !waitRes.Successful() {
					r.logger.Printf("[INFO] client: task %q for alloc %q failed: %v", r.task.Name, r.alloc.ID, waitRes)
				} else {
					r.logger.Printf("[INFO] client: task %q for alloc %q completed successfully", r.task.Name, r.alloc.ID)
				}

				break WAIT
			case update := <-r.updateCh:
				if err := r.handleUpdate(update); err != nil {
					r.logger.Printf("[ERR] client: update to task %q failed: %v", r.task.Name, err)
				}
			case <-r.destroyCh:
				// Kill the task using an exponential backoff in case of failures.
				destroySuccess, err := r.handleDestroy()
				if !destroySuccess {
					// We couldn't successfully destroy the resource created.
					r.logger.Printf("[ERR] client: failed to kill task %q. Resources may have been leaked: %v", r.task.Name, err)
				}

				// Store that the task has been destroyed and any associated error.
				r.setState(structs.TaskStateDead, structs.NewTaskEvent(structs.TaskKilled).SetKillError(err))
				return
			}
		}

	RESTART:
		state, when := r.restartTracker.GetState()
		r.restartTracker.SetStartError(nil).SetWaitResult(nil)
		reason := r.restartTracker.GetReason()
		switch state {
		case structs.TaskNotRestarting, structs.TaskTerminated:
			r.logger.Printf("[INFO] client: Not restarting task: %v for alloc: %v ", r.task.Name, r.alloc.ID)
			if state == structs.TaskNotRestarting {
				r.setState(structs.TaskStateDead,
					structs.NewTaskEvent(structs.TaskNotRestarting).
						SetRestartReason(reason))
			}
			return
		case structs.TaskRestarting:
			r.logger.Printf("[INFO] client: Restarting task %q for alloc %q in %v", r.task.Name, r.alloc.ID, when)
			r.setState(structs.TaskStatePending,
				structs.NewTaskEvent(structs.TaskRestarting).
					SetRestartDelay(when).
					SetRestartReason(reason))
		default:
			r.logger.Printf("[ERR] client: restart tracker returned unknown state: %q", state)
			return
		}

		// Sleep but watch for destroy events.
		select {
		case <-time.After(when):
		case <-r.destroyCh:
		}

		// Destroyed while we were waiting to restart, so abort.
		r.destroyLock.Lock()
		destroyed := r.destroy
		r.destroyLock.Unlock()
		if destroyed {
			r.logger.Printf("[DEBUG] client: Not restarting task: %v because it's destroyed by user", r.task.Name)
			r.setState(structs.TaskStateDead, structs.NewTaskEvent(structs.TaskKilled))
			return
		}

		// Clear the handle so a new driver will be created.
		r.handleLock.Lock()
		r.handle = nil
		r.handleLock.Unlock()
	}
}
func (r *TaskRunner) run() {
	var forceStart bool
	for {
		// Start the task if not yet started or it is being forced.
		if r.handle == nil || forceStart {
			forceStart = false
			if err := r.startTask(); err != nil {
				return
			}
		}

		// Store the errors that caused us to stop waiting for updates.
		var waitRes *cstructs.WaitResult
		var destroyErr error
		destroyed := false

		// Register the services defined by the task with Consul
		r.consulService.Register(r.task, r.alloc)

	OUTER:
		// Wait for updates
		for {
			select {
			case waitRes = <-r.handle.WaitCh():
				break OUTER
			case update := <-r.updateCh:
				if err := r.handleUpdate(update); err != nil {
					r.logger.Printf("[ERR] client: update to task %q failed: %v", r.task.Name, err)
				}
			case <-r.destroyCh:
				// Avoid destroying twice
				if destroyed {
					continue
				}

				// Send the kill signal, and use the WaitCh to block until complete
				if err := r.handle.Kill(); err != nil {
					r.logger.Printf("[ERR] client: failed to kill task '%s' for alloc '%s': %v", r.task.Name, r.alloc.ID, err)
					destroyErr = err
				}
				destroyed = true
			}
		}

		// De-register the services belonging to the task from Consul
		r.consulService.Deregister(r.task, r.alloc)

		// If the user destroyed the task, we do not attempt to do any restarts.
		if destroyed {
			r.setState(structs.TaskStateDead, structs.NewTaskEvent(structs.TaskKilled).SetKillError(destroyErr))
			return
		}

		// Log whether the task was successful or not.
		if !waitRes.Successful() {
			r.logger.Printf("[ERR] client: failed to complete task '%s' for alloc '%s': %v", r.task.Name, r.alloc.ID, waitRes)
		} else {
			r.logger.Printf("[INFO] client: completed task '%s' for alloc '%s'", r.task.Name, r.alloc.ID)
		}

		// Check if we should restart. If not mark task as dead and exit.
		shouldRestart, when := r.restartTracker.NextRestart(waitRes.ExitCode)
		waitEvent := r.waitErrorToEvent(waitRes)
		if !shouldRestart {
			r.logger.Printf("[INFO] client: Not restarting task: %v for alloc: %v ", r.task.Name, r.alloc.ID)
			r.setState(structs.TaskStateDead, waitEvent)
			return
		}

		r.logger.Printf("[INFO] client: Restarting Task: %v", r.task.Name)
		r.logger.Printf("[DEBUG] client: Sleeping for %v before restarting Task %v", when, r.task.Name)
		r.setState(structs.TaskStatePending, waitEvent)

		// Sleep but watch for destroy events.
		select {
		case <-time.After(when):
		case <-r.destroyCh:
		}

		// Destroyed while we were waiting to restart, so abort.
		r.destroyLock.Lock()
		destroyed = r.destroy
		r.destroyLock.Unlock()
		if destroyed {
			r.logger.Printf("[DEBUG] client: Not restarting task: %v because it's destroyed by user", r.task.Name)
			r.setState(structs.TaskStateDead, structs.NewTaskEvent(structs.TaskKilled))
			return
		}

		// Set force start because we are restarting the task.
		forceStart = true
	}
}
// MarkReceived marks the task as received.
func (r *TaskRunner) MarkReceived() {
	r.updater(r.task.Name, structs.TaskStatePending, structs.NewTaskEvent(structs.TaskReceived))
}
func TestTaskRunner_Template_Artifact(t *testing.T) {
	dir, err := os.Getwd()
	if err != nil {
		t.Fatalf("bad: %v", err)
	}

	ts := httptest.NewServer(http.FileServer(http.Dir(filepath.Join(dir, ".."))))
	defer ts.Close()

	alloc := mock.Alloc()
	task := alloc.Job.TaskGroups[0].Tasks[0]
	task.Driver = "mock_driver"
	task.Config = map[string]interface{}{
		"exit_code": "0",
		"run_for":   "1s",
	}

	// Create an allocation that has a task that renders a template from an
	// artifact
	f1 := "CHANGELOG.md"
	artifact := structs.TaskArtifact{
		GetterSource: fmt.Sprintf("%s/%s", ts.URL, f1),
	}
	task.Artifacts = []*structs.TaskArtifact{&artifact}
	task.Templates = []*structs.Template{
		{
			SourcePath: "CHANGELOG.md",
			DestPath:   "local/test",
			ChangeMode: structs.TemplateChangeModeNoop,
		},
	}

	upd, tr := testTaskRunnerFromAlloc(false, alloc)
	tr.MarkReceived()
	defer tr.Destroy(structs.NewTaskEvent(structs.TaskKilled))
	defer tr.ctx.AllocDir.Destroy()
	go tr.Run()

	select {
	case <-tr.WaitCh():
	case <-time.After(time.Duration(testutil.TestMultiplier()*15) * time.Second):
		t.Fatalf("timeout")
	}

	if len(upd.events) != 4 {
		t.Fatalf("should have 4 updates: %#v", upd.events)
	}

	if upd.state != structs.TaskStateDead {
		t.Fatalf("TaskState %v; want %v", upd.state, structs.TaskStateDead)
	}

	if upd.events[0].Type != structs.TaskReceived {
		t.Fatalf("First Event was %v; want %v", upd.events[0].Type, structs.TaskReceived)
	}

	if upd.events[1].Type != structs.TaskDownloadingArtifacts {
		t.Fatalf("Second Event was %v; want %v", upd.events[1].Type, structs.TaskDownloadingArtifacts)
	}

	if upd.events[2].Type != structs.TaskStarted {
		t.Fatalf("Third Event was %v; want %v", upd.events[2].Type, structs.TaskStarted)
	}

	if upd.events[3].Type != structs.TaskTerminated {
		t.Fatalf("Fourth Event was %v; want %v", upd.events[3].Type, structs.TaskTerminated)
	}

	// Check that both files exist.
	taskDir := tr.ctx.AllocDir.TaskDirs[task.Name]
	if _, err := os.Stat(filepath.Join(taskDir, f1)); err != nil {
		t.Fatalf("%v not downloaded", f1)
	}
	if _, err := os.Stat(filepath.Join(taskDir, allocdir.TaskLocal, "test")); err != nil {
		t.Fatalf("template not rendered")
	}
}
func TestTaskRunner_Template_Block(t *testing.T) {
	alloc := mock.Alloc()
	task := alloc.Job.TaskGroups[0].Tasks[0]
	task.Driver = "mock_driver"
	task.Config = map[string]interface{}{
		"exit_code": "0",
		"run_for":   "1s",
	}
	task.Templates = []*structs.Template{
		{
			EmbeddedTmpl: "{{key \"foo\"}}",
			DestPath:     "local/test",
			ChangeMode:   structs.TemplateChangeModeNoop,
		},
	}

	upd, tr := testTaskRunnerFromAlloc(false, alloc)
	tr.MarkReceived()
	defer tr.Destroy(structs.NewTaskEvent(structs.TaskKilled))
	defer tr.ctx.AllocDir.Destroy()
	go tr.Run()

	select {
	case <-tr.WaitCh():
		t.Fatalf("premature exit")
	case <-time.After(1 * time.Second):
	}

	if len(upd.events) != 1 {
		t.Fatalf("should have 1 update: %#v", upd.events)
	}

	if upd.state != structs.TaskStatePending {
		t.Fatalf("TaskState %v; want %v", upd.state, structs.TaskStatePending)
	}

	if upd.events[0].Type != structs.TaskReceived {
		t.Fatalf("First Event was %v; want %v", upd.events[0].Type, structs.TaskReceived)
	}

	// Unblock
	tr.UnblockStart("test")

	select {
	case <-tr.WaitCh():
	case <-time.After(time.Duration(testutil.TestMultiplier()*15) * time.Second):
		t.Fatalf("timeout")
	}

	if len(upd.events) != 3 {
		t.Fatalf("should have 3 updates: %#v", upd.events)
	}

	if upd.state != structs.TaskStateDead {
		t.Fatalf("TaskState %v; want %v", upd.state, structs.TaskStateDead)
	}

	if upd.events[0].Type != structs.TaskReceived {
		t.Fatalf("First Event was %v; want %v", upd.events[0].Type, structs.TaskReceived)
	}

	if upd.events[1].Type != structs.TaskStarted {
		t.Fatalf("Second Event was %v; want %v", upd.events[1].Type, structs.TaskStarted)
	}

	if upd.events[2].Type != structs.TaskTerminated {
		t.Fatalf("Third Event was %v; want %v", upd.events[2].Type, structs.TaskTerminated)
	}
}
// Helper function for converting a WaitResult into a TaskTerminated event.
func (r *TaskRunner) waitErrorToEvent(res *cstructs.WaitResult) *structs.TaskEvent {
	return structs.NewTaskEvent(structs.TaskTerminated).
		SetExitCode(res.ExitCode).
		SetSignal(res.Signal).
		SetExitMessage(res.Err)
}
func TestTaskRunner_DeriveToken_Retry(t *testing.T) {
	alloc := mock.Alloc()
	task := alloc.Job.TaskGroups[0].Tasks[0]
	task.Driver = "mock_driver"
	task.Config = map[string]interface{}{
		"exit_code": "0",
		"run_for":   "1s",
	}
	task.Vault = &structs.Vault{Policies: []string{"default"}}

	upd, tr := testTaskRunnerFromAlloc(false, alloc)
	tr.MarkReceived()
	defer tr.Destroy(structs.NewTaskEvent(structs.TaskKilled))
	defer tr.ctx.AllocDir.Destroy()

	// Control when we get a Vault token
	token := "1234"
	count := 0
	handler := func(*structs.Allocation, []string) (map[string]string, error) {
		if count > 0 {
			return map[string]string{task.Name: token}, nil
		}

		count++
		return nil, structs.NewRecoverableError(fmt.Errorf("Want a retry"), true)
	}
	tr.vaultClient.(*vaultclient.MockVaultClient).DeriveTokenFn = handler
	go tr.Run()

	select {
	case <-tr.WaitCh():
	case <-time.After(time.Duration(testutil.TestMultiplier()*15) * time.Second):
		t.Fatalf("timeout")
	}

	if len(upd.events) != 3 {
		t.Fatalf("should have 3 updates: %#v", upd.events)
	}

	if upd.state != structs.TaskStateDead {
		t.Fatalf("TaskState %v; want %v", upd.state, structs.TaskStateDead)
	}

	if upd.events[0].Type != structs.TaskReceived {
		t.Fatalf("First Event was %v; want %v", upd.events[0].Type, structs.TaskReceived)
	}

	if upd.events[1].Type != structs.TaskStarted {
		t.Fatalf("Second Event was %v; want %v", upd.events[1].Type, structs.TaskStarted)
	}

	if upd.events[2].Type != structs.TaskTerminated {
		t.Fatalf("Third Event was %v; want %v", upd.events[2].Type, structs.TaskTerminated)
	}

	// Check that the token is on disk
	secretDir, err := tr.ctx.AllocDir.GetSecretDir(task.Name)
	if err != nil {
		t.Fatalf("failed to determine task %s secret dir: %v", task.Name, err)
	}

	// Read the token from the file system
	tokenPath := filepath.Join(secretDir, vaultTokenFile)
	data, err := ioutil.ReadFile(tokenPath)
	if err != nil {
		t.Fatalf("Failed to read file: %v", err)
	}

	if act := string(data); act != token {
		t.Fatalf("Token didn't get written to disk properly, got %q; want %q", act, token)
	}
}