Example #1
0
// killTask kills the running task. A killing event can optionally be passed and
// this event is used to mark the task as being killed. It provides a means to
// store extra information.
func (r *TaskRunner) killTask(killingEvent *structs.TaskEvent) {
	r.runningLock.Lock()
	running := r.running
	r.runningLock.Unlock()
	if !running {
		return
	}

	// Get the kill timeout
	timeout := driver.GetKillTimeout(r.task.KillTimeout, r.config.MaxKillTimeout)

	// Build the event
	var event *structs.TaskEvent
	if killingEvent != nil {
		event = killingEvent
		event.Type = structs.TaskKilling
	} else {
		event = structs.NewTaskEvent(structs.TaskKilling)
	}
	event.SetKillTimeout(timeout)

	// Mark that we received the kill event
	r.setState(structs.TaskStateRunning, event)

	// Kill the task using an exponential backoff in-case of failures.
	destroySuccess, err := r.handleDestroy()
	if !destroySuccess {
		// We couldn't successfully destroy the resource created.
		r.logger.Printf("[ERR] client: failed to kill task %q. Resources may have been leaked: %v", r.task.Name, err)
	}

	r.runningLock.Lock()
	r.running = false
	r.runningLock.Unlock()

	// Store that the task has been destroyed and any associated error.
	r.setState("", structs.NewTaskEvent(structs.TaskKilled).SetKillError(err))
}
Example #2
0
func (r *TaskRunner) run() {
	// Predeclare things so we can jump to the RESTART
	var handleEmpty bool
	var stopCollection chan struct{}

	for {
		// Download the task's artifacts
		if !r.artifactsDownloaded && len(r.task.Artifacts) > 0 {
			r.setState(structs.TaskStatePending, structs.NewTaskEvent(structs.TaskDownloadingArtifacts))
			taskDir, ok := r.ctx.AllocDir.TaskDirs[r.task.Name]
			if !ok {
				err := fmt.Errorf("task directory couldn't be found")
				r.setState(structs.TaskStateDead, structs.NewTaskEvent(structs.TaskDriverFailure).SetDriverError(err))
				r.logger.Printf("[ERR] client: task directory for alloc %q task %q couldn't be found", r.alloc.ID, r.task.Name)
				r.restartTracker.SetStartError(err)
				goto RESTART
			}

			for _, artifact := range r.task.Artifacts {
				if err := getter.GetArtifact(r.taskEnv, artifact, taskDir); err != nil {
					r.setState(structs.TaskStatePending,
						structs.NewTaskEvent(structs.TaskArtifactDownloadFailed).SetDownloadError(err))
					r.restartTracker.SetStartError(dstructs.NewRecoverableError(err, true))
					goto RESTART
				}
			}

			r.artifactsDownloaded = true
		}

		// Start the task if not yet started or it is being forced. This logic
		// is necessary because in the case of a restore the handle already
		// exists.
		r.handleLock.Lock()
		handleEmpty = r.handle == nil
		r.handleLock.Unlock()

		if handleEmpty {
			startErr := r.startTask()
			r.restartTracker.SetStartError(startErr)
			if startErr != nil {
				r.setState(structs.TaskStateDead, structs.NewTaskEvent(structs.TaskDriverFailure).SetDriverError(startErr))
				goto RESTART
			}

			// Mark the task as started
			r.setState(structs.TaskStateRunning, structs.NewTaskEvent(structs.TaskStarted))
			r.runningLock.Lock()
			r.running = true
			r.runningLock.Unlock()
		}

		if stopCollection == nil {
			stopCollection = make(chan struct{})
			go r.collectResourceUsageStats(stopCollection)
		}

		// Wait for updates
	WAIT:
		for {
			select {
			case waitRes := <-r.handle.WaitCh():
				if waitRes == nil {
					panic("nil wait")
				}

				r.runningLock.Lock()
				r.running = false
				r.runningLock.Unlock()

				// Stop collection of the task's resource usage
				close(stopCollection)

				// Log whether the task was successful or not.
				r.restartTracker.SetWaitResult(waitRes)
				r.setState(structs.TaskStateDead, r.waitErrorToEvent(waitRes))
				if !waitRes.Successful() {
					r.logger.Printf("[INFO] client: task %q for alloc %q failed: %v", r.task.Name, r.alloc.ID, waitRes)
				} else {
					r.logger.Printf("[INFO] client: task %q for alloc %q completed successfully", r.task.Name, r.alloc.ID)
				}

				break WAIT
			case update := <-r.updateCh:
				if err := r.handleUpdate(update); err != nil {
					r.logger.Printf("[ERR] client: update to task %q failed: %v", r.task.Name, err)
				}
			case <-r.destroyCh:
				// Mark that we received the kill event
				timeout := driver.GetKillTimeout(r.task.KillTimeout, r.config.MaxKillTimeout)
				r.setState(structs.TaskStateRunning,
					structs.NewTaskEvent(structs.TaskKilling).SetKillTimeout(timeout))

				// Kill the task using an exponential backoff in-case of failures.
				destroySuccess, err := r.handleDestroy()
				if !destroySuccess {
					// We couldn't successfully destroy the resource created.
					r.logger.Printf("[ERR] client: failed to kill task %q. Resources may have been leaked: %v", r.task.Name, err)
				}

				// Stop collection of the task's resource usage
				close(stopCollection)

				// Store that the task has been destroyed and any associated error.
				r.setState(structs.TaskStateDead, structs.NewTaskEvent(structs.TaskKilled).SetKillError(err))

				// Store the task event that provides context on the task destroy.
				if r.destroyEvent.Type != structs.TaskKilled {
					r.setState(structs.TaskStateDead, r.destroyEvent)
				}

				r.runningLock.Lock()
				r.running = false
				r.runningLock.Unlock()

				return
			}
		}

	RESTART:
		state, when := r.restartTracker.GetState()
		r.restartTracker.SetStartError(nil).SetWaitResult(nil)
		reason := r.restartTracker.GetReason()
		switch state {
		case structs.TaskNotRestarting, structs.TaskTerminated:
			r.logger.Printf("[INFO] client: Not restarting task: %v for alloc: %v ", r.task.Name, r.alloc.ID)
			if state == structs.TaskNotRestarting {
				r.setState(structs.TaskStateDead,
					structs.NewTaskEvent(structs.TaskNotRestarting).
						SetRestartReason(reason))
			}
			return
		case structs.TaskRestarting:
			r.logger.Printf("[INFO] client: Restarting task %q for alloc %q in %v", r.task.Name, r.alloc.ID, when)
			r.setState(structs.TaskStatePending,
				structs.NewTaskEvent(structs.TaskRestarting).
					SetRestartDelay(when).
					SetRestartReason(reason))
		default:
			r.logger.Printf("[ERR] client: restart tracker returned unknown state: %q", state)
			return
		}

		// Sleep but watch for destroy events.
		select {
		case <-time.After(when):
		case <-r.destroyCh:
		}

		// Destroyed while we were waiting to restart, so abort.
		r.destroyLock.Lock()
		destroyed := r.destroy
		r.destroyLock.Unlock()
		if destroyed {
			r.logger.Printf("[DEBUG] client: Not restarting task: %v because it has been destroyed due to: %s", r.task.Name, r.destroyEvent.Message)
			r.setState(structs.TaskStateDead, r.destroyEvent)
			return
		}

		// Clear the handle so a new driver will be created.
		r.handleLock.Lock()
		r.handle = nil
		stopCollection = nil
		r.handleLock.Unlock()
	}
}