Example #1
// startTask creates the driver and starts the task.
func (r *TaskRunner) startTask() error {
	// Create a driver
	driver, err := r.createDriver()
	if err != nil {
		return fmt.Errorf("failed to create driver of task '%s' for alloc '%s': %v",
			r.task.Name, r.alloc.ID, err)
	}

	// Start the task
	handle, err := driver.Start(r.ctx, r.task)
	if err != nil {
		wrapped := fmt.Errorf("failed to start task '%s' for alloc '%s': %v",
			r.task.Name, r.alloc.ID, err)

		r.logger.Printf("[INFO] client: %v", wrapped)

		if rerr, ok := err.(*structs.RecoverableError); ok {
			return structs.NewRecoverableError(wrapped, rerr.Recoverable)
		}

		return wrapped

	}

	r.handleLock.Lock()
	r.handle = handle
	r.handleLock.Unlock()
	return nil
}
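All of these examples center on the structs.RecoverableError wrapper. A minimal sketch of that type, inferred only from the NewRecoverableError constructor and the Recoverable field used in the examples (the real nomad/structs implementation may differ):

// RecoverableError wraps an error and records whether retrying the operation
// may succeed. Sketch only; everything beyond Recoverable is an assumption.
type RecoverableError struct {
	Err         string
	Recoverable bool
}

// NewRecoverableError wraps e and marks whether it is recoverable.
func NewRecoverableError(e error, recoverable bool) *RecoverableError {
	return &RecoverableError{Err: e.Error(), Recoverable: recoverable}
}

// Error implements the error interface.
func (r *RecoverableError) Error() string {
	return r.Err
}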
Example #2
// recoverablePullError wraps the error received when trying to pull an image
// if the error is recoverable.
func (d *DockerDriver) recoverablePullError(err error, image string) error {
	recoverable := true
	if imageNotFoundMatcher.MatchString(err.Error()) {
		recoverable = false
	}
	return structs.NewRecoverableError(fmt.Errorf("Failed to pull `%s`: %s", image, err), recoverable)
}
Example #3
func TestClient_RestartTracker_StartError_Recoverable_Fail(t *testing.T) {
	t.Parallel()
	p := testPolicy(true, structs.RestartPolicyModeFail)
	rt := newRestartTracker(p, structs.JobTypeSystem)
	recErr := structs.NewRecoverableError(fmt.Errorf("foo"), true)
	for i := 0; i < p.Attempts; i++ {
		state, when := rt.SetStartError(recErr).GetState()
		if state != structs.TaskRestarting {
			t.Fatalf("NextRestart() returned %v, want %v", state, structs.TaskRestarting)
		}
		if !withinJitter(p.Delay, when) {
			t.Fatalf("NextRestart() returned %v; want %v+jitter", when, p.Delay)
		}
	}

	// Next restart should cause fail
	if state, _ := rt.SetStartError(recErr).GetState(); state != structs.TaskNotRestarting {
		t.Fatalf("NextRestart() returned %v; want %v", state, structs.TaskNotRestarting)
	}
}
Example #4
// Start starts the mock driver
func (m *MockDriver) Start(ctx *ExecContext, task *structs.Task) (DriverHandle, error) {
	var driverConfig MockDriverConfig
	dec, err := mapstructure.NewDecoder(&mapstructure.DecoderConfig{
		DecodeHook:       mapstructure.StringToTimeDurationHookFunc(),
		WeaklyTypedInput: true,
		Result:           &driverConfig,
	})
	if err != nil {
		return nil, err
	}
	if err := dec.Decode(task.Config); err != nil {
		return nil, err
	}

	if driverConfig.StartErr != "" {
		return nil, structs.NewRecoverableError(errors.New(driverConfig.StartErr), driverConfig.StartErrRecoverable)
	}

	h := mockDriverHandle{
		taskName:    task.Name,
		runFor:      driverConfig.RunFor,
		killAfter:   driverConfig.KillAfter,
		killTimeout: task.KillTimeout,
		exitCode:    driverConfig.ExitCode,
		exitSignal:  driverConfig.ExitSignal,
		logger:      m.logger,
		doneCh:      make(chan struct{}),
		waitCh:      make(chan *dstructs.WaitResult, 1),
	}
	if driverConfig.ExitErrMsg != "" {
		h.exitErr = errors.New(driverConfig.ExitErrMsg)
	}
	if driverConfig.SignalErr != "" {
		h.signalErr = errors.New(driverConfig.SignalErr)
	}
	m.logger.Printf("[DEBUG] driver.mock: starting task %q", task.Name)
	go h.run()
	return &h, nil
}
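For tests, the mock driver's start-error fields shown above can be driven through the task's Config map. A hedged sketch; the start_error and start_error_recoverable keys are assumed mapstructure tags for MockDriverConfig and are not shown in this excerpt:

// Hypothetical task configuration: with these (assumed) keys, Start decodes
// StartErr/StartErrRecoverable and returns a RecoverableError.
task.Config = map[string]interface{}{
	"start_error":             "want a retry",
	"start_error_recoverable": true,
	"run_for":                 "1s",
}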
Example #5
// LookupToken takes a Vault token and does a lookup against Vault. The call is
// rate limited and may be canceled with the passed context.
func (v *vaultClient) LookupToken(ctx context.Context, token string) (*vapi.Secret, error) {
	if !v.Enabled() {
		return nil, fmt.Errorf("Vault integration disabled")
	}

	if !v.Active() {
		return nil, fmt.Errorf("Vault client not active")
	}

	// Check if we have established a connection with Vault
	if established, err := v.ConnectionEstablished(); !established && err == nil {
		return nil, structs.NewRecoverableError(fmt.Errorf("Connection to Vault has not been established"), true)
	} else if !established {
		return nil, fmt.Errorf("Connection to Vault failed: %v", err)
	}

	// Ensure we are under our rate limit
	if err := v.limiter.Wait(ctx); err != nil {
		return nil, err
	}

	// Lookup the token
	return v.auth.Lookup(token)
}
Example #6
func TestTaskRunner_DeriveToken_Retry(t *testing.T) {
	alloc := mock.Alloc()
	task := alloc.Job.TaskGroups[0].Tasks[0]
	task.Driver = "mock_driver"
	task.Config = map[string]interface{}{
		"exit_code": "0",
		"run_for":   "1s",
	}
	task.Vault = &structs.Vault{Policies: []string{"default"}}

	upd, tr := testTaskRunnerFromAlloc(false, alloc)
	tr.MarkReceived()
	defer tr.Destroy(structs.NewTaskEvent(structs.TaskKilled))
	defer tr.ctx.AllocDir.Destroy()

	// Control when we get a Vault token
	token := "1234"
	count := 0
	handler := func(*structs.Allocation, []string) (map[string]string, error) {
		if count > 0 {
			return map[string]string{task.Name: token}, nil
		}

		count++
		return nil, structs.NewRecoverableError(fmt.Errorf("Want a retry"), true)
	}
	tr.vaultClient.(*vaultclient.MockVaultClient).DeriveTokenFn = handler
	go tr.Run()

	select {
	case <-tr.WaitCh():
	case <-time.After(time.Duration(testutil.TestMultiplier()*15) * time.Second):
		t.Fatalf("timeout")
	}

	if len(upd.events) != 3 {
		t.Fatalf("should have 3 updates: %#v", upd.events)
	}

	if upd.state != structs.TaskStateDead {
		t.Fatalf("TaskState %v; want %v", upd.state, structs.TaskStateDead)
	}

	if upd.events[0].Type != structs.TaskReceived {
		t.Fatalf("First Event was %v; want %v", upd.events[0].Type, structs.TaskReceived)
	}

	if upd.events[1].Type != structs.TaskStarted {
		t.Fatalf("Second Event was %v; want %v", upd.events[1].Type, structs.TaskStarted)
	}

	if upd.events[2].Type != structs.TaskTerminated {
		t.Fatalf("Third Event was %v; want %v", upd.events[2].Type, structs.TaskTerminated)
	}

	// Check that the token is on disk
	secretDir, err := tr.ctx.AllocDir.GetSecretDir(task.Name)
	if err != nil {
		t.Fatalf("failed to determine task %s secret dir: %v", err)
	}

	// Read the token from the file system
	tokenPath := filepath.Join(secretDir, vaultTokenFile)
	data, err := ioutil.ReadFile(tokenPath)
	if err != nil {
		t.Fatalf("Failed to read file: %v", err)
	}

	if act := string(data); act != token {
		t.Fatalf("Token didn't get written to disk properly, got %q; want %q", act, token)
	}
}
Example #7
// parallelRevoke revokes the passed VaultAccessors in parallel.
func (v *vaultClient) parallelRevoke(ctx context.Context, accessors []*structs.VaultAccessor) error {
	if !v.Enabled() {
		return fmt.Errorf("Vault integration disabled")
	}

	if !v.Active() {
		return fmt.Errorf("Vault client not active")
	}

	// Check if we have established a connection with Vault
	if established, err := v.ConnectionEstablished(); !established && err == nil {
		return structs.NewRecoverableError(fmt.Errorf("Connection to Vault has not been established"), true)
	} else if !established {
		return fmt.Errorf("Connection to Vault failed: %v", err)
	}

	g, pCtx := errgroup.WithContext(ctx)

	// Cap the handlers
	handlers := len(accessors)
	if handlers > maxParallelRevokes {
		handlers = maxParallelRevokes
	}

	// Revoke the accessors in parallel
	input := make(chan *structs.VaultAccessor, handlers)
	for i := 0; i < handlers; i++ {
		g.Go(func() error {
			for {
				select {
				case va, ok := <-input:
					if !ok {
						return nil
					}

					if err := v.auth.RevokeAccessor(va.Accessor); err != nil {
						return fmt.Errorf("failed to revoke token (alloc: %q, node: %q, task: %q): %v", va.AllocID, va.NodeID, va.Task, err)
					}
				case <-pCtx.Done():
					return nil
				}
			}
		})
	}

	// Send the input
	go func() {
		defer close(input)
		for _, va := range accessors {
			select {
			case <-pCtx.Done():
				return
			case input <- va:
			}
		}

	}()

	// Wait for everything to complete
	return g.Wait()
}
Example #8
// CreateToken takes the allocation and task and returns an appropriate Vault
// token. The call is rate limited and may be canceled with the passed context.
// When the error is recoverable, it will be of type RecoverableError.
func (v *vaultClient) CreateToken(ctx context.Context, a *structs.Allocation, task string) (*vapi.Secret, error) {
	if !v.Enabled() {
		return nil, fmt.Errorf("Vault integration disabled")
	}

	if !v.Active() {
		return nil, structs.NewRecoverableError(fmt.Errorf("Vault client not active"), true)
	}

	// Check if we have established a connection with Vault
	if established, err := v.ConnectionEstablished(); !established && err == nil {
		return nil, structs.NewRecoverableError(fmt.Errorf("Connection to Vault has not been established"), true)
	} else if !established {
		return nil, fmt.Errorf("Connection to Vault failed: %v", err)
	}

	// Retrieve the Vault block for the task
	policies := a.Job.VaultPolicies()
	if policies == nil {
		return nil, fmt.Errorf("Job doesn't require Vault policies")
	}
	tg, ok := policies[a.TaskGroup]
	if !ok {
		return nil, fmt.Errorf("Task group does not require Vault policies")
	}
	taskVault, ok := tg[task]
	if !ok {
		return nil, fmt.Errorf("Task does not require Vault policies")
	}

	// Build the creation request
	req := &vapi.TokenCreateRequest{
		Policies: taskVault.Policies,
		Metadata: map[string]string{
			"AllocationID": a.ID,
			"Task":         task,
			"NodeID":       a.NodeID,
		},
		TTL:         v.childTTL,
		DisplayName: fmt.Sprintf("%s-%s", a.ID, task),
	}

	// Ensure we are under our rate limit
	if err := v.limiter.Wait(ctx); err != nil {
		return nil, err
	}

	// Make the request and switch depending on whether we are using a root
	// token or a role based token
	var secret *vapi.Secret
	var err error
	if v.tokenData.Root {
		req.Period = v.childTTL
		secret, err = v.auth.Create(req)
	} else {
		// Make the token using the role
		secret, err = v.auth.CreateWithRole(req, v.tokenData.Role)
	}

	// Determine whether it is unrecoverable
	if err != nil {
		if vaultUnrecoverableError.MatchString(err.Error()) {
			return secret, err
		}

		// The error is recoverable
		return nil, structs.NewRecoverableError(err, true)
	}

	return secret, nil
}
Example #9
// prestart handles life-cycle tasks that occur before the task has started.
func (r *TaskRunner) prestart(resultCh chan bool) {

	if r.task.Vault != nil {
		// Wait for the token
		r.logger.Printf("[DEBUG] client: waiting for Vault token for task %v in alloc %q", r.task.Name, r.alloc.ID)
		tokenCh := r.vaultFuture.Wait()
		select {
		case <-tokenCh:
		case <-r.waitCh:
			resultCh <- false
			return
		}
		r.logger.Printf("[DEBUG] client: retrieved Vault token for task %v in alloc %q", r.task.Name, r.alloc.ID)
	}

	if err := r.setTaskEnv(); err != nil {
		r.setState(
			structs.TaskStateDead,
			structs.NewTaskEvent(structs.TaskSetupFailure).SetSetupError(err).SetFailsTask())
		resultCh <- false
		return
	}

	for {
		// Download the task's artifacts
		if !r.artifactsDownloaded && len(r.task.Artifacts) > 0 {
			r.setState(structs.TaskStatePending, structs.NewTaskEvent(structs.TaskDownloadingArtifacts))
			for _, artifact := range r.task.Artifacts {
				if err := getter.GetArtifact(r.getTaskEnv(), artifact, r.taskDir); err != nil {
					wrapped := fmt.Errorf("failed to download artifact %q: %v", artifact.GetterSource, err)
					r.setState(structs.TaskStatePending,
						structs.NewTaskEvent(structs.TaskArtifactDownloadFailed).SetDownloadError(wrapped))
					r.restartTracker.SetStartError(structs.NewRecoverableError(wrapped, true))
					goto RESTART
				}
			}

			r.artifactsDownloaded = true
		}

		// We don't have to wait for any template
		if len(r.task.Templates) == 0 {
			// Send the start signal
			select {
			case r.startCh <- struct{}{}:
			default:
			}

			resultCh <- true
			return
		}

		// Build the template manager
		if r.templateManager == nil {
			var err error
			r.templateManager, err = NewTaskTemplateManager(r, r.task.Templates,
				r.config, r.vaultFuture.Get(), r.taskDir, r.getTaskEnv())
			if err != nil {
				err := fmt.Errorf("failed to build task's template manager: %v", err)
				r.setState(structs.TaskStateDead, structs.NewTaskEvent(structs.TaskSetupFailure).SetSetupError(err).SetFailsTask())
				r.logger.Printf("[ERR] client: alloc %q, task %q %v", r.alloc.ID, r.task.Name, err)
				resultCh <- false
				return
			}
		}

		// Block for consul-template
		// TODO Hooks should register themselves as blocking and then we can
		// periodically enumerate what we are still blocked on
		select {
		case <-r.unblockCh:
			// Send the start signal
			select {
			case r.startCh <- struct{}{}:
			default:
			}

			resultCh <- true
			return
		case <-r.waitCh:
			// The run loop has exited so exit too
			resultCh <- false
			return
		}

	RESTART:
		restart := r.shouldRestart()
		if !restart {
			resultCh <- false
			return
		}
	}
}
Example #10
// DeriveVaultToken is used by the clients to request wrapped Vault tokens for
// tasks
func (n *Node) DeriveVaultToken(args *structs.DeriveVaultTokenRequest,
	reply *structs.DeriveVaultTokenResponse) error {

	// setErr is a helper for setting the recoverable error on the reply and
	// logging it
	setErr := func(e error, recoverable bool) {
		reply.Error = structs.NewRecoverableError(e, recoverable)
		n.srv.logger.Printf("[ERR] nomad.client: DeriveVaultToken failed (recoverable %v): %v", recoverable, e)
	}

	if done, err := n.srv.forward("Node.DeriveVaultToken", args, args, reply); done {
		setErr(err, err == structs.ErrNoLeader)
		return nil
	}
	defer metrics.MeasureSince([]string{"nomad", "client", "derive_vault_token"}, time.Now())

	// Verify the arguments
	if args.NodeID == "" {
		setErr(fmt.Errorf("missing node ID"), false)
		return nil
	}
	if args.SecretID == "" {
		setErr(fmt.Errorf("missing node SecretID"), false)
		return nil
	}
	if args.AllocID == "" {
		setErr(fmt.Errorf("missing allocation ID"), false)
		return nil
	}
	if len(args.Tasks) == 0 {
		setErr(fmt.Errorf("no tasks specified"), false)
		return nil
	}

	// Verify the following:
	// * The Node exists and has the correct SecretID
	// * The Allocation exists on the specified node
	// * The allocation contains the given tasks and they each require Vault
	//   tokens
	snap, err := n.srv.fsm.State().Snapshot()
	if err != nil {
		setErr(err, false)
		return nil
	}
	node, err := snap.NodeByID(args.NodeID)
	if err != nil {
		setErr(err, false)
		return nil
	}
	if node == nil {
		setErr(fmt.Errorf("Node %q does not exist", args.NodeID), false)
		return nil
	}
	if node.SecretID != args.SecretID {
		setErr(fmt.Errorf("SecretID mismatch"), false)
		return nil
	}

	alloc, err := snap.AllocByID(args.AllocID)
	if err != nil {
		setErr(err, false)
		return nil
	}
	if alloc == nil {
		setErr(fmt.Errorf("Allocation %q does not exist", args.AllocID), false)
		return nil
	}
	if alloc.NodeID != args.NodeID {
		setErr(fmt.Errorf("Allocation %q not running on Node %q", args.AllocID, args.NodeID), false)
		return nil
	}
	if alloc.TerminalStatus() {
		setErr(fmt.Errorf("Can't request Vault token for terminal allocation"), false)
		return nil
	}

	// Check the policies
	policies := alloc.Job.VaultPolicies()
	if policies == nil {
		setErr(fmt.Errorf("Job doesn't require Vault policies"), false)
		return nil
	}
	tg, ok := policies[alloc.TaskGroup]
	if !ok {
		setErr(fmt.Errorf("Task group does not require Vault policies"), false)
		return nil
	}

	var unneeded []string
	for _, task := range args.Tasks {
		taskVault := tg[task]
		if taskVault == nil || len(taskVault.Policies) == 0 {
			unneeded = append(unneeded, task)
		}
	}

	if len(unneeded) != 0 {
		e := fmt.Errorf("Requested Vault tokens for tasks without defined Vault policies: %s",
			strings.Join(unneeded, ", "))
		setErr(e, false)
		return nil
	}

	// At this point the request is valid and we should contact Vault for
	// tokens.

	// Create an error group where we will spin up a fixed set of goroutines to
	// handle deriving tokens but where if any fails the whole group is
	// canceled.
	g, ctx := errgroup.WithContext(context.Background())

	// Cap the handlers
	handlers := len(args.Tasks)
	if handlers > maxParallelRequestsPerDerive {
		handlers = maxParallelRequestsPerDerive
	}

	// Create the Vault Tokens
	input := make(chan string, handlers)
	results := make(map[string]*vapi.Secret, len(args.Tasks))
	for i := 0; i < handlers; i++ {
		g.Go(func() error {
			for {
				select {
				case task, ok := <-input:
					if !ok {
						return nil
					}

					secret, err := n.srv.vault.CreateToken(ctx, alloc, task)
					if err != nil {
						wrapped := fmt.Errorf("failed to create token for task %q: %v", task, err)
						if rerr, ok := err.(*structs.RecoverableError); ok && rerr.Recoverable {
							// If the error is recoverable, propagate it
							return structs.NewRecoverableError(wrapped, true)
						}

						return wrapped
					}

					results[task] = secret
				case <-ctx.Done():
					return nil
				}
			}
		})
	}

	// Send the input
	go func() {
		defer close(input)
		for _, task := range args.Tasks {
			select {
			case <-ctx.Done():
				return
			case input <- task:
			}
		}

	}()

	// Wait for everything to complete or for an error
	createErr := g.Wait()

	// Retrieve the results
	accessors := make([]*structs.VaultAccessor, 0, len(results))
	tokens := make(map[string]string, len(results))
	for task, secret := range results {
		w := secret.WrapInfo
		if w == nil {
			return fmt.Errorf("Vault returned Secret without WrapInfo")
		}

		tokens[task] = w.Token
		accessor := &structs.VaultAccessor{
			Accessor:    w.WrappedAccessor,
			Task:        task,
			NodeID:      alloc.NodeID,
			AllocID:     alloc.ID,
			CreationTTL: w.TTL,
		}

		accessors = append(accessors, accessor)
	}

	// If there was an error revoke the created tokens
	if createErr != nil {
		n.srv.logger.Printf("[ERR] nomad.node: Vault token creation failed: %v", createErr)

		if revokeErr := n.srv.vault.RevokeTokens(context.Background(), accessors, false); revokeErr != nil {
			n.srv.logger.Printf("[ERR] nomad.node: Vault token revocation failed: %v", revokeErr)
		}

		if rerr, ok := createErr.(*structs.RecoverableError); ok {
			reply.Error = rerr
		} else {
			reply.Error = structs.NewRecoverableError(createErr, false)
		}

		return nil
	}

	// Commit to Raft before returning any of the tokens
	req := structs.VaultAccessorsRequest{Accessors: accessors}
	_, index, err := n.srv.raftApply(structs.VaultAccessorRegisterRequestType, &req)
	if err != nil {
		n.srv.logger.Printf("[ERR] nomad.client: Register Vault accessors failed: %v", err)

		// Determine if we can recover from the error
		retry := false
		switch err {
		case raft.ErrNotLeader, raft.ErrLeadershipLost, raft.ErrRaftShutdown, raft.ErrEnqueueTimeout:
			retry = true
		}

		setErr(err, retry)
		return nil
	}

	reply.Index = index
	reply.Tasks = tokens
	n.srv.setQueryMeta(&reply.QueryMeta)
	return nil
}
Example #11
	// running operations such as waiting on containers and collecting stats
	waitClient *docker.Client

	// The statistics the Docker driver exposes
	DockerMeasuredMemStats = []string{"RSS", "Cache", "Swap", "Max Usage"}
	DockerMeasuredCpuStats = []string{"Throttled Periods", "Throttled Time", "Percent"}

	// recoverableErrTimeouts returns a recoverable error if the error was due
	// to timeouts
	recoverableErrTimeouts = func(err error) *structs.RecoverableError {
		r := false
		if strings.Contains(err.Error(), "Client.Timeout exceeded while awaiting headers") ||
			strings.Contains(err.Error(), "EOF") {
			r = true
		}
		return structs.NewRecoverableError(err, r)
	}
)

const (
	// NoSuchContainerError is returned by the docker daemon if the container
	// does not exist.
	NoSuchContainerError = "No such container"

	// The key populated in Node Attributes to indicate presence of the Docker
	// driver
	dockerDriverAttr = "driver.docker"

	// dockerSELinuxLabelConfigOption is the key for configuring the
	// SELinux label for binds.
	dockerSELinuxLabelConfigOption = "docker.volumes.selinuxlabel"