// startTask creates the driver and starts the task. func (r *TaskRunner) startTask() error { // Create a driver driver, err := r.createDriver() if err != nil { return fmt.Errorf("failed to create driver of task '%s' for alloc '%s': %v", r.task.Name, r.alloc.ID, err) } // Start the job handle, err := driver.Start(r.ctx, r.task) if err != nil { wrapped := fmt.Errorf("failed to start task '%s' for alloc '%s': %v", r.task.Name, r.alloc.ID, err) r.logger.Printf("[INFO] client: %v", wrapped) if rerr, ok := err.(*structs.RecoverableError); ok { return structs.NewRecoverableError(wrapped, rerr.Recoverable) } return wrapped } r.handleLock.Lock() r.handle = handle r.handleLock.Unlock() return nil }
// recoverablePullError wraps the error gotten when trying to pull and image if // the error is recoverable. func (d *DockerDriver) recoverablePullError(err error, image string) error { recoverable := true if imageNotFoundMatcher.MatchString(err.Error()) { recoverable = false } return structs.NewRecoverableError(fmt.Errorf("Failed to pull `%s`: %s", image, err), recoverable) }
func TestClient_RestartTracker_StartError_Recoverable_Fail(t *testing.T) { t.Parallel() p := testPolicy(true, structs.RestartPolicyModeFail) rt := newRestartTracker(p, structs.JobTypeSystem) recErr := structs.NewRecoverableError(fmt.Errorf("foo"), true) for i := 0; i < p.Attempts; i++ { state, when := rt.SetStartError(recErr).GetState() if state != structs.TaskRestarting { t.Fatalf("NextRestart() returned %v, want %v", state, structs.TaskRestarting) } if !withinJitter(p.Delay, when) { t.Fatalf("NextRestart() returned %v; want %v+jitter", when, p.Delay) } } // Next restart should cause fail if state, _ := rt.SetStartError(recErr).GetState(); state != structs.TaskNotRestarting { t.Fatalf("NextRestart() returned %v; want %v", state, structs.TaskNotRestarting) } }
// Start starts the mock driver func (m *MockDriver) Start(ctx *ExecContext, task *structs.Task) (DriverHandle, error) { var driverConfig MockDriverConfig dec, err := mapstructure.NewDecoder(&mapstructure.DecoderConfig{ DecodeHook: mapstructure.StringToTimeDurationHookFunc(), WeaklyTypedInput: true, Result: &driverConfig, }) if err != nil { return nil, err } if err := dec.Decode(task.Config); err != nil { return nil, err } if driverConfig.StartErr != "" { return nil, structs.NewRecoverableError(errors.New(driverConfig.StartErr), driverConfig.StartErrRecoverable) } h := mockDriverHandle{ taskName: task.Name, runFor: driverConfig.RunFor, killAfter: driverConfig.KillAfter, killTimeout: task.KillTimeout, exitCode: driverConfig.ExitCode, exitSignal: driverConfig.ExitSignal, logger: m.logger, doneCh: make(chan struct{}), waitCh: make(chan *dstructs.WaitResult, 1), } if driverConfig.ExitErrMsg != "" { h.exitErr = errors.New(driverConfig.ExitErrMsg) } if driverConfig.SignalErr != "" { h.signalErr = fmt.Errorf(driverConfig.SignalErr) } m.logger.Printf("[DEBUG] driver.mock: starting task %q", task.Name) go h.run() return &h, nil }
// LookupToken takes a Vault token and does a lookup against Vault. The call is // rate limited and may be canceled with passed context. func (v *vaultClient) LookupToken(ctx context.Context, token string) (*vapi.Secret, error) { if !v.Enabled() { return nil, fmt.Errorf("Vault integration disabled") } if !v.Active() { return nil, fmt.Errorf("Vault client not active") } // Check if we have established a connection with Vault if established, err := v.ConnectionEstablished(); !established && err == nil { return nil, structs.NewRecoverableError(fmt.Errorf("Connection to Vault has not been established"), true) } else if !established { return nil, fmt.Errorf("Connection to Vault failed: %v", err) } // Ensure we are under our rate limit if err := v.limiter.Wait(ctx); err != nil { return nil, err } // Lookup the token return v.auth.Lookup(token) }
func TestTaskRunner_DeriveToken_Retry(t *testing.T) { alloc := mock.Alloc() task := alloc.Job.TaskGroups[0].Tasks[0] task.Driver = "mock_driver" task.Config = map[string]interface{}{ "exit_code": "0", "run_for": "1s", } task.Vault = &structs.Vault{Policies: []string{"default"}} upd, tr := testTaskRunnerFromAlloc(false, alloc) tr.MarkReceived() defer tr.Destroy(structs.NewTaskEvent(structs.TaskKilled)) defer tr.ctx.AllocDir.Destroy() // Control when we get a Vault token token := "1234" count := 0 handler := func(*structs.Allocation, []string) (map[string]string, error) { if count > 0 { return map[string]string{task.Name: token}, nil } count++ return nil, structs.NewRecoverableError(fmt.Errorf("Want a retry"), true) } tr.vaultClient.(*vaultclient.MockVaultClient).DeriveTokenFn = handler go tr.Run() select { case <-tr.WaitCh(): case <-time.After(time.Duration(testutil.TestMultiplier()*15) * time.Second): t.Fatalf("timeout") } if len(upd.events) != 3 { t.Fatalf("should have 3 updates: %#v", upd.events) } if upd.state != structs.TaskStateDead { t.Fatalf("TaskState %v; want %v", upd.state, structs.TaskStateDead) } if upd.events[0].Type != structs.TaskReceived { t.Fatalf("First Event was %v; want %v", upd.events[0].Type, structs.TaskReceived) } if upd.events[1].Type != structs.TaskStarted { t.Fatalf("Second Event was %v; want %v", upd.events[1].Type, structs.TaskStarted) } if upd.events[2].Type != structs.TaskTerminated { t.Fatalf("Third Event was %v; want %v", upd.events[2].Type, structs.TaskTerminated) } // Check that the token is on disk secretDir, err := tr.ctx.AllocDir.GetSecretDir(task.Name) if err != nil { t.Fatalf("failed to determine task %s secret dir: %v", err) } // Read the token from the file system tokenPath := filepath.Join(secretDir, vaultTokenFile) data, err := ioutil.ReadFile(tokenPath) if err != nil { t.Fatalf("Failed to read file: %v", err) } if act := string(data); act != token { t.Fatalf("Token didn't get written to disk properly, got %q; want %q", 
act, token) } }
// parallelRevoke revokes the passed VaultAccessors in parallel.
//
// It runs up to maxParallelRevokes worker goroutines fed from a channel;
// the first worker error cancels the group via the derived context and is
// returned from Wait. Revocation stops early if ctx is canceled.
func (v *vaultClient) parallelRevoke(ctx context.Context, accessors []*structs.VaultAccessor) error {
	if !v.Enabled() {
		return fmt.Errorf("Vault integration disabled")
	}
	if !v.Active() {
		return fmt.Errorf("Vault client not active")
	}

	// Check if we have established a connection with Vault. No connection
	// yet (and no error) is a recoverable condition; a failed connection
	// is not.
	if established, err := v.ConnectionEstablished(); !established && err == nil {
		return structs.NewRecoverableError(fmt.Errorf("Connection to Vault has not been established"), true)
	} else if !established {
		return fmt.Errorf("Connection to Vault failed: %v", err)
	}

	g, pCtx := errgroup.WithContext(ctx)

	// Cap the handlers: never spin up more workers than accessors, nor
	// more than the configured parallelism limit.
	handlers := len(accessors)
	if handlers > maxParallelRevokes {
		handlers = maxParallelRevokes
	}

	// Worker pool: each goroutine drains the input channel, revoking one
	// accessor per iteration, until the channel closes or the group
	// context is canceled (another worker failed or the caller gave up).
	input := make(chan *structs.VaultAccessor, handlers)
	for i := 0; i < handlers; i++ {
		g.Go(func() error {
			for {
				select {
				case va, ok := <-input:
					if !ok {
						// Channel closed: all accessors dispatched.
						return nil
					}
					if err := v.auth.RevokeAccessor(va.Accessor); err != nil {
						return fmt.Errorf("failed to revoke token (alloc: %q, node: %q, task: %q): %v", va.AllocID, va.NodeID, va.Task, err)
					}
				case <-pCtx.Done():
					return nil
				}
			}
		})
	}

	// Send the input. The sender owns the channel and closes it when all
	// accessors have been dispatched (or the group context is canceled).
	go func() {
		defer close(input)
		for _, va := range accessors {
			select {
			case <-pCtx.Done():
				return
			case input <- va:
			}
		}
	}()

	// Wait for everything to complete; returns the first worker error.
	return g.Wait()
}
// CreateToken takes the allocation and task and returns an appropriate Vault
// token. The call is rate limited and may be canceled with the passed policy.
// When the error is recoverable, it will be of type RecoverableError.
func (v *vaultClient) CreateToken(ctx context.Context, a *structs.Allocation, task string) (*vapi.Secret, error) {
	if !v.Enabled() {
		return nil, fmt.Errorf("Vault integration disabled")
	}

	// An inactive client may become active again, so this is recoverable.
	if !v.Active() {
		return nil, structs.NewRecoverableError(fmt.Errorf("Vault client not active"), true)
	}

	// Check if we have established a connection with Vault; "not yet
	// connected" is recoverable, "connection failed" is not.
	if established, err := v.ConnectionEstablished(); !established && err == nil {
		return nil, structs.NewRecoverableError(fmt.Errorf("Connection to Vault has not been established"), true)
	} else if !established {
		return nil, fmt.Errorf("Connection to Vault failed: %v", err)
	}

	// Retrieve the Vault block for the task: job -> task group -> task.
	policies := a.Job.VaultPolicies()
	if policies == nil {
		return nil, fmt.Errorf("Job doesn't require Vault policies")
	}
	tg, ok := policies[a.TaskGroup]
	if !ok {
		return nil, fmt.Errorf("Task group does not require Vault policies")
	}
	taskVault, ok := tg[task]
	if !ok {
		return nil, fmt.Errorf("Task does not require Vault policies")
	}

	// Build the creation request, tagging the token with metadata that
	// identifies which alloc/task/node it was minted for.
	req := &vapi.TokenCreateRequest{
		Policies: taskVault.Policies,
		Metadata: map[string]string{
			"AllocationID": a.ID,
			"Task":         task,
			"NodeID":       a.NodeID,
		},
		TTL:         v.childTTL,
		DisplayName: fmt.Sprintf("%s-%s", a.ID, task),
	}

	// Ensure we are under our rate limit
	if err := v.limiter.Wait(ctx); err != nil {
		return nil, err
	}

	// Make the request and switch depending on whether we are using a root
	// token or a role based token. Root tokens are made periodic via
	// req.Period so they auto-renew on the same cadence as the child TTL.
	var secret *vapi.Secret
	var err error
	if v.tokenData.Root {
		req.Period = v.childTTL
		secret, err = v.auth.Create(req)
	} else {
		// Make the token using the role
		secret, err = v.auth.CreateWithRole(req, v.tokenData.Role)
	}

	// Determine whether it is unrecoverable
	if err != nil {
		if vaultUnrecoverableError.MatchString(err.Error()) {
			// NOTE(review): secret is returned alongside the error here
			// rather than nil — presumably intentional so callers can
			// inspect any partial response; confirm before changing.
			return secret, err
		}

		// The error is recoverable
		return nil, structs.NewRecoverableError(err, true)
	}

	return secret, nil
}
// prestart handles life-cycle tasks that occur before the task has started.
//
// It blocks for the Vault token (if the task uses Vault), sets up the task
// environment, downloads artifacts, and waits for any consul-template
// rendering to unblock. It sends true on resultCh when the task may start
// and false when the task must not start (setup failure or runner exit).
func (r *TaskRunner) prestart(resultCh chan bool) {
	if r.task.Vault != nil {
		// Wait for the token
		r.logger.Printf("[DEBUG] client: waiting for Vault token for task %v in alloc %q", r.task.Name, r.alloc.ID)
		tokenCh := r.vaultFuture.Wait()
		select {
		case <-tokenCh:
		case <-r.waitCh:
			// The run loop exited before the token arrived; abort.
			resultCh <- false
			return
		}
		r.logger.Printf("[DEBUG] client: retrieved Vault token for task %v in alloc %q", r.task.Name, r.alloc.ID)
	}

	// Environment setup failure is unrecoverable and fails the task.
	if err := r.setTaskEnv(); err != nil {
		r.setState(
			structs.TaskStateDead,
			structs.NewTaskEvent(structs.TaskSetupFailure).SetSetupError(err).SetFailsTask())
		resultCh <- false
		return
	}

	// Retry loop: each pass attempts artifact download and template setup;
	// recoverable failures jump to RESTART, which consults the restart
	// tracker before looping again.
	for {
		// Download the task's artifacts (skipped on retries once they
		// have all been fetched successfully).
		if !r.artifactsDownloaded && len(r.task.Artifacts) > 0 {
			r.setState(structs.TaskStatePending, structs.NewTaskEvent(structs.TaskDownloadingArtifacts))
			for _, artifact := range r.task.Artifacts {
				if err := getter.GetArtifact(r.getTaskEnv(), artifact, r.taskDir); err != nil {
					wrapped := fmt.Errorf("failed to download artifact %q: %v", artifact.GetterSource, err)
					r.setState(structs.TaskStatePending,
						structs.NewTaskEvent(structs.TaskArtifactDownloadFailed).SetDownloadError(wrapped))
					// Download failures are recoverable; let the
					// restart tracker decide whether to retry.
					r.restartTracker.SetStartError(structs.NewRecoverableError(wrapped, true))
					goto RESTART
				}
			}
			r.artifactsDownloaded = true
		}

		// We don't have to wait for any template
		if len(r.task.Templates) == 0 {
			// Send the start signal (non-blocking: the run loop may
			// already have a pending signal).
			select {
			case r.startCh <- struct{}{}:
			default:
			}

			resultCh <- true
			return
		}

		// Build the template manager (only once; reused across retries).
		if r.templateManager == nil {
			var err error
			r.templateManager, err = NewTaskTemplateManager(r, r.task.Templates,
				r.config, r.vaultFuture.Get(), r.taskDir, r.getTaskEnv())
			if err != nil {
				err := fmt.Errorf("failed to build task's template manager: %v", err)
				r.setState(structs.TaskStateDead, structs.NewTaskEvent(structs.TaskSetupFailure).SetSetupError(err).SetFailsTask())
				r.logger.Printf("[ERR] client: alloc %q, task %q %v", r.alloc.ID, r.task.Name, err)
				resultCh <- false
				return
			}
		}

		// Block for consul-template
		// TODO Hooks should register themselves as blocking and then we can
		// periodically enumerate what we are still blocked on
		select {
		case <-r.unblockCh:
			// Send the start signal
			select {
			case r.startCh <- struct{}{}:
			default:
			}

			resultCh <- true
			return
		case <-r.waitCh:
			// The run loop has exited so exit too
			resultCh <- false
			return
		}

	RESTART:
		restart := r.shouldRestart()
		if !restart {
			resultCh <- false
			return
		}
	}
}
// DeriveVaultToken is used by the clients to request wrapped Vault tokens for
// tasks.
//
// Errors are reported on reply.Error (as a RecoverableError) rather than as
// a Go error return, so the RPC itself succeeds and the client decides
// whether to retry. The flow is: validate the request against state, create
// tokens in parallel via Vault, revoke everything on partial failure, then
// commit the accessors to Raft before handing tokens back.
func (n *Node) DeriveVaultToken(args *structs.DeriveVaultTokenRequest,
	reply *structs.DeriveVaultTokenResponse) error {

	// setErr is a helper for setting the recoverable error on the reply and
	// logging it
	setErr := func(e error, recoverable bool) {
		reply.Error = structs.NewRecoverableError(e, recoverable)
		n.srv.logger.Printf("[ERR] nomad.client: DeriveVaultToken failed (recoverable %v): %v", recoverable, e)
	}

	if done, err := n.srv.forward("Node.DeriveVaultToken", args, args, reply); done {
		// Forwarding failures due to a missing leader are recoverable.
		setErr(err, err == structs.ErrNoLeader)
		return nil
	}
	defer metrics.MeasureSince([]string{"nomad", "client", "derive_vault_token"}, time.Now())

	// Verify the arguments
	if args.NodeID == "" {
		setErr(fmt.Errorf("missing node ID"), false)
		return nil
	}
	if args.SecretID == "" {
		setErr(fmt.Errorf("missing node SecretID"), false)
		return nil
	}
	if args.AllocID == "" {
		setErr(fmt.Errorf("missing allocation ID"), false)
		return nil
	}
	if len(args.Tasks) == 0 {
		setErr(fmt.Errorf("no tasks specified"), false)
		return nil
	}

	// Verify the following:
	// * The Node exists and has the correct SecretID
	// * The Allocation exists on the specified node
	// * The allocation contains the given tasks and they each require Vault
	//   tokens
	snap, err := n.srv.fsm.State().Snapshot()
	if err != nil {
		setErr(err, false)
		return nil
	}
	node, err := snap.NodeByID(args.NodeID)
	if err != nil {
		setErr(err, false)
		return nil
	}
	if node == nil {
		setErr(fmt.Errorf("Node %q does not exist", args.NodeID), false)
		return nil
	}
	if node.SecretID != args.SecretID {
		setErr(fmt.Errorf("SecretID mismatch"), false)
		return nil
	}

	alloc, err := snap.AllocByID(args.AllocID)
	if err != nil {
		setErr(err, false)
		return nil
	}
	if alloc == nil {
		setErr(fmt.Errorf("Allocation %q does not exist", args.AllocID), false)
		return nil
	}
	if alloc.NodeID != args.NodeID {
		setErr(fmt.Errorf("Allocation %q not running on Node %q", args.AllocID, args.NodeID), false)
		return nil
	}
	if alloc.TerminalStatus() {
		setErr(fmt.Errorf("Can't request Vault token for terminal allocation"), false)
		return nil
	}

	// Check the policies: every requested task must declare Vault policies.
	policies := alloc.Job.VaultPolicies()
	if policies == nil {
		setErr(fmt.Errorf("Job doesn't require Vault policies"), false)
		return nil
	}
	tg, ok := policies[alloc.TaskGroup]
	if !ok {
		setErr(fmt.Errorf("Task group does not require Vault policies"), false)
		return nil
	}

	var unneeded []string
	for _, task := range args.Tasks {
		taskVault := tg[task]
		if taskVault == nil || len(taskVault.Policies) == 0 {
			unneeded = append(unneeded, task)
		}
	}

	if len(unneeded) != 0 {
		e := fmt.Errorf("Requested Vault tokens for tasks without defined Vault policies: %s",
			strings.Join(unneeded, ", "))
		setErr(e, false)
		return nil
	}

	// At this point the request is valid and we should contact Vault for
	// tokens.

	// Create an error group where we will spin up a fixed set of goroutines to
	// handle deriving tokens but where if any fails the whole group is
	// canceled.
	g, ctx := errgroup.WithContext(context.Background())

	// Cap the handlers
	handlers := len(args.Tasks)
	if handlers > maxParallelRequestsPerDerive {
		handlers = maxParallelRequestsPerDerive
	}

	// Create the Vault Tokens. NOTE(review): results is written by multiple
	// worker goroutines; presumably safe because each task key is written
	// by exactly one worker and reads happen only after g.Wait() — confirm.
	input := make(chan string, handlers)
	results := make(map[string]*vapi.Secret, len(args.Tasks))
	for i := 0; i < handlers; i++ {
		g.Go(func() error {
			for {
				select {
				case task, ok := <-input:
					if !ok {
						return nil
					}

					secret, err := n.srv.vault.CreateToken(ctx, alloc, task)
					if err != nil {
						wrapped := fmt.Errorf("failed to create token for task %q: %v", task, err)
						if rerr, ok := err.(*structs.RecoverableError); ok && rerr.Recoverable {
							// If the error is recoverable, propagate it
							return structs.NewRecoverableError(wrapped, true)
						}

						return wrapped
					}

					results[task] = secret
				case <-ctx.Done():
					return nil
				}
			}
		})
	}

	// Send the input
	go func() {
		defer close(input)
		for _, task := range args.Tasks {
			select {
			case <-ctx.Done():
				return
			case input <- task:
			}
		}
	}()

	// Wait for everything to complete or for an error
	createErr := g.Wait()

	// Retrieve the results: collect the wrapped tokens and build the
	// accessor records that will be committed to Raft.
	accessors := make([]*structs.VaultAccessor, 0, len(results))
	tokens := make(map[string]string, len(results))
	for task, secret := range results {
		w := secret.WrapInfo
		if w == nil {
			return fmt.Errorf("Vault returned Secret without WrapInfo")
		}

		tokens[task] = w.Token
		accessor := &structs.VaultAccessor{
			Accessor:    w.WrappedAccessor,
			Task:        task,
			NodeID:      alloc.NodeID,
			AllocID:     alloc.ID,
			CreationTTL: w.TTL,
		}

		accessors = append(accessors, accessor)
	}

	// If there was an error revoke the created tokens so no orphaned
	// tokens remain in Vault.
	if createErr != nil {
		n.srv.logger.Printf("[ERR] nomad.node: Vault token creation failed: %v", createErr)

		if revokeErr := n.srv.vault.RevokeTokens(context.Background(), accessors, false); revokeErr != nil {
			n.srv.logger.Printf("[ERR] nomad.node: Vault token revocation failed: %v", revokeErr)
		}

		if rerr, ok := createErr.(*structs.RecoverableError); ok {
			reply.Error = rerr
		} else {
			reply.Error = structs.NewRecoverableError(createErr, false)
		}

		return nil
	}

	// Commit to Raft before returning any of the tokens
	req := structs.VaultAccessorsRequest{Accessors: accessors}
	_, index, err := n.srv.raftApply(structs.VaultAccessorRegisterRequestType, &req)
	if err != nil {
		n.srv.logger.Printf("[ERR] nomad.client: Register Vault accessors failed: %v", err)

		// Determine if we can recover from the error: leadership churn and
		// enqueue timeouts are transient and worth retrying.
		retry := false
		switch err {
		case raft.ErrNotLeader, raft.ErrLeadershipLost, raft.ErrRaftShutdown, raft.ErrEnqueueTimeout:
			retry = true
		}

		setErr(err, retry)
		return nil
	}

	reply.Index = index
	reply.Tasks = tokens
	n.srv.setQueryMeta(&reply.QueryMeta)
	return nil
}
// running operations such as waiting on containers and collect stats waitClient *docker.Client // The statistics the Docker driver exposes DockerMeasuredMemStats = []string{"RSS", "Cache", "Swap", "Max Usage"} DockerMeasuredCpuStats = []string{"Throttled Periods", "Throttled Time", "Percent"} // recoverableErrTimeouts returns a recoverable error if the error was due // to timeouts recoverableErrTimeouts = func(err error) *structs.RecoverableError { r := false if strings.Contains(err.Error(), "Client.Timeout exceeded while awaiting headers") || strings.Contains(err.Error(), "EOF") { r = true } return structs.NewRecoverableError(err, r) } ) const ( // NoSuchContainerError is returned by the docker daemon if the container // does not exist. NoSuchContainerError = "No such container" // The key populated in Node Attributes to indicate presence of the Docker // driver dockerDriverAttr = "driver.docker" // dockerSELinuxLabelConfigOption is the key for configuring the // SELinux label for binds. dockerSELinuxLabelConfigOption = "docker.volumes.selinuxlabel"