// Starts the job in the Buildkite Agent API. We'll retry on connection-related // issues, but if a connection succeeds and we get an error response back from // Buildkite, we won't bother retrying. For example, a "no such host" will // retry, but a 422 from Buildkite won't. func (r *JobRunner) startJob(startedAt time.Time) error { r.Job.StartedAt = startedAt.UTC().Format(time.RFC3339Nano) return retry.Do(func(s *retry.Stats) error { _, err := r.APIClient.Jobs.Start(r.Job) if err != nil { if api.IsRetryableError(err) { logger.Warn("%s (%s)", err, s) } else { logger.Warn("Buildkite rejected the call to start the job (%s)", err) s.Break() } } return err }, &retry.Config{Maximum: 30, Interval: 5 * time.Second}) }
// Performs a ping, which returns what action the agent should take next. func (a *AgentWorker) Ping() { // Update the proc title a.UpdateProcTitle("pinging") ping, _, err := a.APIClient.Pings.Get() if err != nil { // If a ping fails, we don't really care, because it'll // ping again after the interval. logger.Warn("Failed to ping: %s", err) return } // Should we switch endpoints? if ping.Endpoint != "" && ping.Endpoint != a.Agent.Endpoint { // Before switching to the new one, do a ping test to make sure it's // valid. If it is, switch and carry on, otherwise ignore the switch // for now. newAPIClient := APIClient{Endpoint: ping.Endpoint, Token: a.Agent.AccessToken}.Create() newPing, _, err := newAPIClient.Pings.Get() if err != nil { logger.Warn("Failed to ping the new endpoint %s - ignoring switch for now (%s)", ping.Endpoint, err) } else { // Replace the APIClient and process the new ping a.APIClient = newAPIClient a.Agent.Endpoint = ping.Endpoint ping = newPing } } // Is there a message that should be shown in the logs? if ping.Message != "" { logger.Info(ping.Message) } // Should the agent disconnect? if ping.Action == "disconnect" { a.Stop(false) return } // If we don't have a job, there's nothing to do! if ping.Job == nil { // Update the proc title a.UpdateProcTitle("idle") return } // Update the proc title a.UpdateProcTitle(fmt.Sprintf("job %s", strings.Split(ping.Job.ID, "-")[0])) logger.Info("Assigned job %s. Accepting...", ping.Job.ID) // Accept the job. We'll retry on connection related issues, but if // Buildkite returns a 422 or 500 for example, we'll just bail out, // re-ping, and try the whole process again. var accepted *api.Job retry.Do(func(s *retry.Stats) error { accepted, _, err = a.APIClient.Jobs.Accept(ping.Job) if err != nil { if api.IsRetryableError(err) { logger.Warn("%s (%s)", err, s) } else { logger.Warn("Buildkite rejected the call to accept the job (%s)", err) s.Break() } } return err }, &retry.Config{Maximum: 30, Interval: 1 * time.Second}) // If `accepted` is nil, then the job was never accepted if accepted == nil { logger.Error("Failed to accept job") return } // Now that the job has been accepted, we can start it. a.jobRunner, err = JobRunner{ Endpoint: accepted.Endpoint, Agent: a.Agent, AgentConfiguration: a.AgentConfiguration, Job: accepted, }.Create() // Was there an error creating the job runner? if err != nil { logger.Error("Failed to initialize job: %s", err) return } // Start running the job if err = a.jobRunner.Run(); err != nil { logger.Error("Failed to run job: %s", err) } // No more job, no more runner. a.jobRunner = nil }