// Finishes the job in the Buildkite Agent API. This call will keep on
// retrying forever until it finally gets a successful response from the API.
func (r *JobRunner) finishJob(finishedAt time.Time, exitStatus string, failedChunkCount int) error {
	r.Job.FinishedAt = finishedAt.UTC().Format(time.RFC3339Nano)
	r.Job.ExitStatus = exitStatus
	r.Job.ChunksFailedCount = failedChunkCount

	return retry.Do(func(s *retry.Stats) error {
		response, err := r.APIClient.Jobs.Finish(r.Job)
		if err != nil {
			// If the API returns a 422, that means that we
			// successfully tried to finish the job, but Buildkite
			// rejected the finish for some reason. This can
			// sometimes mean that Buildkite has cancelled the job
			// before we got a chance to send the final API call
			// (maybe this agent took too long to kill the
			// process). In that case, we don't want to keep trying
			// to finish the job forever, so we'll just bail out and
			// go find some more work to do.
			if response != nil && response.StatusCode == 422 {
				logger.Warn("Buildkite rejected the call to finish the job (%s)", err)
				s.Break()
			} else {
				logger.Warn("%s (%s)", err, s)
			}
		}

		return err
	}, &retry.Config{Forever: true, Interval: 1 * time.Second})
}
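// The retry package isn't included in this excerpt, so as a rough mental
// model only (an assumption about its behaviour, not its actual source),
// retry.Do can be sketched like this: call the function, and keep calling
// it on error until it succeeds, Stats.Break() is invoked, or the attempt
// limit is hit.
type retryStatsSketch struct {
	Attempt int
	broken  bool
}

func (s *retryStatsSketch) Break() { s.broken = true }

func retryDoSketch(f func(*retryStatsSketch) error, maximum int, forever bool, interval time.Duration) error {
	s := &retryStatsSketch{}
	for {
		s.Attempt++
		err := f(s)

		// Stop on success, an explicit Break(), or attempt exhaustion.
		if err == nil || s.broken || (!forever && s.Attempt >= maximum) {
			return err
		}

		time.Sleep(interval)
	}
}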
// Takes the options passed to the CLI, and creates an api.Agent record that
// will be sent to the Buildkite Agent API for registration.
func (r *AgentPool) CreateAgentTemplate() *api.Agent {
	agent := &api.Agent{
		Name:              r.Name,
		Priority:          r.Priority,
		MetaData:          r.MetaData,
		ScriptEvalEnabled: r.AgentConfiguration.CommandEval,
		Version:           Version(),
		Build:             BuildVersion(),
		PID:               os.Getpid(),
		Arch:              runtime.GOARCH,
	}

	// Attempt to add the EC2 meta-data
	if r.MetaDataEC2 {
		tags, err := EC2MetaData{}.Get()
		if err != nil {
			// Don't blow up if we can't find them, just show a nasty error.
			logger.Error("Failed to fetch EC2 meta-data: %s", err)
		} else {
			for tag, value := range tags {
				agent.MetaData = append(agent.MetaData, fmt.Sprintf("%s=%s", tag, value))
			}
		}
	}

	// Attempt to add the EC2 tags
	if r.MetaDataEC2Tags {
		tags, err := EC2Tags{}.Get()
		if err != nil {
			// Don't blow up if we can't find them, just show a nasty error.
			logger.Error("Failed to find EC2 Tags: %s", err)
		} else {
			for tag, value := range tags {
				agent.MetaData = append(agent.MetaData, fmt.Sprintf("%s=%s", tag, value))
			}
		}
	}

	var err error

	// Add the hostname
	agent.Hostname, err = os.Hostname()
	if err != nil {
		logger.Warn("Failed to find hostname: %s", err)
	}

	// Add the OS dump
	agent.OS, err = system.VersionDump()
	if err != nil {
		logger.Warn("Failed to find OS information: %s", err)
	}

	return agent
}
// Disconnects the agent from the Buildkite Agent API. Doesn't bother
// retrying, because we want to disconnect as fast as possible.
func (a *AgentWorker) Disconnect() error {
	_, err := a.APIClient.Agents.Disconnect()
	if err != nil {
		logger.Warn("There was an error sending the disconnect API call to Buildkite. If this agent still appears online, you may have to manually stop it (%s)", err)
	}

	return err
}
// Starts the job in the Buildkite Agent API. We'll retry on connection-related
// issues, but if a connection succeeds and we get an error response back from
// Buildkite, we won't bother retrying. For example, a "no such host" will
// retry, but a 422 from Buildkite won't.
func (r *JobRunner) startJob(startedAt time.Time) error {
	r.Job.StartedAt = startedAt.UTC().Format(time.RFC3339Nano)

	return retry.Do(func(s *retry.Stats) error {
		_, err := r.APIClient.Jobs.Start(r.Job)
		if err != nil {
			if api.IsRetryableError(err) {
				logger.Warn("%s (%s)", err, s)
			} else {
				logger.Warn("Buildkite rejected the call to start the job (%s)", err)
				s.Break()
			}
		}

		return err
	}, &retry.Config{Maximum: 30, Interval: 5 * time.Second})
}
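// api.IsRetryableError isn't part of this excerpt. Based on the comment
// above, the assumption is that it treats transport-level failures (DNS
// lookup errors, timeouts, refused connections) as retryable, and HTTP-level
// rejections from Buildkite as permanent. A hypothetical sketch, not the
// real implementation:
func isRetryableErrorSketch(err error) bool {
	// net.Error covers "no such host", timeouts, and similar
	// connection-related failures.
	if _, ok := err.(net.Error); ok {
		return true
	}

	// Anything else (e.g. an error produced by a 422 response) is a
	// deliberate rejection, so retrying won't help.
	return false
}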
func (d Download) Start() error {
	return retry.Do(func(s *retry.Stats) error {
		err := d.try()
		if err != nil {
			logger.Warn("Error trying to download %s (%s) %s", d.URL, err, s)
		}

		return err
	}, &retry.Config{Maximum: d.Retries, Interval: 1 * time.Second})
}
// Runs the job
func (r *JobRunner) Run() error {
	logger.Info("Starting job %s", r.Job.ID)

	// Start the build in the Buildkite Agent API. This is the first thing
	// we do so if it fails, we don't have to worry about cleaning things
	// up like started log streamer workers, etc.
	if err := r.startJob(time.Now()); err != nil {
		return err
	}

	// Start the header time streamer
	if err := r.headerTimesStreamer.Start(); err != nil {
		return err
	}

	// Start the log streamer
	if err := r.logStreamer.Start(); err != nil {
		return err
	}

	// Start the process. This will block until it finishes.
	if err := r.process.Start(); err != nil {
		// Send the error as output
		r.logStreamer.Process(fmt.Sprintf("%s", err))
	} else {
		// Add the final output to the streamer
		r.logStreamer.Process(r.process.Output())
	}

	// Store the finished at time
	finishedAt := time.Now()

	// Stop the header time streamer. This will block until all the chunks
	// have been uploaded
	r.headerTimesStreamer.Stop()

	// Stop the log streamer. This will block until all the chunks have
	// been uploaded
	r.logStreamer.Stop()

	// Warn about failed chunks
	if r.logStreamer.ChunksFailedCount > 0 {
		logger.Warn("%d chunks failed to upload for this job", r.logStreamer.ChunksFailedCount)
	}

	// Finish the build in the Buildkite Agent API
	r.finishJob(finishedAt, r.process.ExitStatus, int(r.logStreamer.ChunksFailedCount))

	// Wait for the routines that we spun up to finish
	logger.Debug("[JobRunner] Waiting for all other routines to finish")
	r.routineWaitGroup.Wait()

	logger.Info("Finished job %s", r.Job.ID)

	return nil
}
func (a *ArtifactBatchCreator) Create() ([]*api.Artifact, error) {
	length := len(a.Artifacts)
	chunks := 30

	// Split the artifacts into chunks so we're not uploading a ton of
	// files at once.
	for i := 0; i < length; i += chunks {
		j := i + chunks
		if length < j {
			j = length
		}

		// The artifacts that will be uploaded in this chunk
		theseArtifacts := a.Artifacts[i:j]

		// An ID is required so Buildkite can ensure this create
		// operation is idempotent (if we try and upload the same ID
		// twice, it'll just return the previous data and skip the
		// upload)
		batch := &api.ArtifactBatch{api.NewUUID(), theseArtifacts, a.UploadDestination}

		logger.Info("Creating (%d-%d)/%d artifacts", i, j, length)

		var creation *api.ArtifactBatchCreateResponse
		var resp *api.Response
		var err error

		// Retry the batch upload a couple of times
		err = retry.Do(func(s *retry.Stats) error {
			creation, resp, err = a.APIClient.Artifacts.Create(a.JobID, batch)
			if resp != nil && (resp.StatusCode == 401 || resp.StatusCode == 404 || resp.StatusCode == 500) {
				s.Break()
			}
			if err != nil {
				logger.Warn("%s (%s)", err, s)
			}

			return err
		}, &retry.Config{Maximum: 10, Interval: 1 * time.Second})

		// Did the batch creation eventually fail?
		if err != nil {
			return nil, err
		}

		// Save the id and instructions to each artifact
		for index, id := range creation.ArtifactIDs {
			theseArtifacts[index].ID = id
			theseArtifacts[index].UploadInstructions = creation.UploadInstructions
		}
	}

	return a.Artifacts, nil
}
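// The i/j arithmetic above is the part that's easy to get wrong, so here's
// the same chunking loop isolated over a toy input (hypothetical helper,
// not part of the agent). With length 7 and chunk size 3 it produces the
// bounds [0:3], [3:6], [6:7].
func chunkBoundsSketch(length, size int) [][2]int {
	var bounds [][2]int
	for i := 0; i < length; i += size {
		j := i + size
		if length < j {
			// Clamp the final chunk so we don't slice past the end.
			j = length
		}
		bounds = append(bounds, [2]int{i, j})
	}
	return bounds
}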
// Connects the agent to the Buildkite Agent API, retrying up to 10 times if it
// fails.
func (a *AgentWorker) Connect() error {
	return retry.Do(func(s *retry.Stats) error {
		_, err := a.APIClient.Agents.Connect()
		if err != nil {
			logger.Warn("%s (%s)", err, s)
		}

		return err
	}, &retry.Config{Maximum: 10, Interval: 1 * time.Second})
}
func (r *JobRunner) onUploadHeaderTime(cursor int, total int, times map[string]string) {
	retry.Do(func(s *retry.Stats) error {
		_, err := r.APIClient.HeaderTimes.Save(r.Job.ID, &api.HeaderTimes{Times: times})
		if err != nil {
			logger.Warn("%s (%s)", err, s)
		}

		return err
	}, &retry.Config{Maximum: 10, Interval: 5 * time.Second})
}
// Called when a chunk is ready for upload. It retries the chunk upload with an
// interval before giving up.
func (r *JobRunner) onUploadChunk(chunk *LogStreamerChunk) error {
	return retry.Do(func(s *retry.Stats) error {
		_, err := r.APIClient.Chunks.Upload(r.Job.ID, &api.Chunk{
			Data:     chunk.Data,
			Sequence: chunk.Order,
		})
		if err != nil {
			logger.Warn("%s (%s)", err, s)
		}

		return err
	}, &retry.Config{Maximum: 10, Interval: 1 * time.Second})
}
// Connects the agent to the Buildkite Agent API, retrying up to 10 times if it
// fails.
func (a *AgentWorker) Connect() error {
	// Update the proc title
	a.UpdateProcTitle("connecting")

	return retry.Do(func(s *retry.Stats) error {
		_, err := a.APIClient.Agents.Connect()
		if err != nil {
			logger.Warn("%s (%s)", err, s)
		}

		return err
	}, &retry.Config{Maximum: 10, Interval: 1 * time.Second})
}
// Takes the agent template and returns a registered agent. The registered
// agent includes the Access Token used to communicate with the Buildkite Agent
// API.
func (r *AgentPool) RegisterAgent(agent *api.Agent) (*api.Agent, error) {
	var registered *api.Agent
	var err error
	var resp *api.Response

	register := func(s *retry.Stats) error {
		registered, resp, err = r.APIClient.Agents.Register(agent)
		if err != nil {
			if resp != nil && resp.StatusCode == 401 {
				logger.Warn("Buildkite rejected the registration (%s)", err)
				s.Break()
			} else {
				logger.Warn("%s (%s)", err, s)
			}
		}

		return err
	}

	err = retry.Do(register, &retry.Config{Maximum: 30, Interval: 1 * time.Second})

	return registered, err
}
// Stops the agent from accepting new work and cancels any current work it's
// running
func (a *AgentWorker) Stop(graceful bool) {
	// Only allow one stop to run at a time (because we're playing with
	// channels)
	a.stopMutex.Lock()
	defer a.stopMutex.Unlock()

	if graceful {
		if a.stopping {
			logger.Warn("Agent is already gracefully stopping...")
		} else {
			// If we have a job, tell the user that we'll wait for
			// it to finish before disconnecting
			if a.jobRunner != nil {
				logger.Info("Gracefully stopping agent. Waiting for current job to finish before disconnecting...")
			} else {
				logger.Info("Gracefully stopping agent. Since there is no job running, the agent will disconnect immediately")
			}
		}
	} else {
		// If there's a job running, kill it, then disconnect
		if a.jobRunner != nil {
			logger.Info("Forcefully stopping agent. The current job will be canceled before disconnecting...")

			// Kill the current job. Doesn't do anything if the job
			// is already being killed, so it's safe to call
			// multiple times.
			a.jobRunner.Kill()
		} else {
			logger.Info("Forcefully stopping agent. Since there is no job running, the agent will disconnect immediately")
		}
	}

	// We don't need to do the below operations again since we've already
	// done them before
	if a.stopping {
		return
	}

	// Update the proc title
	a.UpdateProcTitle("stopping")

	// If we have a ticker, stop it, and send a signal to the stop channel,
	// which will cause the agent worker to stop looping immediately.
	if a.ticker != nil {
		close(a.stop)
	}

	// Mark the agent as stopping
	a.stopping = true
}
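// The loop that close(a.stop) interrupts isn't part of this excerpt. The
// assumed shape (hypothetical names, sketched only from how the stop channel
// is used above) is a select loop like this; closing the channel unblocks
// the receive immediately, which is why the worker stops without waiting
// for the next tick:
func workerLoopSketch(ticker *time.Ticker, stop chan struct{}, ping func()) {
	for {
		select {
		case <-ticker.C:
			ping()
		case <-stop:
			// A closed channel is always ready to receive, so
			// every iteration after Stop() exits here.
			return
		}
	}
}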
func (r *JobRunner) onProcessStartCallback() {
	// Add to the wait group before spawning the routines. Calling Add
	// inside the goroutines would race with the Wait that runs when the
	// job finishes.
	r.wg.Add(2)

	// Start a routine that will grab the output every few seconds and send
	// it back to Buildkite
	go func() {
		for r.process.Running {
			// Send the output of the process to the log streamer
			// for processing
			r.logStreamer.Process(r.process.Output())

			// Check the output in another second
			time.Sleep(1 * time.Second)
		}

		// Mark this routine as done in the wait group
		r.wg.Done()

		logger.Debug("Routine that processes the log has finished")
	}()

	// Start a routine that will poll the job's state every few seconds to
	// see if it's been cancelled
	go func() {
		for r.process.Running {
			// Re-get the job and check its status to see if it's been
			// cancelled
			jobState, _, err := r.APIClient.Jobs.GetState(r.Job.ID)
			if err != nil {
				// We don't really care if it fails, we'll just
				// try again in a second anyway
				logger.Warn("Problem with getting job state %s (%s)", r.Job.ID, err)
			} else if jobState.State == "canceling" || jobState.State == "canceled" {
				r.Kill()
			}

			// Check for cancellations every few seconds
			time.Sleep(3 * time.Second)
		}

		// Mark this routine as done in the wait group
		r.wg.Done()

		logger.Debug("Routine that refreshes the job has finished")
	}()
}
// Performs a heartbeat
func (a *AgentWorker) Heartbeat() error {
	var beat *api.Heartbeat
	var err error

	// Retry the heartbeat a few times
	err = retry.Do(func(s *retry.Stats) error {
		beat, _, err = a.APIClient.Heartbeats.Beat()
		if err != nil {
			logger.Warn("%s (%s)", err, s)
		}

		return err
	}, &retry.Config{Maximum: 5, Interval: 1 * time.Second})

	if err != nil {
		return err
	}

	logger.Debug("Heartbeat sent at %s and received at %s", beat.SentAt, beat.ReceivedAt)

	return nil
}
func (a *ArtifactBatchCreator) Create() ([]*api.Artifact, error) {
	length := len(a.Artifacts)
	chunks := 10
	uploaded := []*api.Artifact{}

	// Split the artifacts into chunks so we're not uploading a ton of
	// files at once.
	for i := 0; i < length; i += chunks {
		j := i + chunks
		if length < j {
			j = length
		}

		artifacts := a.Artifacts[i:j]

		logger.Info("Creating (%d-%d)/%d artifacts", i, j, length)

		var u []*api.Artifact
		var err error

		// Retry the batch upload a couple of times
		err = retry.Do(func(s *retry.Stats) error {
			u, _, err = a.APIClient.Artifacts.Create(a.JobID, artifacts)
			if err != nil {
				logger.Warn("%s (%s)", err, s)
			}

			return err
		}, &retry.Config{Maximum: 10, Interval: 1 * time.Second})

		if err != nil {
			return nil, err
		}

		uploaded = append(uploaded, u...)
	}

	return uploaded, nil
}
func (d Download) Start() error {
	seconds := 5 * time.Second
	ticker := time.NewTicker(seconds)
	defer ticker.Stop()

	retries := 1
	max := d.Retries

	var err error
	for {
		err = d.try()
		if err == nil {
			break
		}

		if retries >= max {
			// Out of attempts; return the last error rather than
			// swallowing it.
			break
		}

		logger.Warn("Error trying to download %s (%d/%d) (%T: %v) Trying again in %s", d.URL, retries, max, err, err, seconds)

		retries++
		<-ticker.C
	}

	return err
}
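// A minimal usage sketch of the ticker-based retry above (assuming Download
// can be constructed with just these two fields; the real struct likely has
// more, e.g. a destination path and an HTTP client):
func downloadWithRetriesSketch() error {
	d := Download{
		URL:     "https://example.com/artifact.tar.gz",
		Retries: 3,
	}

	// Start blocks, waiting 5 seconds between attempts, and returns the
	// last error if every attempt failed.
	return d.Start()
}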
// Performs a ping, which returns what action the agent should take next.
func (a *AgentWorker) Ping() {
	// Update the proc title
	a.UpdateProcTitle("pinging")

	ping, _, err := a.APIClient.Pings.Get()
	if err != nil {
		// If a ping fails, we don't really care, because it'll
		// ping again after the interval.
		logger.Warn("Failed to ping: %s", err)
		return
	}

	// Should we switch endpoints?
	if ping.Endpoint != "" && ping.Endpoint != a.Agent.Endpoint {
		// Before switching to the new one, do a ping test to make sure it's
		// valid. If it is, switch and carry on, otherwise ignore the switch
		// for now.
		newAPIClient := APIClient{Endpoint: ping.Endpoint, Token: a.Agent.AccessToken}.Create()
		newPing, _, err := newAPIClient.Pings.Get()
		if err != nil {
			logger.Warn("Failed to ping the new endpoint %s - ignoring switch for now (%s)", ping.Endpoint, err)
		} else {
			// Replace the APIClient and process the new ping
			a.APIClient = newAPIClient
			a.Agent.Endpoint = ping.Endpoint
			ping = newPing
		}
	}

	// Is there a message that should be shown in the logs?
	if ping.Message != "" {
		logger.Info(ping.Message)
	}

	// Should the agent disconnect?
	if ping.Action == "disconnect" {
		a.Stop(false)
		return
	}

	// If we don't have a job, there's nothing to do!
	if ping.Job == nil {
		// Update the proc title
		a.UpdateProcTitle("idle")

		return
	}

	// Update the proc title
	a.UpdateProcTitle(fmt.Sprintf("job %s", strings.Split(ping.Job.ID, "-")[0]))

	logger.Info("Assigned job %s. Accepting...", ping.Job.ID)

	// Accept the job. We'll retry on connection-related issues, but if
	// Buildkite returns a 422 or 500 for example, we'll just bail out,
	// re-ping, and try the whole process again.
	var accepted *api.Job
	retry.Do(func(s *retry.Stats) error {
		accepted, _, err = a.APIClient.Jobs.Accept(ping.Job)
		if err != nil {
			if api.IsRetryableError(err) {
				logger.Warn("%s (%s)", err, s)
			} else {
				logger.Warn("Buildkite rejected the call to accept the job (%s)", err)
				s.Break()
			}
		}

		return err
	}, &retry.Config{Maximum: 30, Interval: 1 * time.Second})

	// If `accepted` is nil, then the job was never accepted
	if accepted == nil {
		logger.Error("Failed to accept job")
		return
	}

	// Now that the job has been accepted, we can start it.
	a.jobRunner, err = JobRunner{
		Endpoint:           accepted.Endpoint,
		Agent:              a.Agent,
		AgentConfiguration: a.AgentConfiguration,
		Job:                accepted,
	}.Create()

	// Was there an error creating the job runner?
	if err != nil {
		logger.Error("Failed to initialize job: %s", err)
		return
	}

	// Start running the job
	if err = a.jobRunner.Run(); err != nil {
		logger.Error("Failed to run job: %s", err)
	}

	// No more job, no more runner.
	a.jobRunner = nil
}
func (a *ArtifactUploader) upload(artifacts []*api.Artifact) error {
	var uploader Uploader

	// Determine what uploader to use
	if a.Destination != "" {
		if strings.HasPrefix(a.Destination, "s3://") {
			uploader = new(S3Uploader)
		} else {
			return errors.New("Unknown upload destination: " + a.Destination)
		}
	} else {
		uploader = new(FormUploader)
	}

	// Setup the uploader
	err := uploader.Setup(a.Destination)
	if err != nil {
		return err
	}

	// Set the URLs of the artifacts based on the uploader
	for _, artifact := range artifacts {
		artifact.URL = uploader.URL(artifact)
	}

	// Create the artifacts on Buildkite
	batchCreator := ArtifactBatchCreator{
		APIClient: a.APIClient,
		JobID:     a.JobID,
		Artifacts: artifacts,
	}

	artifacts, err = batchCreator.Create()
	if err != nil {
		return err
	}

	p := pool.New(pool.MaxConcurrencyLimit)
	errors := []error{}
	for _, artifact := range artifacts {
		// Create new instance of the artifact for the goroutine
		// See: http://golang.org/doc/effective_go.html#channels
		artifact := artifact

		p.Spawn(func() {
			// Show a nice message that we're starting to upload the file
			logger.Info("Uploading \"%s\" %d bytes", artifact.Path, artifact.FileSize)

			// Upload the artifact and then set the state depending
			// on whether or not it passed. We'll retry the upload
			// a couple of times before giving up. The error is
			// declared locally so the goroutines don't race on the
			// shared `err` above.
			uploadErr := retry.Do(func(s *retry.Stats) error {
				err := uploader.Upload(artifact)
				if err != nil {
					logger.Warn("%s (%s)", err, s)
				}

				return err
			}, &retry.Config{Maximum: 10, Interval: 1 * time.Second})

			if uploadErr != nil {
				artifact.State = "error"
				logger.Error("Error uploading artifact \"%s\": %s", artifact.Path, uploadErr)

				// Track the error that was raised
				p.Lock()
				errors = append(errors, uploadErr)
				p.Unlock()
			} else {
				artifact.State = "finished"
			}

			// Update the state of the artifact on Buildkite, we
			// retry this as well.
			updateErr := retry.Do(func(s *retry.Stats) error {
				_, _, err := a.APIClient.Artifacts.Update(a.JobID, artifact)
				if err != nil {
					logger.Warn("%s (%s)", err, s)
				}

				return err
			}, &retry.Config{Maximum: 10, Interval: 1 * time.Second})

			if updateErr != nil {
				logger.Error("Error marking artifact %s as uploaded: %s", artifact.Path, updateErr)

				// Track the error that was raised
				p.Lock()
				errors = append(errors, updateErr)
				p.Unlock()
			}
		})
	}

	p.Wait()

	if len(errors) > 0 {
		logger.Fatal("There were errors with uploading some of the artifacts")
	}

	return nil
}
		}
	}

	// Create the API client
	client := agent.APIClient{
		Endpoint: cfg.Endpoint,
		Token:    cfg.AgentAccessToken,
	}.Create()

	// Generate a UUID that will identify this pipeline change. We
	// do this outside of the retry loop because we want this UUID
	// to be the same for each attempt at updating the pipeline.
	uuid := api.NewUUID()

	// Retry the pipeline upload a few times before giving up
	err = retry.Do(func(s *retry.Stats) error {
		_, err = client.Pipelines.Upload(cfg.Job, &api.Pipeline{UUID: uuid, Data: input, FileName: filename, Replace: cfg.Replace})
		if err != nil {
			logger.Warn("%s (%s)", err, s)
		}

		return err
	}, &retry.Config{Maximum: 5, Interval: 1 * time.Second})
	if err != nil {
		logger.Fatal("Failed to upload and process pipeline: %s", err)
	}

	logger.Info("Successfully uploaded and parsed pipeline config")
},
}
		}
	}

	// If more than 1 of the config files exist, throw an
	// error. There can only be one!!
	if len(exists) > 1 {
		logger.Fatal("Found multiple configuration files: %s. Please only have 1 configuration file present.", strings.Join(exists, ", "))
	} else if len(exists) == 0 {
		logger.Fatal("Could not find a default pipeline configuration file. See `buildkite-agent pipeline upload --help` for more information.")
	}

	found := exists[0]

	// Warn about the deprecated steps.json
	if found == ".buildkite/steps.json" {
		logger.Warn("The default steps.json file has been deprecated and will be removed in v2.2. Please rename to .buildkite/pipeline.json and wrap the steps array in a `steps` property: { \"steps\": [ ... ] }")
	}

	// Read the default file
	filename = path.Base(found)
	input, err = ioutil.ReadFile(found)
	if err != nil {
		logger.Fatal("Failed to read file %s: %s", found, err)
	}
}

// Create the API client
client := agent.APIClient{
	Endpoint: cfg.Endpoint,
	Token:    cfg.AgentAccessToken,
}.Create()
func (a *ArtifactUploader) upload(artifacts []*api.Artifact) error {
	var uploader Uploader

	// Determine what uploader to use
	if a.Destination != "" {
		if strings.HasPrefix(a.Destination, "s3://") {
			uploader = new(S3Uploader)
		} else {
			return errors.New("Unknown upload destination: " + a.Destination)
		}
	} else {
		uploader = new(FormUploader)
	}

	// Setup the uploader
	err := uploader.Setup(a.Destination, a.APIClient.DebugHTTP)
	if err != nil {
		return err
	}

	// Set the URLs of the artifacts based on the uploader
	for _, artifact := range artifacts {
		artifact.URL = uploader.URL(artifact)
	}

	// Create the artifacts on Buildkite
	batchCreator := ArtifactBatchCreator{
		APIClient:         a.APIClient,
		JobID:             a.JobID,
		Artifacts:         artifacts,
		UploadDestination: a.Destination,
	}

	artifacts, err = batchCreator.Create()
	if err != nil {
		return err
	}

	// Prepare a concurrency pool to upload the artifacts
	p := pool.New(pool.MaxConcurrencyLimit)
	errors := []error{}

	// Create a wait group so we can make sure the uploader waits for all
	// the artifact states to upload before finishing
	var stateUploaderWaitGroup sync.WaitGroup
	stateUploaderWaitGroup.Add(1)

	// A map to keep track of artifact states and how many we've uploaded.
	// Access to the map is guarded by the pool's lock, since the upload
	// goroutines write to it while the state-uploading routine reads it.
	artifactsStates := make(map[string]string)
	artifactStatesUploaded := 0

	// Spin up a goroutine that'll upload artifact statuses every few
	// seconds in batches
	go func() {
		for artifactStatesUploaded < len(artifacts) {
			statesToUpload := make(map[string]string)

			// Grab all the states we need to upload, and remove
			// them from the tracking map
			p.Lock()
			for id, state := range artifactsStates {
				statesToUpload[id] = state
				delete(artifactsStates, id)
			}
			p.Unlock()

			if len(statesToUpload) > 0 {
				artifactStatesUploaded += len(statesToUpload)
				for id, state := range statesToUpload {
					logger.Debug("Artifact `%s` has state `%s`", id, state)
				}

				// Update the states of the artifacts in bulk.
				err = retry.Do(func(s *retry.Stats) error {
					_, err = a.APIClient.Artifacts.Update(a.JobID, statesToUpload)
					if err != nil {
						logger.Warn("%s (%s)", err, s)
					}

					return err
				}, &retry.Config{Maximum: 10, Interval: 1 * time.Second})

				if err != nil {
					logger.Error("Error uploading artifact states: %s", err)

					// Track the error that was raised
					p.Lock()
					errors = append(errors, err)
					p.Unlock()
				}

				logger.Debug("Uploaded %d artifact states (%d/%d)", len(statesToUpload), artifactStatesUploaded, len(artifacts))
			}

			// Check again for states to upload in a few seconds
			time.Sleep(1 * time.Second)
		}

		stateUploaderWaitGroup.Done()
	}()

	for _, artifact := range artifacts {
		// Create new instance of the artifact for the goroutine
		// See: http://golang.org/doc/effective_go.html#channels
		artifact := artifact

		p.Spawn(func() {
			// Show a nice message that we're starting to upload the file
			logger.Info("Uploading \"%s\" %d bytes", artifact.Path, artifact.FileSize)

			// Upload the artifact and then set the state depending
			// on whether or not it passed. We'll retry the upload
			// a couple of times before giving up. The error is
			// declared locally so the workers don't race on the
			// shared `err` above.
			err := retry.Do(func(s *retry.Stats) error {
				err := uploader.Upload(artifact)
				if err != nil {
					logger.Warn("%s (%s)", err, s)
				}

				return err
			}, &retry.Config{Maximum: 10, Interval: 1 * time.Second})

			var state string

			// Did the upload eventually fail?
			if err != nil {
				logger.Error("Error uploading artifact \"%s\": %s", artifact.Path, err)

				// Track the error that was raised
				p.Lock()
				errors = append(errors, err)
				p.Unlock()

				state = "error"
			} else {
				state = "finished"
			}

			// Record the state under the pool's lock, since the
			// state-uploading goroutine reads this map concurrently.
			p.Lock()
			artifactsStates[artifact.ID] = state
			p.Unlock()
		})
	}

	// Wait for the pool to finish
	p.Wait()

	// Wait for the statuses to finish uploading
	stateUploaderWaitGroup.Wait()

	if len(errors) > 0 {
		logger.Fatal("There were errors with uploading some of the artifacts")
	}

	return nil
}
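// The poll-and-drain pattern above can be isolated into a small standalone
// model (hypothetical names, a sync.Mutex standing in for the pool's lock):
// workers record states as they finish, and a single uploader periodically
// drains everything accumulated so far, so each state is sent exactly once.
type stateBatcherSketch struct {
	mu     sync.Mutex
	states map[string]string
}

func newStateBatcherSketch() *stateBatcherSketch {
	return &stateBatcherSketch{states: make(map[string]string)}
}

func (b *stateBatcherSketch) Set(id, state string) {
	b.mu.Lock()
	defer b.mu.Unlock()
	b.states[id] = state
}

// Drain swaps out the accumulated states and clears the map in one locked
// step, so nothing recorded between drains is lost or duplicated.
func (b *stateBatcherSketch) Drain() map[string]string {
	b.mu.Lock()
	defer b.mu.Unlock()
	out := b.states
	b.states = make(map[string]string)
	return out
}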