func (s *stepUploadScript) Run(state multistep.StateBag) multistep.StepAction { ctx := state.Get("ctx").(gocontext.Context) buildJob := state.Get("buildJob").(Job) instance := state.Get("instance").(backend.Instance) script := state.Get("script").([]byte) ctx, cancel := gocontext.WithTimeout(ctx, s.uploadTimeout) defer cancel() err := instance.UploadScript(ctx, script) if err != nil { errMetric := "worker.job.upload.error" if err == backend.ErrStaleVM { errMetric += ".stalevm" } metrics.Mark(errMetric) context.LoggerFromContext(ctx).WithField("err", err).Error("couldn't upload script, attemping requeue") err := buildJob.Requeue() if err != nil { context.LoggerFromContext(ctx).WithField("err", err).Error("couldn't requeue job") } return multistep.ActionHalt } context.LoggerFromContext(ctx).Info("uploaded script") return multistep.ActionContinue }
func (i *gceInstance) Stop(ctx gocontext.Context) error { logger := context.LoggerFromContext(ctx) state := &multistep.BasicStateBag{} c := &gceInstanceStopContext{ ctx: ctx, errChan: make(chan error), } runner := &multistep.BasicRunner{ Steps: []multistep.Step{ &gceInstanceStopMultistepWrapper{c: c, f: i.stepDeleteInstance}, &gceInstanceStopMultistepWrapper{c: c, f: i.stepWaitForInstanceDeleted}, }, } logger.WithField("instance", i.instance.Name).Info("deleting instance") go runner.Run(state) logger.Debug("selecting over error and done channels") select { case err := <-c.errChan: return err case <-ctx.Done(): if ctx.Err() == gocontext.DeadlineExceeded { metrics.Mark("worker.vm.provider.gce.delete.timeout") } return ctx.Err() } }
func (b *blueBoxProvider) Start(ctx gocontext.Context, startAttributes *StartAttributes) (Instance, error) { password := generatePassword() params := goblueboxapi.BlockParams{ Product: b.cfg.Get("PRODUCT_ID"), Template: b.templateIDForLanguageGroup(startAttributes.Language, startAttributes.Group), Location: b.cfg.Get("LOCATION_ID"), Hostname: fmt.Sprintf("testing-bb-%s", uuid.NewRandom()), Username: "******", Password: password, IPv6Only: b.cfg.Get("IPV6_ONLY") == "true", } startBooting := time.Now() block, err := b.client.Blocks.Create(params) if err != nil { return nil, err } blockReady := make(chan *goblueboxapi.Block) go func(id string) { for { b, err := b.client.Blocks.Get(id) if err == nil && b.Status == "running" { blockReady <- b return } time.Sleep(5 * time.Second) } }(block.ID) select { case block := <-blockReady: metrics.TimeSince("worker.vm.provider.bluebox.boot", startBooting) return &blueBoxInstance{ client: b.client, block: block, password: password, }, nil case <-ctx.Done(): if block != nil { err := b.client.Blocks.Destroy(block.ID) if err != nil { context.LoggerFromContext(ctx).WithField("block", block).WithField("err", err).Error("could not destroy block") } } if ctx.Err() == gocontext.DeadlineExceeded { metrics.Mark("worker.vm.provider.bluebox.boot.timeout") } return nil, ctx.Err() } }
func (j *amqpJob) Requeue() error { metrics.Mark("worker.job.requeue") err := j.sendStateUpdate("job:test:reset", map[string]interface{}{ "id": j.Payload().Job.ID, "state": "reset", }) if err != nil { return err } return j.delivery.Ack(false) }
func (p *gceProvider) Start(ctx gocontext.Context, startAttributes *StartAttributes) (Instance, error) { logger := context.LoggerFromContext(ctx) state := &multistep.BasicStateBag{} c := &gceStartContext{ startAttributes: startAttributes, ctx: ctx, instChan: make(chan Instance), errChan: make(chan error), } runner := &multistep.BasicRunner{ Steps: []multistep.Step{ &gceStartMultistepWrapper{c: c, f: p.stepGetImage}, &gceStartMultistepWrapper{c: c, f: p.stepRenderScript}, &gceStartMultistepWrapper{c: c, f: p.stepInsertInstance}, &gceStartMultistepWrapper{c: c, f: p.stepWaitForInstanceIP}, }, } abandonedStart := false defer func(c *gceStartContext) { if c.instance != nil && abandonedStart { p.apiRateLimit() _, _ = p.client.Instances.Delete(p.projectID, p.ic.Zone.Name, c.instance.Name).Do() } }(c) logger.Info("starting instance") go runner.Run(state) logger.Debug("selecting over instance, error, and done channels") select { case inst := <-c.instChan: return inst, nil case err := <-c.errChan: abandonedStart = true return nil, err case <-ctx.Done(): if ctx.Err() == gocontext.DeadlineExceeded { metrics.Mark("worker.vm.provider.gce.boot.timeout") } abandonedStart = true return nil, ctx.Err() } }
func (i *gceInstance) RunScript(ctx gocontext.Context, output io.Writer) (*RunResult, error) { client, err := i.sshClient(ctx) if err != nil { return &RunResult{Completed: false}, err } defer client.Close() session, err := client.NewSession() if err != nil { return &RunResult{Completed: false}, err } defer session.Close() err = session.RequestPty("xterm", 40, 80, ssh.TerminalModes{}) if err != nil { return &RunResult{Completed: false}, err } session.Stdout = output session.Stderr = output err = session.Run("bash ~/build.sh") preempted, googleErr := i.isPreempted(ctx) if googleErr != nil { context.LoggerFromContext(ctx).WithField("err", googleErr).Error("couldn't determine if instance was preempted") // could not get answer from google // requeue just in case return &RunResult{Completed: false}, googleErr } if preempted { metrics.Mark("travis.worker.gce.preempted-instances") return &RunResult{Completed: false}, nil } if err == nil { return &RunResult{Completed: true, ExitCode: 0}, nil } switch err := err.(type) { case *ssh.ExitError: return &RunResult{Completed: true, ExitCode: uint8(err.ExitStatus())}, nil default: return &RunResult{Completed: false}, err } }
func (j *fileJob) Requeue() error { metrics.Mark("worker.job.requeue") var err error for _, fname := range []string{ j.receivedFile, j.startedFile, j.finishedFile, } { err = os.Rename(fname, j.createdFile) if err == nil { return nil } } return err }
func (i *blueBoxInstance) sshClient(ctx gocontext.Context) (*ssh.Client, error) { if len(i.block.IPs) == 0 { return nil, errNoBlueBoxIP } client, err := ssh.Dial("tcp6", fmt.Sprintf("[%s]:22", i.block.IPs[0].Address), &ssh.ClientConfig{ User: "******", Auth: []ssh.AuthMethod{ ssh.Password(i.password), }, }) if err != nil { metrics.Mark("worker.vm.provider.bluebox.ssh.error") context.LoggerFromContext(ctx).WithField("block", i.block).WithField("vsh_id", i.block.VSHID).WithField("err", err).Error("error connecting to SSH") } return client, err }
// stepWaitForInstanceIP polls the zone operation for the instance insert
// until it reports RUNNING or DONE, then sends the booted instance on the
// start context's instance channel. API errors and operation-level errors
// are sent on the error channel and halt the pipeline.
func (p *gceProvider) stepWaitForInstanceIP(c *gceStartContext) multistep.StepAction {
	logger := context.LoggerFromContext(c.ctx)

	logger.WithFields(logrus.Fields{
		"duration": p.bootPrePollSleep,
	}).Debug("sleeping before first checking instance insert operation")

	time.Sleep(p.bootPrePollSleep)

	// The same prepared call is re-executed (.Do()) on every iteration.
	zoneOpCall := p.client.ZoneOperations.Get(p.projectID, p.ic.Zone.Name, c.instanceInsertOp.Name)

	for {
		metrics.Mark("worker.vm.provider.gce.boot.poll")
		p.apiRateLimit()
		newOp, err := zoneOpCall.Do()
		if err != nil {
			c.errChan <- err
			return multistep.ActionHalt
		}

		if newOp.Status == "RUNNING" || newOp.Status == "DONE" {
			// An operation-level error takes precedence over success.
			if newOp.Error != nil {
				c.errChan <- &gceOpError{Err: newOp.Error}
				return multistep.ActionHalt
			}

			logger.WithFields(logrus.Fields{
				"status": newOp.Status,
				"name":   c.instanceInsertOp.Name,
			}).Debug("instance is ready")

			// Hand the instance to the Start select; the send blocks until
			// Start receives (or has already returned on ctx.Done).
			c.instChan <- &gceInstance{
				client:   p.client,
				provider: p,
				instance: c.instance,
				ic:       p.ic,

				authUser: "******",

				projectID:       p.projectID,
				imageName:       c.image.Name,
				startupDuration: time.Now().UTC().Sub(c.bootStart),
			}
			return multistep.ActionContinue
		}

		// Not yet RUNNING/DONE: check for an early operation error before
		// sleeping and polling again.
		if newOp.Error != nil {
			logger.WithFields(logrus.Fields{
				"err":  newOp.Error,
				"name": c.instanceInsertOp.Name,
			}).Error("encountered an error while waiting for instance insert operation")

			c.errChan <- &gceOpError{Err: newOp.Error}
			return multistep.ActionHalt
		}

		logger.WithFields(logrus.Fields{
			"status":   newOp.Status,
			"name":     c.instanceInsertOp.Name,
			"duration": p.bootPollSleep,
		}).Debug("sleeping before checking instance insert operation")

		time.Sleep(p.bootPollSleep)
	}
}
// Start creates a Jupiter Brain instance for the image derived from
// startAttributes, then polls the API until the instance has an IPv4
// address that accepts TCP connections on port 22, returning a wrapped
// instance. On a poll error or context cancellation the instance is
// stopped (best effort) before the error is returned.
func (p *jupiterBrainProvider) Start(ctx context.Context, startAttributes *StartAttributes) (Instance, error) {
	u, err := p.baseURL.Parse("instances")
	if err != nil {
		return nil, err
	}

	imageName := p.getImageName(startAttributes)
	if imageName == "" {
		return nil, fmt.Errorf("no image alias for %#v", startAttributes)
	}

	workerctx.LoggerFromContext(ctx).WithFields(logrus.Fields{
		"image_name": imageName,
		"osx_image":  startAttributes.OsxImage,
		"language":   startAttributes.Language,
		"dist":       startAttributes.Dist,
		"group":      startAttributes.Group,
		"os":         startAttributes.OS,
	}).Info("selected image name")

	startBooting := time.Now()

	// JSON-API style creation payload.
	bodyPayload := map[string]map[string]string{
		"data": {
			"type":       "instances",
			"base-image": imageName,
		},
	}

	jsonBody, err := json.Marshal(bodyPayload)
	if err != nil {
		return nil, err
	}

	req, err := http.NewRequest("POST", u.String(), bytes.NewReader(jsonBody))
	if err != nil {
		return nil, err
	}
	req.Header.Set("Content-Type", "application/vnd.api+json")

	resp, err := p.httpDo(req)
	if err != nil {
		return nil, err
	}
	// Drain then close so the HTTP transport can reuse the connection.
	defer io.Copy(ioutil.Discard, resp.Body)
	defer resp.Body.Close()

	if c := resp.StatusCode; c < 200 || c >= 300 {
		body, _ := ioutil.ReadAll(resp.Body)
		return nil, fmt.Errorf("expected 2xx from Jupiter Brain API, got %d (error: %s)", c, body)
	}

	dataPayload := &jupiterBrainDataResponse{}
	err = json.NewDecoder(resp.Body).Decode(dataPayload)
	if err != nil {
		workerctx.LoggerFromContext(ctx).WithFields(logrus.Fields{
			"err":     err,
			"payload": dataPayload,
			"body":    resp.Body,
		}).Error("couldn't decode created payload")
		return nil, fmt.Errorf("couldn't decode created payload: %s", err)
	}

	// NOTE(review): assumes the API always returns at least one element in
	// Data — an empty slice would panic here; confirm against the API.
	payload := dataPayload.Data[0]

	instanceReady := make(chan *jupiterBrainInstancePayload, 1)
	errChan := make(chan error, 1)

	// Poller: refresh the instance until an IPv4 address answers on :22.
	// Both channels are buffered (cap 1) so this goroutine never blocks on
	// send after the outer select has returned.
	// NOTE(review): the poller does not watch ctx.Done(), so it keeps
	// polling after a cancellation until it succeeds or errors — consider
	// adding a ctx check to the loop.
	go func(id string) {
		u, err := p.baseURL.Parse(fmt.Sprintf("instances/%s", url.QueryEscape(id)))
		if err != nil {
			errChan <- err
			return
		}

		req, err := http.NewRequest("GET", u.String(), nil)
		if err != nil {
			errChan <- err
			return
		}

		for {
			resp, err := p.httpDo(req)
			if err != nil {
				errChan <- err
				return
			}

			if resp.StatusCode != 200 {
				body, _ := ioutil.ReadAll(resp.Body)
				errChan <- fmt.Errorf("unknown status code: %d, expected 200 (body: %q)", resp.StatusCode, string(body))
				return
			}

			dataPayload := &jupiterBrainDataResponse{}
			err = json.NewDecoder(resp.Body).Decode(dataPayload)
			if err != nil {
				errChan <- fmt.Errorf("couldn't decode refresh payload: %s", err)
				return
			}
			payload := dataPayload.Data[0]

			_, _ = io.Copy(ioutil.Discard, resp.Body)
			_ = resp.Body.Close()

			// Pick the first IPv4 address, if any has been assigned yet.
			var ip net.IP
			for _, ipString := range payload.IPAddresses {
				curIP := net.ParseIP(ipString)
				if curIP.To4() != nil {
					ip = curIP
					break
				}
			}

			if ip == nil {
				time.Sleep(p.bootPollSleep)
				continue
			}

			// Probe SSH reachability; success means the instance is ready.
			conn, err := net.Dial("tcp", fmt.Sprintf("%s:22", ip.String()))
			if conn != nil {
				conn.Close()
			}
			if err == nil {
				instanceReady <- payload
				return
			}

			time.Sleep(p.bootPollSleep)
		}
	}(payload.ID)

	select {
	case payload := <-instanceReady:
		metrics.TimeSince("worker.vm.provider.jupiterbrain.boot", startBooting)
		normalizedImageName := string(metricNameCleanRegexp.ReplaceAll([]byte(imageName), []byte("-")))
		metrics.TimeSince(fmt.Sprintf("worker.vm.provider.jupiterbrain.boot.image.%s", normalizedImageName), startBooting)
		workerctx.LoggerFromContext(ctx).WithField("instance_uuid", payload.ID).Info("booted instance")
		return &jupiterBrainInstance{
			payload:  payload,
			provider: p,
		}, nil
	case err := <-errChan:
		// Best-effort teardown of the created instance before returning.
		instance := &jupiterBrainInstance{
			payload:  payload,
			provider: p,
		}
		instance.Stop(ctx)
		return nil, err
	case <-ctx.Done():
		if ctx.Err() == context.DeadlineExceeded {
			metrics.Mark("worker.vm.provider.jupiterbrain.boot.timeout")
		}
		instance := &jupiterBrainInstance{
			payload:  payload,
			provider: p,
		}
		instance.Stop(ctx)
		return nil, ctx.Err()
	}
}
// Start inserts a GCE instance built from startAttributes and waits for
// the insert operation to complete. When an instance group is configured,
// the ready instance is additionally added to the group (via a second
// operation poll) before being returned. Errors and context cancellation
// mark the start as abandoned, which triggers deletion of the inserted
// instance in the deferred cleanup.
func (p *gceProvider) Start(ctx gocontext.Context, startAttributes *StartAttributes) (Instance, error) {
	logger := context.LoggerFromContext(ctx)

	image, err := p.getImage(ctx, startAttributes)
	if err != nil {
		return nil, err
	}

	// Render the startup script from the instance config template.
	scriptBuf := bytes.Buffer{}
	err = gceStartupScript.Execute(&scriptBuf, p.ic)
	if err != nil {
		return nil, err
	}

	inst := p.buildInstance(startAttributes, image.SelfLink, scriptBuf.String())

	logger.WithFields(logrus.Fields{
		"instance": inst,
	}).Debug("inserting instance")
	op, err := p.client.Instances.Insert(p.projectID, p.ic.Zone.Name, inst).Do()
	if err != nil {
		return nil, err
	}

	abandonedStart := false

	// Delete the inserted instance if we bail out before returning it.
	defer func() {
		if abandonedStart {
			_, _ = p.client.Instances.Delete(p.projectID, p.ic.Zone.Name, inst.Name).Do()
		}
	}()

	startBooting := time.Now()

	// instChan is the channel the final select receives from; it starts as
	// instanceReady and is swapped for a fresh channel when an instance
	// group addition has to complete first.
	var instChan chan *compute.Instance

	instanceReady := make(chan *compute.Instance)
	instChan = instanceReady

	errChan := make(chan error)
	// Poll the insert operation until DONE, then signal readiness.
	go func() {
		for {
			newOp, err := p.client.ZoneOperations.Get(p.projectID, p.ic.Zone.Name, op.Name).Do()
			if err != nil {
				errChan <- err
				return
			}

			if newOp.Status == "DONE" {
				// Operation-level error takes precedence over DONE status.
				if newOp.Error != nil {
					errChan <- &gceOpError{Err: newOp.Error}
					return
				}

				logger.WithFields(logrus.Fields{
					"status": newOp.Status,
					"name":   op.Name,
				}).Debug("instance is ready")

				instanceReady <- inst
				return
			}

			if newOp.Error != nil {
				logger.WithFields(logrus.Fields{
					"err":  newOp.Error,
					"name": op.Name,
				}).Error("encountered an error while waiting for instance insert operation")

				errChan <- &gceOpError{Err: newOp.Error}
				return
			}

			logger.WithFields(logrus.Fields{
				"status": newOp.Status,
				"name":   op.Name,
			}).Debug("sleeping before checking instance insert operation")

			time.Sleep(p.bootPollSleep)
		}
	}()

	if p.instanceGroup != "" {
		logger.WithFields(logrus.Fields{
			"instance":       inst,
			"instance_group": p.instanceGroup,
		}).Debug("instance group is non-empty, adding instance to group")

		// Swap in a fresh channel so the final select waits for the group
		// addition rather than the bare insert.
		origInstanceReady := instanceReady
		instChan = make(chan *compute.Instance)

		// Wait inline for the insert to finish (or ctx to end) before
		// attempting the group addition.
		// NOTE(review): the default branch makes this a sleep-poll rather
		// than a blocking select, and an error sent on the unbuffered
		// errChan by the insert poller is not received here — that send
		// blocks until the final select, leaving this loop spinning until
		// ctx is done; consider adding errChan to this select.
		err = func() error {
			for {
				select {
				case readyInst := <-origInstanceReady:
					inst = readyInst
					logger.WithFields(logrus.Fields{
						"instance":       inst,
						"instance_group": p.instanceGroup,
					}).Debug("inserting instance into group")
					return nil
				case <-ctx.Done():
					if ctx.Err() == gocontext.DeadlineExceeded {
						metrics.Mark("worker.vm.provider.gce.boot.timeout")
					}
					abandonedStart = true
					return ctx.Err()
				default:
					logger.Debug("sleeping while waiting for instance to be ready")
					time.Sleep(p.bootPollSleep)
				}
			}
		}()

		if err != nil {
			return nil, err
		}

		// Refresh the instance to get its current SelfLink.
		inst, err = p.client.Instances.Get(p.projectID, p.ic.Zone.Name, inst.Name).Do()
		if err != nil {
			return nil, err
		}

		ref := &compute.InstanceReference{
			Instance: inst.SelfLink,
		}

		logger.WithFields(logrus.Fields{
			"ref":                ref,
			"instance_self_link": inst.SelfLink,
		}).Debug("inserting instance into group with ref")

		// Note: this op shadows the outer insert op for the poller below.
		op, err := p.client.InstanceGroups.AddInstances(p.projectID, p.ic.Zone.Name, p.instanceGroup, &compute.InstanceGroupsAddInstancesRequest{
			Instances: []*compute.InstanceReference{ref},
		}).Do()
		if err != nil {
			abandonedStart = true
			return nil, err
		}

		logger.WithFields(logrus.Fields{
			"instance":       inst,
			"instance_group": p.instanceGroup,
		}).Debug("starting goroutine to poll for instance group addition")

		// Poll the group-addition operation until DONE, then deliver the
		// instance on the (swapped) instChan.
		go func() {
			for {
				newOp, err := p.client.ZoneOperations.Get(p.projectID, p.ic.Zone.Name, op.Name).Do()
				if err != nil {
					errChan <- err
					return
				}

				if newOp.Status == "DONE" {
					if newOp.Error != nil {
						errChan <- &gceOpError{Err: newOp.Error}
						return
					}

					instChan <- inst
					return
				}

				if newOp.Error != nil {
					logger.WithFields(logrus.Fields{
						"err":  newOp.Error,
						"name": op.Name,
					}).Error("encountered an error while waiting for instance group addition operation")

					errChan <- &gceOpError{Err: newOp.Error}
					return
				}

				logger.WithFields(logrus.Fields{
					"status": newOp.Status,
					"name":   op.Name,
				}).Debug("sleeping before checking instance group addition operation")

				time.Sleep(p.bootPollSleep)
			}
		}()
	}

	logger.Debug("selecting over instance, error, and done channels")
	select {
	case inst := <-instChan:
		metrics.TimeSince("worker.vm.provider.gce.boot", startBooting)
		return &gceInstance{
			client:   p.client,
			provider: p,
			instance: inst,
			ic:       p.ic,

			authUser: "******",

			projectID: p.projectID,
			imageName: image.Name,
		}, nil
	case err := <-errChan:
		abandonedStart = true
		return nil, err
	case <-ctx.Done():
		if ctx.Err() == gocontext.DeadlineExceeded {
			metrics.Mark("worker.vm.provider.gce.boot.timeout")
		}
		abandonedStart = true
		return nil, ctx.Err()
	}
}
// Start creates a Docker container configured from startAttributes,
// starts it, and returns it wrapped as an Instance once the daemon
// reports it as running, or an error / context error otherwise.
func (p *dockerProvider) Start(ctx gocontext.Context, startAttributes *StartAttributes) (Instance, error) {
	logger := context.LoggerFromContext(ctx)

	cpuSets, err := p.checkoutCPUSets()
	// NOTE(review): a checkout error with an empty cpuSets string is
	// silently ignored here — confirm this best-effort behavior is
	// intentional rather than an inverted condition.
	if err != nil && cpuSets != "" {
		return nil, err
	}

	imageID, imageName, err := p.imageForLanguage(startAttributes.Language)
	if err != nil {
		return nil, err
	}

	dockerConfig := &docker.Config{
		Cmd:      p.runCmd,
		Image:    imageID,
		Memory:   int64(p.runMemory),
		Hostname: fmt.Sprintf("testing-docker-%s", uuid.NewRandom()),
	}

	dockerHostConfig := &docker.HostConfig{
		Privileged: p.runPrivileged,
		Memory:     int64(p.runMemory),
	}

	// Pin the container to the checked-out CPU sets, if any were granted.
	if cpuSets != "" {
		dockerConfig.CPUSet = cpuSets
		dockerHostConfig.CPUSet = cpuSets
	}

	logger.WithFields(logrus.Fields{
		"config":      fmt.Sprintf("%#v", dockerConfig),
		"host_config": fmt.Sprintf("%#v", dockerHostConfig),
	}).Debug("starting container")

	container, err := p.client.CreateContainer(docker.CreateContainerOptions{
		Config:     dockerConfig,
		HostConfig: dockerHostConfig,
	})
	if err != nil {
		// Clean up any partially created container before reporting.
		if container != nil {
			err := p.client.RemoveContainer(docker.RemoveContainerOptions{
				ID:            container.ID,
				RemoveVolumes: true,
				Force:         true,
			})
			if err != nil {
				logger.WithField("err", err).Error("couldn't remove container after create failure")
			}
		}
		return nil, err
	}

	startBooting := time.Now()

	err = p.client.StartContainer(container.ID, dockerHostConfig)
	if err != nil {
		return nil, err
	}

	containerReady := make(chan *docker.Container)
	errChan := make(chan error)
	// Poll the daemon until the container is reported as running.
	// NOTE(review): this loop has no sleep (it hammers InspectContainer)
	// and no ctx.Done() check (it leaks if the outer select returns on
	// cancellation) — consider adding a poll interval and a ctx check.
	go func(id string) {
		for {
			container, err := p.client.InspectContainer(id)
			if err != nil {
				errChan <- err
				return
			}

			if container.State.Running {
				containerReady <- container
				return
			}
		}
	}(container.ID)

	select {
	case container := <-containerReady:
		metrics.TimeSince("worker.vm.provider.docker.boot", startBooting)
		return &dockerInstance{
			client:    p.client,
			provider:  p,
			container: container,
			imageName: imageName,
		}, nil
	case err := <-errChan:
		return nil, err
	case <-ctx.Done():
		if ctx.Err() == gocontext.DeadlineExceeded {
			metrics.Mark("worker.vm.provider.docker.boot.timeout")
		}
		return nil, ctx.Err()
	}
}