Example #1
func (s *stepUploadScript) Run(state multistep.StateBag) multistep.StepAction {
	ctx := state.Get("ctx").(gocontext.Context)
	buildJob := state.Get("buildJob").(Job)

	instance := state.Get("instance").(backend.Instance)
	script := state.Get("script").([]byte)

	ctx, cancel := gocontext.WithTimeout(ctx, s.uploadTimeout)
	defer cancel()

	err := instance.UploadScript(ctx, script)
	if err != nil {
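		// The upload failed: record a metric (tagging stale-VM errors), requeue the job, and halt this step.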
		errMetric := "worker.job.upload.error"
		if err == backend.ErrStaleVM {
			errMetric += ".stalevm"
		}
		metrics.Mark(errMetric)

		context.LoggerFromContext(ctx).WithField("err", err).Error("couldn't upload script, attempting requeue")

		err := buildJob.Requeue()
		if err != nil {
			context.LoggerFromContext(ctx).WithField("err", err).Error("couldn't requeue job")
		}

		return multistep.ActionHalt
	}

	context.LoggerFromContext(ctx).Info("uploaded script")

	return multistep.ActionContinue
}
Example #2
func (i *gceInstance) Stop(ctx gocontext.Context) error {
	logger := context.LoggerFromContext(ctx)
	state := &multistep.BasicStateBag{}

	c := &gceInstanceStopContext{
		ctx:     ctx,
		errChan: make(chan error),
	}

	runner := &multistep.BasicRunner{
		Steps: []multistep.Step{
			&gceInstanceStopMultistepWrapper{c: c, f: i.stepDeleteInstance},
			&gceInstanceStopMultistepWrapper{c: c, f: i.stepWaitForInstanceDeleted},
		},
	}

	logger.WithField("instance", i.instance.Name).Info("deleting instance")
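	// Run the delete and wait steps in the background; the select below waits for an error or context cancellation.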
	go runner.Run(state)

	logger.Debug("selecting over error and done channels")
	select {
	case err := <-c.errChan:
		return err
	case <-ctx.Done():
		if ctx.Err() == gocontext.DeadlineExceeded {
			metrics.Mark("worker.vm.provider.gce.delete.timeout")
		}
		return ctx.Err()
	}
}
Example #3
func (b *blueBoxProvider) Start(ctx gocontext.Context, startAttributes *StartAttributes) (Instance, error) {
	password := generatePassword()
	params := goblueboxapi.BlockParams{
		Product:  b.cfg.Get("PRODUCT_ID"),
		Template: b.templateIDForLanguageGroup(startAttributes.Language, startAttributes.Group),
		Location: b.cfg.Get("LOCATION_ID"),
		Hostname: fmt.Sprintf("testing-bb-%s", uuid.NewRandom()),
		Username: "******",
		Password: password,
		IPv6Only: b.cfg.Get("IPV6_ONLY") == "true",
	}

	startBooting := time.Now()

	block, err := b.client.Blocks.Create(params)
	if err != nil {
		return nil, err
	}

	blockReady := make(chan *goblueboxapi.Block)
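	// Poll the Blue Box API every 5 seconds until the block reports a "running" status.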
	go func(id string) {
		for {
			b, err := b.client.Blocks.Get(id)
			if err == nil && b.Status == "running" {
				blockReady <- b
				return
			}

			time.Sleep(5 * time.Second)
		}
	}(block.ID)

	select {
	case block := <-blockReady:
		metrics.TimeSince("worker.vm.provider.bluebox.boot", startBooting)
		return &blueBoxInstance{
			client:   b.client,
			block:    block,
			password: password,
		}, nil
	case <-ctx.Done():
		if block != nil {
			err := b.client.Blocks.Destroy(block.ID)
			if err != nil {
				context.LoggerFromContext(ctx).WithField("block", block).WithField("err", err).Error("could not destroy block")
			}
		}

		if ctx.Err() == gocontext.DeadlineExceeded {
			metrics.Mark("worker.vm.provider.bluebox.boot.timeout")
		}
		return nil, ctx.Err()
	}
}
Example #4
func (j *amqpJob) Requeue() error {
	metrics.Mark("worker.job.requeue")

	err := j.sendStateUpdate("job:test:reset", map[string]interface{}{
		"id":    j.Payload().Job.ID,
		"state": "reset",
	})
	if err != nil {
		return err
	}

	return j.delivery.Ack(false)
}
Example #5
func (p *gceProvider) Start(ctx gocontext.Context, startAttributes *StartAttributes) (Instance, error) {
	logger := context.LoggerFromContext(ctx)

	state := &multistep.BasicStateBag{}

	c := &gceStartContext{
		startAttributes: startAttributes,
		ctx:             ctx,
		instChan:        make(chan Instance),
		errChan:         make(chan error),
	}

	runner := &multistep.BasicRunner{
		Steps: []multistep.Step{
			&gceStartMultistepWrapper{c: c, f: p.stepGetImage},
			&gceStartMultistepWrapper{c: c, f: p.stepRenderScript},
			&gceStartMultistepWrapper{c: c, f: p.stepInsertInstance},
			&gceStartMultistepWrapper{c: c, f: p.stepWaitForInstanceIP},
		},
	}

	abandonedStart := false

	defer func(c *gceStartContext) {
		if c.instance != nil && abandonedStart {
			p.apiRateLimit()
			_, _ = p.client.Instances.Delete(p.projectID, p.ic.Zone.Name, c.instance.Name).Do()
		}
	}(c)

	logger.Info("starting instance")
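	// Run the start steps in the background; the select below waits for an instance, an error, or cancellation.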
	go runner.Run(state)

	logger.Debug("selecting over instance, error, and done channels")
	select {
	case inst := <-c.instChan:
		return inst, nil
	case err := <-c.errChan:
		abandonedStart = true
		return nil, err
	case <-ctx.Done():
		if ctx.Err() == gocontext.DeadlineExceeded {
			metrics.Mark("worker.vm.provider.gce.boot.timeout")
		}
		abandonedStart = true
		return nil, ctx.Err()
	}
}
Example #6
func (i *gceInstance) RunScript(ctx gocontext.Context, output io.Writer) (*RunResult, error) {
	client, err := i.sshClient(ctx)
	if err != nil {
		return &RunResult{Completed: false}, err
	}
	defer client.Close()

	session, err := client.NewSession()
	if err != nil {
		return &RunResult{Completed: false}, err
	}
	defer session.Close()

	err = session.RequestPty("xterm", 40, 80, ssh.TerminalModes{})
	if err != nil {
		return &RunResult{Completed: false}, err
	}

	session.Stdout = output
	session.Stderr = output

	err = session.Run("bash ~/build.sh")

	preempted, googleErr := i.isPreempted(ctx)
	if googleErr != nil {
		context.LoggerFromContext(ctx).WithField("err", googleErr).Error("couldn't determine if instance was preempted")
		// could not get answer from google
		// requeue just in case
		return &RunResult{Completed: false}, googleErr
	}
	if preempted {
		metrics.Mark("travis.worker.gce.preempted-instances")
		return &RunResult{Completed: false}, nil
	}

	if err == nil {
		return &RunResult{Completed: true, ExitCode: 0}, nil
	}

	switch err := err.(type) {
	case *ssh.ExitError:
		return &RunResult{Completed: true, ExitCode: uint8(err.ExitStatus())}, nil
	default:
		return &RunResult{Completed: false}, err
	}
}
Example #7
func (j *fileJob) Requeue() error {
	metrics.Mark("worker.job.requeue")

	var err error

	for _, fname := range []string{
		j.receivedFile,
		j.startedFile,
		j.finishedFile,
	} {
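		// Move whichever state file exists back to the "created" file; the first successful rename requeues the job.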
		err = os.Rename(fname, j.createdFile)
		if err == nil {
			return nil
		}
	}

	return err
}
Example #8
func (i *blueBoxInstance) sshClient(ctx gocontext.Context) (*ssh.Client, error) {
	if len(i.block.IPs) == 0 {
		return nil, errNoBlueBoxIP
	}

	client, err := ssh.Dial("tcp6", fmt.Sprintf("[%s]:22", i.block.IPs[0].Address), &ssh.ClientConfig{
		User: "******",
		Auth: []ssh.AuthMethod{
			ssh.Password(i.password),
		},
	})

	if err != nil {
		metrics.Mark("worker.vm.provider.bluebox.ssh.error")
		context.LoggerFromContext(ctx).WithField("block", i.block).WithField("vsh_id", i.block.VSHID).WithField("err", err).Error("error connecting to SSH")
	}

	return client, err
}
Example #9
func (p *gceProvider) stepWaitForInstanceIP(c *gceStartContext) multistep.StepAction {
	logger := context.LoggerFromContext(c.ctx)

	logger.WithFields(logrus.Fields{
		"duration": p.bootPrePollSleep,
	}).Debug("sleeping before first checking instance insert operation")

	time.Sleep(p.bootPrePollSleep)

	zoneOpCall := p.client.ZoneOperations.Get(p.projectID, p.ic.Zone.Name, c.instanceInsertOp.Name)

	for {
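		// Poll the instance insert operation until it reports RUNNING or DONE, sending any error on c.errChan.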
		metrics.Mark("worker.vm.provider.gce.boot.poll")

		p.apiRateLimit()
		newOp, err := zoneOpCall.Do()
		if err != nil {
			c.errChan <- err
			return multistep.ActionHalt
		}

		if newOp.Status == "RUNNING" || newOp.Status == "DONE" {
			if newOp.Error != nil {
				c.errChan <- &gceOpError{Err: newOp.Error}
				return multistep.ActionHalt
			}

			logger.WithFields(logrus.Fields{
				"status": newOp.Status,
				"name":   c.instanceInsertOp.Name,
			}).Debug("instance is ready")

			c.instChan <- &gceInstance{
				client:   p.client,
				provider: p,
				instance: c.instance,
				ic:       p.ic,

				authUser: "******",

				projectID: p.projectID,
				imageName: c.image.Name,

				startupDuration: time.Now().UTC().Sub(c.bootStart),
			}
			return multistep.ActionContinue
		}

		if newOp.Error != nil {
			logger.WithFields(logrus.Fields{
				"err":  newOp.Error,
				"name": c.instanceInsertOp.Name,
			}).Error("encountered an error while waiting for instance insert operation")

			c.errChan <- &gceOpError{Err: newOp.Error}
			return multistep.ActionHalt
		}

		logger.WithFields(logrus.Fields{
			"status":   newOp.Status,
			"name":     c.instanceInsertOp.Name,
			"duration": p.bootPollSleep,
		}).Debug("sleeping before checking instance insert operation")

		time.Sleep(p.bootPollSleep)
	}
}
Example #10
func (p *jupiterBrainProvider) Start(ctx context.Context, startAttributes *StartAttributes) (Instance, error) {
	u, err := p.baseURL.Parse("instances")
	if err != nil {
		return nil, err
	}

	imageName := p.getImageName(startAttributes)

	if imageName == "" {
		return nil, fmt.Errorf("no image alias for %#v", startAttributes)
	}

	workerctx.LoggerFromContext(ctx).WithFields(logrus.Fields{
		"image_name": imageName,
		"osx_image":  startAttributes.OsxImage,
		"language":   startAttributes.Language,
		"dist":       startAttributes.Dist,
		"group":      startAttributes.Group,
		"os":         startAttributes.OS,
	}).Info("selected image name")

	startBooting := time.Now()

	bodyPayload := map[string]map[string]string{
		"data": {
			"type":       "instances",
			"base-image": imageName,
		},
	}

	jsonBody, err := json.Marshal(bodyPayload)
	if err != nil {
		return nil, err
	}

	req, err := http.NewRequest("POST", u.String(), bytes.NewReader(jsonBody))
	if err != nil {
		return nil, err
	}
	req.Header.Set("Content-Type", "application/vnd.api+json")

	resp, err := p.httpDo(req)
	if err != nil {
		return nil, err
	}
	// Drain and close the response body so the underlying connection can be reused.
	defer resp.Body.Close()
	defer io.Copy(ioutil.Discard, resp.Body)

	if c := resp.StatusCode; c < 200 || c >= 300 {
		body, _ := ioutil.ReadAll(resp.Body)
		return nil, fmt.Errorf("expected 2xx from Jupiter Brain API, got %d (error: %s)", c, body)
	}

	dataPayload := &jupiterBrainDataResponse{}
	err = json.NewDecoder(resp.Body).Decode(dataPayload)
	if err != nil {
		workerctx.LoggerFromContext(ctx).WithFields(logrus.Fields{
			"err":     err,
			"payload": dataPayload,
			"body":    resp.Body,
		}).Error("couldn't decode created payload")
		return nil, fmt.Errorf("couldn't decode created payload: %s", err)
	}

	payload := dataPayload.Data[0]

	instanceReady := make(chan *jupiterBrainInstancePayload, 1)
	errChan := make(chan error, 1)
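	// Poll the Jupiter Brain API until the instance has an IPv4 address and accepts TCP connections on port 22.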
	go func(id string) {
		u, err := p.baseURL.Parse(fmt.Sprintf("instances/%s", url.QueryEscape(id)))
		if err != nil {
			errChan <- err
			return
		}

		req, err := http.NewRequest("GET", u.String(), nil)
		if err != nil {
			errChan <- err
			return
		}

		for {
			resp, err := p.httpDo(req)
			if err != nil {
				errChan <- err
				return
			}

			if resp.StatusCode != 200 {
				body, _ := ioutil.ReadAll(resp.Body)
				errChan <- fmt.Errorf("unknown status code: %d, expected 200 (body: %q)", resp.StatusCode, string(body))
				return
			}

			dataPayload := &jupiterBrainDataResponse{}
			err = json.NewDecoder(resp.Body).Decode(dataPayload)
			if err != nil {
				errChan <- fmt.Errorf("couldn't decode refresh payload: %s", err)
				return
			}
			payload := dataPayload.Data[0]

			_, _ = io.Copy(ioutil.Discard, resp.Body)
			_ = resp.Body.Close()

			var ip net.IP
			for _, ipString := range payload.IPAddresses {
				curIP := net.ParseIP(ipString)
				if curIP.To4() != nil {
					ip = curIP
					break
				}
			}

			if ip == nil {
				time.Sleep(p.bootPollSleep)
				continue
			}

			conn, err := net.Dial("tcp", fmt.Sprintf("%s:22", ip.String()))
			if conn != nil {
				conn.Close()
			}

			if err == nil {
				instanceReady <- payload
				return
			}

			time.Sleep(p.bootPollSleep)
		}
	}(payload.ID)

	select {
	case payload := <-instanceReady:
		metrics.TimeSince("worker.vm.provider.jupiterbrain.boot", startBooting)
		normalizedImageName := string(metricNameCleanRegexp.ReplaceAll([]byte(imageName), []byte("-")))
		metrics.TimeSince(fmt.Sprintf("worker.vm.provider.jupiterbrain.boot.image.%s", normalizedImageName), startBooting)
		workerctx.LoggerFromContext(ctx).WithField("instance_uuid", payload.ID).Info("booted instance")
		return &jupiterBrainInstance{
			payload:  payload,
			provider: p,
		}, nil
	case err := <-errChan:
		instance := &jupiterBrainInstance{
			payload:  payload,
			provider: p,
		}
		instance.Stop(ctx)

		return nil, err
	case <-ctx.Done():
		if ctx.Err() == context.DeadlineExceeded {
			metrics.Mark("worker.vm.provider.jupiterbrain.boot.timeout")
		}

		instance := &jupiterBrainInstance{
			payload:  payload,
			provider: p,
		}
		instance.Stop(ctx)

		return nil, ctx.Err()
	}
}
Example #11
func (p *gceProvider) Start(ctx gocontext.Context, startAttributes *StartAttributes) (Instance, error) {
	logger := context.LoggerFromContext(ctx)

	image, err := p.getImage(ctx, startAttributes)
	if err != nil {
		return nil, err
	}

	scriptBuf := bytes.Buffer{}
	err = gceStartupScript.Execute(&scriptBuf, p.ic)
	if err != nil {
		return nil, err
	}

	inst := p.buildInstance(startAttributes, image.SelfLink, scriptBuf.String())

	logger.WithFields(logrus.Fields{
		"instance": inst,
	}).Debug("inserting instance")
	op, err := p.client.Instances.Insert(p.projectID, p.ic.Zone.Name, inst).Do()
	if err != nil {
		return nil, err
	}

	abandonedStart := false

	defer func() {
		if abandonedStart {
			_, _ = p.client.Instances.Delete(p.projectID, p.ic.Zone.Name, inst.Name).Do()
		}
	}()

	startBooting := time.Now()

	var instChan chan *compute.Instance

	instanceReady := make(chan *compute.Instance)
	instChan = instanceReady

	errChan := make(chan error)
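	// Poll the zone operation in the background until the insert is DONE, then deliver the instance on instanceReady (errors go to errChan).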
	go func() {
		for {
			newOp, err := p.client.ZoneOperations.Get(p.projectID, p.ic.Zone.Name, op.Name).Do()
			if err != nil {
				errChan <- err
				return
			}

			if newOp.Status == "DONE" {
				if newOp.Error != nil {
					errChan <- &gceOpError{Err: newOp.Error}
					return
				}

				logger.WithFields(logrus.Fields{
					"status": newOp.Status,
					"name":   op.Name,
				}).Debug("instance is ready")

				instanceReady <- inst
				return
			}

			if newOp.Error != nil {
				logger.WithFields(logrus.Fields{
					"err":  newOp.Error,
					"name": op.Name,
				}).Error("encountered an error while waiting for instance insert operation")

				errChan <- &gceOpError{Err: newOp.Error}
				return
			}

			logger.WithFields(logrus.Fields{
				"status": newOp.Status,
				"name":   op.Name,
			}).Debug("sleeping before checking instance insert operation")

			time.Sleep(p.bootPollSleep)
		}
	}()

	if p.instanceGroup != "" {
		logger.WithFields(logrus.Fields{
			"instance":       inst,
			"instance_group": p.instanceGroup,
		}).Debug("instance group is non-empty, adding instance to group")

		origInstanceReady := instanceReady
		instChan = make(chan *compute.Instance)

		err = func() error {
			for {
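				// Wait for the inserted instance to be ready (or for cancellation) before adding it to the instance group.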
				select {
				case readyInst := <-origInstanceReady:
					inst = readyInst
					logger.WithFields(logrus.Fields{
						"instance":       inst,
						"instance_group": p.instanceGroup,
					}).Debug("inserting instance into group")
					return nil
				case <-ctx.Done():
					if ctx.Err() == gocontext.DeadlineExceeded {
						metrics.Mark("worker.vm.provider.gce.boot.timeout")
					}
					abandonedStart = true

					return ctx.Err()
				default:
					logger.Debug("sleeping while waiting for instance to be ready")
					time.Sleep(p.bootPollSleep)
				}
			}
		}()

		if err != nil {
			return nil, err
		}

		inst, err = p.client.Instances.Get(p.projectID, p.ic.Zone.Name, inst.Name).Do()
		if err != nil {
			return nil, err
		}

		ref := &compute.InstanceReference{
			Instance: inst.SelfLink,
		}

		logger.WithFields(logrus.Fields{
			"ref":                ref,
			"instance_self_link": inst.SelfLink,
		}).Debug("inserting instance into group with ref")

		op, err := p.client.InstanceGroups.AddInstances(p.projectID, p.ic.Zone.Name, p.instanceGroup, &compute.InstanceGroupsAddInstancesRequest{
			Instances: []*compute.InstanceReference{ref},
		}).Do()

		if err != nil {
			abandonedStart = true
			return nil, err
		}

		logger.WithFields(logrus.Fields{
			"instance":       inst,
			"instance_group": p.instanceGroup,
		}).Debug("starting goroutine to poll for instance group addition")

		go func() {
			for {
				newOp, err := p.client.ZoneOperations.Get(p.projectID, p.ic.Zone.Name, op.Name).Do()
				if err != nil {
					errChan <- err
					return
				}

				if newOp.Status == "DONE" {
					if newOp.Error != nil {
						errChan <- &gceOpError{Err: newOp.Error}
						return
					}

					instChan <- inst
					return
				}

				if newOp.Error != nil {
					logger.WithFields(logrus.Fields{
						"err":  newOp.Error,
						"name": op.Name,
					}).Error("encountered an error while waiting for instance group addition operation")

					errChan <- &gceOpError{Err: newOp.Error}
					return
				}

				logger.WithFields(logrus.Fields{
					"status": newOp.Status,
					"name":   op.Name,
				}).Debug("sleeping before checking instance group addition operation")

				time.Sleep(p.bootPollSleep)
			}
		}()
	}

	logger.Debug("selecting over instance, error, and done channels")
	select {
	case inst := <-instChan:
		metrics.TimeSince("worker.vm.provider.gce.boot", startBooting)
		return &gceInstance{
			client:   p.client,
			provider: p,
			instance: inst,
			ic:       p.ic,

			authUser: "******",

			projectID: p.projectID,
			imageName: image.Name,
		}, nil
	case err := <-errChan:
		abandonedStart = true
		return nil, err
	case <-ctx.Done():
		if ctx.Err() == gocontext.DeadlineExceeded {
			metrics.Mark("worker.vm.provider.gce.boot.timeout")
		}
		abandonedStart = true
		return nil, ctx.Err()
	}
}
Example #12
func (p *dockerProvider) Start(ctx gocontext.Context, startAttributes *StartAttributes) (Instance, error) {
	logger := context.LoggerFromContext(ctx)

	cpuSets, err := p.checkoutCPUSets()
	if err != nil && cpuSets != "" {
		return nil, err
	}

	imageID, imageName, err := p.imageForLanguage(startAttributes.Language)
	if err != nil {
		return nil, err
	}

	dockerConfig := &docker.Config{
		Cmd:      p.runCmd,
		Image:    imageID,
		Memory:   int64(p.runMemory),
		Hostname: fmt.Sprintf("testing-docker-%s", uuid.NewRandom()),
	}

	dockerHostConfig := &docker.HostConfig{
		Privileged: p.runPrivileged,
		Memory:     int64(p.runMemory),
	}

	if cpuSets != "" {
		dockerConfig.CPUSet = cpuSets
		dockerHostConfig.CPUSet = cpuSets
	}

	logger.WithFields(logrus.Fields{
		"config":      fmt.Sprintf("%#v", dockerConfig),
		"host_config": fmt.Sprintf("%#v", dockerHostConfig),
	}).Debug("starting container")

	container, err := p.client.CreateContainer(docker.CreateContainerOptions{
		Config:     dockerConfig,
		HostConfig: dockerHostConfig,
	})

	if err != nil {
		if container != nil {
			err := p.client.RemoveContainer(docker.RemoveContainerOptions{
				ID:            container.ID,
				RemoveVolumes: true,
				Force:         true,
			})
			if err != nil {
				logger.WithField("err", err).Error("couldn't remove container after create failure")
			}
		}

		return nil, err
	}

	startBooting := time.Now()

	err = p.client.StartContainer(container.ID, dockerHostConfig)
	if err != nil {
		return nil, err
	}

	containerReady := make(chan *docker.Container)
	errChan := make(chan error)
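	// Poll the Docker API until the container is reported as running.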
	go func(id string) {
		for {
			container, err := p.client.InspectContainer(id)
			if err != nil {
				errChan <- err
				return
			}

			if container.State.Running {
				containerReady <- container
				return
			}
		}
	}(container.ID)

	select {
	case container := <-containerReady:
		metrics.TimeSince("worker.vm.provider.docker.boot", startBooting)
		return &dockerInstance{
			client:    p.client,
			provider:  p,
			container: container,
			imageName: imageName,
		}, nil
	case err := <-errChan:
		return nil, err
	case <-ctx.Done():
		if ctx.Err() == gocontext.DeadlineExceeded {
			metrics.Mark("worker.vm.provider.docker.boot.timeout")
		}
		return nil, ctx.Err()
	}
}