Example #1
// EventSend publishes an important message out to the world.
//
// On AWS, messages are published to SNS. The Rack has an HTTP endpoint that is an SNS
// subscription; when a message is delivered, it is forwarded to all configured
// webhook services.
//
// The Rack often has a Console webhook that forwards events to Slack with
// additional formatting and filtering.
//
// Because these are important system events, they are also published to Segment
// for operational metrics.
func (p *AWSProvider) EventSend(e *structs.Event, err error) error {
	log := logger.New("ns=kernel")

	e.Status = "success"
	e.Timestamp = time.Now().UTC()

	if err != nil {
		e.Data["message"] = err.Error()
		e.Status = "error"
	}

	msg, err := json.Marshal(e)
	if err != nil {
		helpers.Error(log, err) // report internal errors to Rollbar
		return err
	}

	fmt.Printf("aws EventSend msg=%q\n", msg)

	// Publish Event to SNS
	resp, err := p.sns().Publish(&sns.PublishInput{
		Message:   aws.String(string(msg)), // Required
		Subject:   aws.String(e.Action),
		TargetArn: aws.String(os.Getenv("NOTIFICATION_TOPIC")),
	})
	if err != nil {
		helpers.Error(log, err) // report internal errors to Rollbar
		return err
	}

	log.At("EventSend").Log("message-id=%q", *resp.MessageId)

	// report event to Segment
	params := map[string]interface{}{
		"action": e.Action,
		"status": e.Status,
	}

	for k, v := range e.Data {
		params[k] = v
	}

	helpers.TrackEvent("event", params)

	return nil
}
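A hypothetical call site for EventSend, assuming structs.Event carries an Action name and a string-keyed Data map as used above; the action name and data values here are illustrative, not taken from the examples:

// Sketch only: the action name and data keys are assumptions.
event := &structs.Event{
	Action: "app:create",
	Data:   map[string]string{"app": "example"},
}

// Passing a non-nil error as the second argument flips the status to
// "error" and records the message in the event data.
if err := p.EventSend(event, nil); err != nil {
	log.Error(err)
}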
Example #2
func recovery(rw http.ResponseWriter, r *http.Request, next http.HandlerFunc) {
	defer recoverWith(func(err error) {
		log := logger.New("ns=kernel").At("panic")
		helpers.Error(log, err)
		http.Error(rw, err.Error(), http.StatusInternalServerError)
	})

	next(rw, r)
}
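The recoverWith helper used here (and in the examples below) is not included in this set. A minimal sketch, assuming it simply recovers a panic and normalizes the recovered value into an error for the callback:

// Sketch only: the real recoverWith is not shown in these examples.
func recoverWith(f func(err error)) {
	if r := recover(); r != nil {
		if err, ok := r.(error); ok {
			f(err)
		} else {
			f(fmt.Errorf("%v", r)) // wrap non-error panic values
		}
	}
}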
Example #3
func StartCluster() {
	var log = logger.New("ns=cluster_monitor")

	defer recoverWith(func(err error) {
		helpers.Error(log, err)
	})

	for range time.Tick(5 * time.Minute) {
		log.Log("tick")

		instances := Instances{}

		err := instances.describeASG()
		if err != nil {
			log.Error(err)
			continue
		}

		err = instances.describeECS()
		if err != nil {
			log.Error(err)
			continue
		}

		// TODO: Add an instances.testDocker() call to the mission critical path

		// Test if ASG Instance is registered and connected in ECS cluster
		for k, i := range instances {
			if !i.ASG {
				// TODO: Rogue instance?! Terminate?
				continue
			}

			if !i.ECS {
				// Not registered or not connected => set Unhealthy
				_, err := models.AutoScaling().SetInstanceHealth(
					&autoscaling.SetInstanceHealthInput{
						HealthStatus:             aws.String("Unhealthy"),
						InstanceId:               aws.String(i.Id),
						ShouldRespectGracePeriod: aws.Bool(true),
					},
				)

				i.Unhealthy = true
				instances[k] = i // write the modified copy back so the flag sticks

				if err != nil {
					log.Error(err)
					continue
				}
			}
		}

		log.Log(instances.log())
	}
}
Example #4
func StartCluster() {
	var log = logger.New("ns=cluster_monitor")

	defer recoverWith(func(err error) {
		helpers.Error(log, err)
	})

	for range time.Tick(5 * time.Minute) {
		log.Logf("tick")

		instances := Instances{}

		err := instances.describeASG()
		if err != nil {
			log.Error(err)
			continue
		}

		err = instances.describeECS()
		if err != nil {
			log.Error(err)
			continue
		}

		// Test if ASG Instance is registered and connected in ECS cluster
		for k, i := range instances {
			if !i.ASG {
				// TODO: Rogue instance?! Terminate?
				continue
			}

			if !i.ECS {
				// Not registered or not connected => set Unhealthy
				_, err := models.AutoScaling().SetInstanceHealth(
					&autoscaling.SetInstanceHealthInput{
						HealthStatus:             aws.String("Unhealthy"),
						InstanceId:               aws.String(i.Id),
						ShouldRespectGracePeriod: aws.Bool(true),
					},
				)

				i.Unhealthy = true
				instances[k] = i

				if err != nil {
					log.Error(err)
					continue
				}

				// log for humans
				fmt.Printf("who=\"convox/monitor\" what=\"marked instance %s unhealthy\" why=\"ECS reported agent disconnected\"\n", i.Id)
			}
		}

		log.Logf(instances.log())
	}
}
Example #5
func (p *AWSProvider) buildWait(a *structs.App, b *structs.Build, cmd *exec.Cmd, stdout io.ReadCloser) {
	// scan all output
	out := ""
	scanner := bufio.NewScanner(stdout)

	for scanner.Scan() {
		text := scanner.Text()
		out += text + "\n"

		p.kinesis().PutRecord(&kinesis.PutRecordInput{
			Data:         []byte(text),
			PartitionKey: aws.String(strconv.FormatInt(time.Now().UnixNano(), 10)),
			StreamName:   aws.String(a.Outputs["Kinesis"]),
		})
	}
	if err := scanner.Err(); err != nil {
		helpers.Error(nil, err) // send internal error to rollbar
	}

	// and wait for a return code
	werr := cmd.Wait()

	// reload build item to get data from BuildUpdate callback
	b, err := p.BuildGet(b.App, b.Id)
	if err != nil {
		helpers.Error(nil, err) // send internal error to rollbar
		return
	}

	// an error from Wait means a non-zero exit code; consider the build failed
	if werr != nil {
		b.Status = "failed"
	}

	// save final build logs / status
	b.Logs = out
	err = p.BuildSave(b)
	if err != nil {
		helpers.Error(nil, err) // send internal error to rollbar
		return
	}
}
Example #6
func StartHeartbeat() {
	log := logger.New("ns=heartbeat")
	defer recoverWith(func(err error) {
		helpers.Error(log, err)
	})

	helpers.TrackEvent("kernel-heartbeat", "")

	for range time.Tick(1 * time.Hour) {
		helpers.TrackEvent("kernel-heartbeat", "")
	}
}
Example #7
func StartHeartbeat() {
	log := logger.New("ns=heartbeat")

	defer recoverWith(func(err error) {
		helpers.Error(log, err)
	})

	heartbeat()

	for range time.Tick(1 * time.Hour) {
		heartbeat()
	}
}
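The heartbeat helper is not shown in this set; judging from Example #6, it plausibly just wraps the tracking call so the initial beat and the hourly ticker share one code path:

// Assumed from Example #6: heartbeat reports the tracking event.
func heartbeat() {
	helpers.TrackEvent("kernel-heartbeat", "")
}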
Example #8
func (p *AWSProvider) buildRun(a *structs.App, b *structs.Build, args []string, env []string, stdin io.Reader) error {
	cmd := exec.Command("docker", args...)
	cmd.Env = env
	cmd.Stdin = stdin

	stdout, err := cmd.StdoutPipe()
	if err != nil {
		helpers.Error(nil, err) // send internal error to rollbar
		return err
	}

	// redirect cmd stderr to stdout; this must come after StdoutPipe,
	// which assigns cmd.Stdout to the pipe's write end
	cmd.Stderr = cmd.Stdout

	// start build command
	err = cmd.Start()
	if err != nil {
		helpers.Error(nil, err) // send internal error to rollbar
		return err
	}

	go p.buildWait(a, b, cmd, stdout)

	return nil
}
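A hypothetical call site; the docker arguments, environment, and input stream below are illustrative assumptions (the build-%s container name matches the one killed on timeout in Example #10):

// Illustrative only: the image name and build context are not shown above.
args := []string{"run", "--name", fmt.Sprintf("build-%s", b.Id), "convox/build"}

// stdin carries the build input; an empty reader here is just a placeholder
if err := p.buildRun(a, b, args, os.Environ(), strings.NewReader("")); err != nil {
	return err
}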
Example #9
func StartCluster() {
	var log = logger.New("ns=cluster_monitor")

	defer recoverWith(func(err error) {
		helpers.Error(log, err)
	})

Tick:
	for range time.Tick(5 * time.Minute) {
		log.Log("tick")

		// Get Rack InstanceCount Parameter
		instanceCount := 0
		// instanceType := "unknown"

		res, err := models.CloudFormation().DescribeStacks(
			&cloudformation.DescribeStacksInput{
				StackName: aws.String(os.Getenv("RACK")),
			},
		)

		if err != nil {
			log.Error(err)
			continue
		}

		for _, p := range res.Stacks[0].Parameters {
			if *p.ParameterKey == "InstanceCount" {
				c, err := strconv.Atoi(*p.ParameterValue)

				if err != nil {
					log.Error(err)
					break Tick
				}

				instanceCount = c
			}

			// if *p.ParameterKey == "InstanceType" {
			//   instanceType = *p.ParameterValue
			// }
		}

		// helpers.SendMixpanelEvent("kernel-cluster-monitor", fmt.Sprintf("count=%d type=%s", instanceCount, instanceType))

		// List and Describe ECS Container Instances
		ires, err := models.ECS().ListContainerInstances(
			&ecs.ListContainerInstancesInput{
				Cluster: aws.String(os.Getenv("CLUSTER")),
			},
		)

		if err != nil {
			log.Error(err)
			continue
		}

		dres, err := models.ECS().DescribeContainerInstances(
			&ecs.DescribeContainerInstancesInput{
				Cluster:            aws.String(os.Getenv("CLUSTER")),
				ContainerInstances: ires.ContainerInstanceArns,
			},
		)

		if err != nil {
			log.Error(err)
			continue
		}

		cInstanceIds := make([]string, 0)
		cInstanceConnections := make(map[string]bool)

		for _, i := range dres.ContainerInstances {
			cInstanceConnections[*i.Ec2InstanceId] = *i.AgentConnected

			if *i.AgentConnected {
				cInstanceIds = append(cInstanceIds, *i.Ec2InstanceId)
			}
		}

		// Get and Describe Rack ASG Resource
		resources, err := models.ListResources(os.Getenv("RACK"))

		if err != nil {
			log.Error(err)
			continue
		}

		ares, err := models.AutoScaling().DescribeAutoScalingGroups(
			&autoscaling.DescribeAutoScalingGroupsInput{
				AutoScalingGroupNames: []*string{
					aws.String(resources["Instances"].Id),
				},
			},
		)

		if err != nil {
			log.Error(err)
			continue
		}

		// Test if ASG Instance is registered and connected in ECS cluster

		aInstanceIds := []string{}
		uInstanceIds := []string{}

		for _, i := range ares.AutoScalingGroups[0].Instances {
			if connected, exists := cInstanceConnections[*i.InstanceId]; connected && exists {
				aInstanceIds = append(aInstanceIds, *i.InstanceId)
			} else {
				// Not registered or not connected => set Unhealthy
				if *i.LifecycleState == "InService" {
					_, err := models.AutoScaling().SetInstanceHealth(
						&autoscaling.SetInstanceHealthInput{
							HealthStatus:             aws.String("Unhealthy"),
							InstanceId:               aws.String(*i.InstanceId),
							ShouldRespectGracePeriod: aws.Bool(true),
						},
					)

					if err != nil {
						log.Error(err)
						continue
					}

					uInstanceIds = append(uInstanceIds, *i.InstanceId)
				}
			}
		}

		sort.Strings(aInstanceIds)
		sort.Strings(cInstanceIds)
		sort.Strings(uInstanceIds)

		// if len(uInstanceIds) > 0 {
		//   helpers.SendMixpanelEvent("kernel-cluster-monitor-mark", strings.Join(uInstanceIds, ","))
		// }

		log.Log("InstanceCount=%v connected='%v' healthy='%v' marked='%s'", instanceCount, strings.Join(cInstanceIds, ","), strings.Join(aInstanceIds, ","), strings.Join(uInstanceIds, ","))
	}
}
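The per-instance health check here is the same logic shown in Examples #3 and #4, which factor the ASG and ECS queries into Instances.describeASG and Instances.describeECS helpers and log a summary per tick.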
Example #10
func (p *AWSProvider) buildWait(a *structs.App, b *structs.Build, cmd *exec.Cmd, stdout io.ReadCloser) {
	// scan all output
	scanner := bufio.NewScanner(stdout)
	out := ""
	for scanner.Scan() {
		text := scanner.Text()
		out += text + "\n"
	}
	if err := scanner.Err(); err != nil {
		helpers.Error(nil, err) // send internal error to rollbar
	}

	var cmdStatus string
	waitErr := make(chan error)
	timeout := time.After(1 * time.Hour)

	go func() {
		err := cmd.Wait()

		// only an exit-status error marks the build failed; any other Wait
		// error (or a nil error) is treated as success here
		switch err.(type) {
		case *exec.ExitError:
			waitErr <- err
		default:
			waitErr <- nil
		}
	}()

	select {

	case werr := <-waitErr:
		// Wait / return code are errors, consider the build failed
		if werr != nil {
			cmdStatus = "failed"
		}

	case <-timeout:
		cmdStatus = "timeout"
		// force kill the build container since it's taking way too long
		killCmd := exec.Command("docker", "kill", fmt.Sprintf("build-%s", b.Id))
		killCmd.Start()
	}

	// reload build item to get data from BuildUpdate callback
	b, err := p.BuildGet(b.App, b.Id)
	if err != nil {
		helpers.Error(nil, err) // send internal error to rollbar
		return
	}

	if cmdStatus != "" { // Careful not to override the status set by BuildUpdate
		b.Status = cmdStatus
	}

	// save final build logs / status
	b.Logs = out
	err = p.BuildSave(b)
	if err != nil {
		helpers.Error(nil, err) // send internal error to rollbar
		return
	}
}
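Compared with Example #5, this version bounds cmd.Wait with a one-hour timeout, force-kills the runaway build container on expiry, and takes care not to override a status already set by the BuildUpdate callback before saving the final logs.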