// EventSend publishes an important message out to the world.
//
// On AWS, messages are published to SNS. The Rack has an HTTP endpoint that is an SNS
// subscription, and when a message is delivered it forwards it to all configured
// webhook services.
//
// Often the Rack has a Console webhook which facilitates forwarding events
// to Slack with additional formatting and filtering.
//
// Because these are important system events, they are also published to Segment
// for operational metrics.
func (p *AWSProvider) EventSend(e *structs.Event, err error) error {
	log := logger.New("ns=kernel")

	e.Status = "success"
	e.Timestamp = time.Now().UTC()

	if err != nil {
		e.Data["message"] = err.Error()
		e.Status = "error"
	}

	msg, err := json.Marshal(e)
	if err != nil {
		helpers.Error(log, err) // report internal errors to Rollbar
		return err
	}

	fmt.Printf("aws EventSend msg=%q\n", msg)

	// Publish Event to SNS
	resp, err := p.sns().Publish(&sns.PublishInput{
		Message:   aws.String(string(msg)), // Required
		Subject:   aws.String(e.Action),
		TargetArn: aws.String(os.Getenv("NOTIFICATION_TOPIC")),
	})
	if err != nil {
		helpers.Error(log, err) // report internal errors to Rollbar
		return err
	}

	log.At("EventSend").Log("message-id=%q", *resp.MessageId)

	// report event to Segment
	params := map[string]interface{}{
		"action": e.Action,
		"status": e.Status,
	}
	for k, v := range e.Data {
		params[k] = v
	}
	helpers.TrackEvent("event", params)

	return nil
}
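A caller passes the event along with the error (if any) from the operation it is reporting. A minimal sketch of a hypothetical caller; the "app:create" action and the "app" data key are illustrative only, and it assumes Data is a map of strings:

// Hypothetical caller sketch: report the outcome of an app create.
// createErr is whatever error the operation returned (nil on success).
event := &structs.Event{
	Action: "app:create",
	Data:   map[string]string{"app": "example"},
}

if err := p.EventSend(event, createErr); err != nil {
	log.Error(err)
}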
func recovery(rw http.ResponseWriter, r *http.Request, next http.HandlerFunc) {
	defer recoverWith(func(err error) {
		log := logger.New("ns=kernel").At("panic")
		helpers.Error(log, err)
		http.Error(rw, err.Error(), http.StatusInternalServerError)
	})

	next(rw, r)
}
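recoverWith itself is not shown in these excerpts. A minimal sketch of the shape the callers appear to assume: it must be deferred, and it converts a recovered panic value into an error for the supplied callback.

// Sketch only: the real recoverWith may differ.
func recoverWith(f func(err error)) {
	if r := recover(); r != nil {
		if err, ok := r.(error); ok {
			f(err)
		} else {
			f(fmt.Errorf("panic: %v", r))
		}
	}
}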
func StartCluster() {
	var log = logger.New("ns=cluster_monitor")

	defer recoverWith(func(err error) {
		helpers.Error(log, err)
	})

	for _ = range time.Tick(5 * time.Minute) {
		log.Log("tick")

		instances := Instances{}

		err := instances.describeASG()
		if err != nil {
			log.Error(err)
			continue
		}

		err = instances.describeECS()
		if err != nil {
			log.Error(err)
			continue
		}

		// TODO: Add an instances.testDocker() call to the mission critical path

		// Test if ASG Instance is registered and connected in ECS cluster
		for _, i := range instances {
			if !i.ASG {
				// TODO: Rogue instance?! Terminate?
				continue
			}

			if !i.ECS {
				// Not registered or not connected => set Unhealthy
				_, err := models.AutoScaling().SetInstanceHealth(
					&autoscaling.SetInstanceHealthInput{
						HealthStatus:             aws.String("Unhealthy"),
						InstanceId:               aws.String(i.Id),
						ShouldRespectGracePeriod: aws.Bool(true),
					},
				)

				i.Unhealthy = true

				if err != nil {
					log.Error(err)
					continue
				}
			}
		}

		log.Log(instances.log())
	}
}
func StartCluster() {
	var log = logger.New("ns=cluster_monitor")

	defer recoverWith(func(err error) {
		helpers.Error(log, err)
	})

	for range time.Tick(5 * time.Minute) {
		log.Logf("tick")

		instances := Instances{}

		err := instances.describeASG()
		if err != nil {
			log.Error(err)
			continue
		}

		err = instances.describeECS()
		if err != nil {
			log.Error(err)
			continue
		}

		// Test if ASG Instance is registered and connected in ECS cluster
		for k, i := range instances {
			if !i.ASG {
				// TODO: Rogue instance?! Terminate?
				continue
			}

			if !i.ECS {
				// Not registered or not connected => set Unhealthy
				_, err := models.AutoScaling().SetInstanceHealth(
					&autoscaling.SetInstanceHealthInput{
						HealthStatus:             aws.String("Unhealthy"),
						InstanceId:               aws.String(i.Id),
						ShouldRespectGracePeriod: aws.Bool(true),
					},
				)

				i.Unhealthy = true
				instances[k] = i

				if err != nil {
					log.Error(err)
					continue
				}

				// log for humans
				fmt.Printf("who=\"convox/monitor\" what=\"marked instance %s unhealthy\" why=\"ECS reported agent disconnected\"\n", i.Id)
			}
		}

		log.Logf(instances.log())
	}
}
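The Instances collection and its describeASG/describeECS methods are not shown here. A rough sketch of the fields the loop above relies on; the field names and the map shape are assumptions.

// Sketch only: the real types may differ.
type Instance struct {
	Id        string
	ASG       bool // present in the Auto Scaling group
	ECS       bool // registered and connected in the ECS cluster
	Unhealthy bool
}

type Instances map[string]Instance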
func (p *AWSProvider) buildWait(a *structs.App, b *structs.Build, cmd *exec.Cmd, stdout io.ReadCloser) {
	// scan all output
	out := ""
	scanner := bufio.NewScanner(stdout)

	for scanner.Scan() {
		text := scanner.Text()
		out += text + "\n"

		p.kinesis().PutRecord(&kinesis.PutRecordInput{
			Data: []byte(text),
			// format the timestamp as a decimal string for the partition key
			PartitionKey: aws.String(strconv.FormatInt(time.Now().UnixNano(), 10)),
			StreamName:   aws.String(a.Outputs["Kinesis"]),
		})
	}

	if err := scanner.Err(); err != nil {
		helpers.Error(nil, err) // send internal error to rollbar
	}

	// and wait for a return code
	werr := cmd.Wait()

	// reload build item to get data from BuildUpdate callback
	b, err := p.BuildGet(b.App, b.Id)
	if err != nil {
		helpers.Error(nil, err) // send internal error to rollbar
		return
	}

	// Wait / return code are errors, consider the build failed
	if werr != nil {
		b.Status = "failed"
	}

	// save final build logs / status
	b.Logs = out

	err = p.BuildSave(b)
	if err != nil {
		helpers.Error(nil, err) // send internal error to rollbar
		return
	}
}
func StartHeartbeat() {
	log := logger.New("ns=heartbeat")

	defer recoverWith(func(err error) {
		helpers.Error(log, err)
	})

	helpers.TrackEvent("kernel-heartbeat", "")

	for _ = range time.Tick(1 * time.Hour) {
		helpers.TrackEvent("kernel-heartbeat", "")
	}
}
func StartHeartbeat() {
	log := logger.New("ns=heartbeat")

	defer recoverWith(func(err error) {
		helpers.Error(log, err)
	})

	heartbeat()

	for range time.Tick(1 * time.Hour) {
		heartbeat()
	}
}
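The extracted heartbeat helper is not shown; assuming it simply wraps the Segment call from the earlier version, it would look something like this:

// Sketch only, assuming heartbeat wraps the previous TrackEvent call.
func heartbeat() {
	helpers.TrackEvent("kernel-heartbeat", "")
}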
func (p *AWSProvider) buildRun(a *structs.App, b *structs.Build, args []string, env []string, stdin io.Reader) error {
	cmd := exec.Command("docker", args...)
	cmd.Env = env
	cmd.Stdin = stdin

	stdout, err := cmd.StdoutPipe()
	if err != nil {
		helpers.Error(nil, err) // send internal error to rollbar
		return err
	}

	// redirect cmd stderr to stdout (must happen after StdoutPipe so
	// cmd.Stdout points at the pipe's write end)
	cmd.Stderr = cmd.Stdout

	// start build command
	err = cmd.Start()
	if err != nil {
		helpers.Error(nil, err) // send internal error to rollbar
		return err
	}

	go p.buildWait(a, b, cmd, stdout)

	return nil
}
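For context, a hypothetical invocation of buildRun. The docker arguments, environment, and source reader are assumptions; the only grounded detail is the build-&lt;id&gt; container name, which matches the docker kill in the timeout version of buildWait below.

// Hypothetical caller sketch; arguments are illustrative only.
args := []string{"run", "--name", fmt.Sprintf("build-%s", b.Id), "convox/build"}

if err := p.buildRun(a, b, args, os.Environ(), bytes.NewReader(source)); err != nil {
	helpers.Error(nil, err)
}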
func StartCluster() {
	var log = logger.New("ns=cluster_monitor")

	defer recoverWith(func(err error) {
		helpers.Error(log, err)
	})

Tick:
	for _ = range time.Tick(5 * time.Minute) {
		log.Log("tick")

		// Get Rack InstanceCount Parameter
		instanceCount := 0
		// instanceType := "unknown"

		res, err := models.CloudFormation().DescribeStacks(
			&cloudformation.DescribeStacksInput{
				StackName: aws.String(os.Getenv("RACK")),
			},
		)
		if err != nil {
			log.Error(err)
			continue
		}

		for _, p := range res.Stacks[0].Parameters {
			if *p.ParameterKey == "InstanceCount" {
				c, err := strconv.Atoi(*p.ParameterValue)
				if err != nil {
					log.Error(err)
					break Tick
				}
				instanceCount = c
			}
			// if *p.ParameterKey == "InstanceType" {
			// 	instanceType = *p.ParameterValue
			// }
		}

		// helpers.SendMixpanelEvent("kernel-cluster-monitor", fmt.Sprintf("count=%d type=%s", instanceCount, instanceType))

		// List and Describe ECS Container Instances
		ires, err := models.ECS().ListContainerInstances(
			&ecs.ListContainerInstancesInput{
				Cluster: aws.String(os.Getenv("CLUSTER")),
			},
		)
		if err != nil {
			log.Error(err)
			continue
		}

		dres, err := models.ECS().DescribeContainerInstances(
			&ecs.DescribeContainerInstancesInput{
				Cluster:            aws.String(os.Getenv("CLUSTER")),
				ContainerInstances: ires.ContainerInstanceArns,
			},
		)
		if err != nil {
			log.Error(err)
			continue
		}

		cInstanceIds := make([]string, 0)
		cInstanceConnections := make(map[string]bool)

		for _, i := range dres.ContainerInstances {
			cInstanceConnections[*i.Ec2InstanceId] = *i.AgentConnected

			if *i.AgentConnected {
				cInstanceIds = append(cInstanceIds, *i.Ec2InstanceId)
			}
		}

		// Get and Describe Rack ASG Resource
		resources, err := models.ListResources(os.Getenv("RACK"))
		if err != nil {
			log.Error(err)
			continue
		}

		ares, err := models.AutoScaling().DescribeAutoScalingGroups(
			&autoscaling.DescribeAutoScalingGroupsInput{
				AutoScalingGroupNames: []*string{
					aws.String(resources["Instances"].Id),
				},
			},
		)
		if err != nil {
			log.Error(err)
			continue
		}

		// Test if ASG Instance is registered and connected in ECS cluster
		aInstanceIds := []string{}
		uInstanceIds := []string{}

		for _, i := range ares.AutoScalingGroups[0].Instances {
			if connected, exists := cInstanceConnections[*i.InstanceId]; connected && exists {
				aInstanceIds = append(aInstanceIds, *i.InstanceId)
			} else {
				// Not registered or not connected => set Unhealthy
				if *i.LifecycleState == "InService" {
					_, err := models.AutoScaling().SetInstanceHealth(
						&autoscaling.SetInstanceHealthInput{
							HealthStatus:             aws.String("Unhealthy"),
							InstanceId:               aws.String(*i.InstanceId),
							ShouldRespectGracePeriod: aws.Bool(true),
						},
					)
					if err != nil {
						log.Error(err)
						continue
					}

					uInstanceIds = append(uInstanceIds, *i.InstanceId)
				}
			}
		}

		sort.Strings(aInstanceIds)
		sort.Strings(cInstanceIds)
		sort.Strings(uInstanceIds)

		// if len(uInstanceIds) > 0 {
		// 	helpers.SendMixpanelEvent("kernel-cluster-monitor-mark", strings.Join(uInstanceIds, ","))
		// }

		log.Log("InstanceCount=%v connected='%v' healthy='%v' marked='%s'",
			instanceCount,
			strings.Join(cInstanceIds, ","),
			strings.Join(aInstanceIds, ","),
			strings.Join(uInstanceIds, ","),
		)
	}
}
func (p *AWSProvider) buildWait(a *structs.App, b *structs.Build, cmd *exec.Cmd, stdout io.ReadCloser) {
	// scan all output
	scanner := bufio.NewScanner(stdout)
	out := ""

	for scanner.Scan() {
		text := scanner.Text()
		out += text + "\n"
	}
	if err := scanner.Err(); err != nil {
		helpers.Error(nil, err) // send internal error to rollbar
	}

	var cmdStatus string
	waitErr := make(chan error)
	timeout := time.After(1 * time.Hour)

	go func() {
		err := cmd.Wait()

		switch err.(type) {
		case *exec.ExitError:
			waitErr <- err
		default:
			waitErr <- nil
		}
	}()

	select {
	case werr := <-waitErr:
		// Wait / return code are errors, consider the build failed
		if werr != nil {
			cmdStatus = "failed"
		}
	case <-timeout:
		cmdStatus = "timeout"

		// Force kill the build container since it's taking way too long
		killCmd := exec.Command("docker", "kill", fmt.Sprintf("build-%s", b.Id))
		killCmd.Start()
	}

	// reload build item to get data from BuildUpdate callback
	b, err := p.BuildGet(b.App, b.Id)
	if err != nil {
		helpers.Error(nil, err) // send internal error to rollbar
		return
	}

	// Careful not to override the status set by BuildUpdate
	if cmdStatus != "" {
		b.Status = cmdStatus
	}

	// save final build logs / status
	b.Logs = out

	err = p.BuildSave(b)
	if err != nil {
		helpers.Error(nil, err) // send internal error to rollbar
		return
	}
}