func StartHeartbeat() { log := logger.New("ns=heartbeat") defer recoverWith(func(err error) { helpers.Error(log, err) }) helpers.TrackEvent("kernel-heartbeat", "") for _ = range time.Tick(1 * time.Hour) { helpers.TrackEvent("kernel-heartbeat", "") } }
func (a *App) Delete() error { helpers.TrackEvent("kernel-app-delete-start", nil) _, err := CloudFormation().DeleteStack(&cloudformation.DeleteStackInput{StackName: aws.String(a.StackName())}) if err != nil { helpers.TrackEvent("kernel-app-delete-error", nil) return err } go a.Cleanup() NotifySuccess("app:delete", map[string]string{"name": a.Name}) return nil }
// AppDelete deletes an app func (p *AWSProvider) AppDelete(name string) error { app, err := p.AppGet(name) if err != nil { return err } services, err := p.ServiceList() if err != nil { return err } for _, s := range services { s.Apps, err = p.serviceApps(s) if err != nil { return err } for _, a := range s.Apps { if a.Name == name { return fmt.Errorf("app is linked to %s service", s.Name) } } } _, err = p.cloudformation().DeleteStack(&cloudformation.DeleteStackInput{StackName: aws.String(app.StackName())}) if err != nil { helpers.TrackEvent("kernel-app-delete-error", nil) return err } go p.cleanup(app) return nil }
func (r *System) Save() error { rack := os.Getenv("RACK") app, err := GetApp(rack) if err != nil { return err } mac, err := maxAppConcurrency() // dont scale the rack below the max concurrency plus one // see formation.go for more details if err == nil && r.Count < (mac+1) { return fmt.Errorf("max process concurrency is %d, can't scale rack below %d instances", mac, mac+1) } params := map[string]string{ "InstanceCount": strconv.Itoa(r.Count), "InstanceType": r.Type, "Version": r.Version, } // Report cluster size change helpers.TrackEvent("kernel-cluster-monitor", fmt.Sprintf("count=%d type=%s", r.Count, r.Type)) template := fmt.Sprintf("https://convox.s3.amazonaws.com/release/%s/formation.json", r.Version) return app.UpdateParamsAndTemplate(params, template) }
func (a *App) Delete() error { helpers.TrackEvent("kernel-app-delete-start", "") name := a.Name _, err := CloudFormation().DeleteStack(&cloudformation.DeleteStackInput{StackName: aws.String(name)}) if err != nil { helpers.TrackEvent("kernel-app-delete-error", "") return err } go a.Cleanup() helpers.TrackEvent("kernel-app-delete-success", "") return nil }
func (a *App) Delete() error { helpers.TrackEvent("kernel-app-delete-start", nil) err := Provider().AppDelete(a.Name) if err != nil { return err } NotifySuccess("app:delete", map[string]string{"name": a.Name}) return nil }
// EventSend publishes an important message out to the world. // // On AWS messages are published to SNS. The Rack has an HTTP endpoint that is an SNS // subscription, and when a message is delivered forwards them to all configured // webhook services. // // Often the Rack has a Console webhook which facilitates forwarding events // to Slack with additional formatting and filtering. // // Because these are important system events, they are also published to Segment // for operational metrics. func (p *AWSProvider) EventSend(e *structs.Event, err error) error { // log := logger.New("ns=kernel") e.Status = "success" e.Timestamp = time.Now().UTC() if p.IsTest() { e.Timestamp = time.Time{} } if err != nil { e.Data["message"] = err.Error() e.Status = "error" } msg, err := json.Marshal(e) if err != nil { // helpers.Error(log, err) // report internal errors to Rollbar return err } fmt.Printf("aws EventSend msg=%q\n", msg) // Publish Event to SNS _, err = p.sns().Publish(&sns.PublishInput{ Message: aws.String(string(msg)), // Required Subject: aws.String(e.Action), TargetArn: aws.String(p.NotificationTopic), }) if err != nil { // helpers.Error(log, err) // report internal errors to Rollbar return err } // log.At("EventSend").Log("message-id=%q", *resp.MessageId) // report event to Segment params := map[string]interface{}{ "action": e.Action, "status": e.Status, } for k, v := range e.Data { params[k] = v } helpers.TrackEvent("event", params) return nil }
// AppDelete deletes an app func (p *AWSProvider) AppDelete(name string) error { app, err := p.AppGet(name) if err != nil { return err } _, err = p.cloudformation().DeleteStack(&cloudformation.DeleteStackInput{StackName: aws.String(app.StackName())}) if err != nil { helpers.TrackEvent("kernel-app-delete-error", nil) return err } go p.cleanup(app) return nil }
// EventSend publishes an important message out to the world. // // On AWS messages are published to SNS. The Rack has an HTTP endpoint that is an SNS // subscription, and when a message is delivered forwards them to all configured // webhook services. // // Often the Rack has a Console webhook which facilitates forwarding events // to Slack with additional formatting and filtering. // // Because these are important system events, they are also published to Segment // for operational metrics. func (p *AWSProvider) EventSend(e *structs.Event, err error) error { e.Status = "success" e.Timestamp = time.Now().UTC() if p.IsTest() { e.Timestamp = time.Time{} } if err != nil { e.Data["message"] = err.Error() e.Status = "error" } msg, err := json.Marshal(e) if err != nil { return err } // Publish Event to SNS _, err = p.sns().Publish(&sns.PublishInput{ Message: aws.String(string(msg)), // Required Subject: aws.String(e.Action), TargetArn: aws.String(p.NotificationTopic), }) if err != nil { return err } // report event to Segment params := map[string]interface{}{ "action": e.Action, "status": e.Status, } for k, v := range e.Data { params[k] = v } helpers.TrackEvent("event", params) return nil }
func heartbeat() { system, err := models.GetSystem() if err != nil { log.Error(err) return } apps, err := models.ListApps() if err != nil { log.Error(err) return } helpers.TrackEvent("kernel-heartbeat", map[string]interface{}{ "app_count": len(apps), "instance_count": system.Count, "instance_type": system.Type, "region": os.Getenv("AWS_REGION"), "version": system.Version, }) }
func (a *App) Create() error { helpers.TrackEvent("kernel-app-create-start", nil) if !regexValidAppName.MatchString(a.Name) { return fmt.Errorf("app name can contain only alphanumeric characters and dashes and must be between 4 and 30 characters") } formation, err := a.Formation() if err != nil { helpers.TrackEvent("kernel-app-create-error", nil) return err } params := map[string]string{ "Cluster": os.Getenv("CLUSTER"), "Subnets": os.Getenv("SUBNETS"), "Version": os.Getenv("RELEASE"), "VPC": os.Getenv("VPC"), } if os.Getenv("ENCRYPTION_KEY") != "" { params["Key"] = os.Getenv("ENCRYPTION_KEY") } tags := map[string]string{ "Rack": os.Getenv("RACK"), "System": "convox", "Type": "app", } req := &cloudformation.CreateStackInput{ Capabilities: []*string{aws.String("CAPABILITY_IAM")}, StackName: aws.String(a.Name), TemplateBody: aws.String(formation), } for key, value := range params { req.Parameters = append(req.Parameters, &cloudformation.Parameter{ ParameterKey: aws.String(key), ParameterValue: aws.String(value), }) } for key, value := range tags { req.Tags = append(req.Tags, &cloudformation.Tag{ Key: aws.String(key), Value: aws.String(value), }) } _, err = CloudFormation().CreateStack(req) if err != nil { helpers.TrackEvent("kernel-app-create-error", nil) return err } helpers.TrackEvent("kernel-app-create-success", nil) NotifySuccess("app:create", map[string]string{"name": a.Name}) return nil }
func (a *App) Cleanup() error { err := cleanupBucket(a.Outputs["Settings"]) if err != nil { return err } builds, err := ListBuilds(a.Name) if err != nil { return err } for _, build := range builds { go cleanupBuild(build) } releases, err := ListReleases(a.Name) if err != nil { return err } for _, release := range releases { go cleanupRelease(release) } // monitor and stack deletion state for up to 10 minutes // retry once if DELETE_FAILED to automate around transient errors // send delete success event only when stack is gone shouldRetry := true for i := 0; i < 60; i++ { res, err := CloudFormation().DescribeStacks(&cloudformation.DescribeStacksInput{ StackName: aws.String(a.StackName()), }) // return when stack is not found indicating successful delete if ae, ok := err.(awserr.Error); ok { if ae.Code() == "ValidationError" { helpers.TrackEvent("kernel-app-delete-success", nil) // Last ditch effort to remove the empty bucket CF leaves behind. _, err := S3().DeleteBucket(&s3.DeleteBucketInput{Bucket: aws.String(a.Outputs["Settings"])}) if err != nil { fmt.Printf("error: %s\n", err) } return nil } } if err == nil && len(res.Stacks) == 1 && shouldRetry { // if delete failed, issue one more delete stack and return s := res.Stacks[0] if *s.StackStatus == "DELETE_FAILED" { helpers.TrackEvent("kernel-app-delete-retry", nil) _, err := CloudFormation().DeleteStack(&cloudformation.DeleteStackInput{StackName: aws.String(a.StackName())}) if err != nil { helpers.TrackEvent("kernel-app-delete-retry-error", nil) } else { shouldRetry = false } } } time.Sleep(10 * time.Second) } return nil }
func (a *App) Create() error { helpers.TrackEvent("kernel-app-create-start", nil) if !regexValidAppName.MatchString(a.Name) { return fmt.Errorf("app name can contain only alphanumeric characters, dashes and must be between 4 and 30 characters") } m := manifest.Manifest{ Services: make(map[string]manifest.Service), } formation, err := a.Formation(m) if err != nil { helpers.TrackEvent("kernel-app-create-error", nil) return err } // SubnetsPrivate is a List<AWS::EC2::Subnet::Id> and can not be empty // So reuse SUBNETS if SUBNETS_PRIVATE is not set subnetsPrivate := os.Getenv("SUBNETS_PRIVATE") if subnetsPrivate == "" { subnetsPrivate = os.Getenv("SUBNETS") } params := map[string]string{ "Cluster": os.Getenv("CLUSTER"), "Internal": os.Getenv("INTERNAL"), "Private": os.Getenv("PRIVATE"), "Subnets": os.Getenv("SUBNETS"), "SubnetsPrivate": subnetsPrivate, "Version": os.Getenv("RELEASE"), "VPC": os.Getenv("VPC"), "VPCCIDR": os.Getenv("VPCCIDR"), } if os.Getenv("ENCRYPTION_KEY") != "" { params["Key"] = os.Getenv("ENCRYPTION_KEY") } tags := map[string]string{ "Rack": os.Getenv("RACK"), "System": "convox", "Type": "app", "Name": a.Name, } req := &cloudformation.CreateStackInput{ Capabilities: []*string{aws.String("CAPABILITY_IAM")}, StackName: aws.String(a.StackName()), TemplateBody: aws.String(formation), } for key, value := range params { req.Parameters = append(req.Parameters, &cloudformation.Parameter{ ParameterKey: aws.String(key), ParameterValue: aws.String(value), }) } for key, value := range tags { req.Tags = append(req.Tags, &cloudformation.Tag{ Key: aws.String(key), Value: aws.String(value), }) } _, err = CloudFormation().CreateStack(req) if err != nil { helpers.TrackEvent("kernel-app-create-error", nil) return err } helpers.TrackEvent("kernel-app-create-success", nil) NotifySuccess("app:create", map[string]string{"name": a.Name}) return nil }
func StartCluster() { var log = logger.New("ns=cluster_monitor") defer recoverWith(func(err error) { helpers.Error(log, err) }) // Report cluster size one time on start system, err := models.GetSystem() if err != nil { log.Error(err) } else { helpers.TrackEvent("kernel-cluster-monitor", fmt.Sprintf("count=%d type=%s", system.Count, system.Type)) } for _ = range time.Tick(5 * time.Minute) { log.Log("tick") instances := Instances{} err := instances.describeASG() if err != nil { log.Error(err) continue } err = instances.describeECS() if err != nil { log.Error(err) continue } // TODO: Add an instances.testDocker() call to the mission critical path // Test if ASG Instance is registered and connected in ECS cluster for _, i := range instances { if !i.ASG { // TODO: Rogue instance?! Terminate? continue } if !i.ECS { // Not registered or not connected => set Unhealthy _, err := models.AutoScaling().SetInstanceHealth( &autoscaling.SetInstanceHealthInput{ HealthStatus: aws.String("Unhealthy"), InstanceId: aws.String(i.Id), ShouldRespectGracePeriod: aws.Bool(true), }, ) i.Unhealthy = true if err != nil { log.Error(err) continue } } } log.Log(instances.log()) } }
// cleanup deletes AWS resources that aren't handled by the CloudFormation during stack deletion. func (p *AWSProvider) cleanup(app *structs.App) error { err := p.deleteBucket(app.Outputs["Settings"]) if err != nil { fmt.Printf("fn=cleanup level=error msg=\"%s\"", err) return err } err = p.buildsDeleteAll(app) if err != nil { fmt.Printf("fn=cleanup level=error msg=\"%s\"", err) return err } _, err = p.ecr().DeleteRepository(&ecr.DeleteRepositoryInput{ RepositoryName: aws.String(app.Outputs["RegistryRepository"]), Force: aws.Bool(true), }) if err != nil { fmt.Printf("fn=cleanup level=error msg=\"error deleting ecr repo: %s\"", err) } err = p.releaseDeleteAll(app.Name) if err != nil { fmt.Printf("fn=cleanup level=error msg=\"%s\"", err) return err } // monitor and stack deletion state for up to 10 minutes // retry once if DELETE_FAILED to automate around transient errors // send delete success event only when stack is gone shouldRetry := true for i := 0; i < 60; i++ { res, err := p.cloudformation().DescribeStacks(&cloudformation.DescribeStacksInput{ StackName: aws.String(app.StackName()), }) // return when stack is not found indicating successful delete if ae, ok := err.(awserr.Error); ok { if ae.Code() == "ValidationError" { // Error indicates stack wasn't found, hence deleted. helpers.TrackEvent("kernel-app-delete-success", nil) // Last ditch effort to remove the empty bucket CF leaves behind. _, err := p.s3().DeleteBucket(&s3.DeleteBucketInput{Bucket: aws.String(app.Outputs["Settings"])}) if err != nil { fmt.Printf("last ditch effort bucket error: %s\n", err) } return nil } } if err == nil && len(res.Stacks) == 1 && shouldRetry { // if delete failed, issue one more delete stack and return s := res.Stacks[0] if *s.StackStatus == "DELETE_FAILED" { helpers.TrackEvent("kernel-app-delete-retry", nil) _, err := p.cloudformation().DeleteStack(&cloudformation.DeleteStackInput{StackName: aws.String(app.StackName())}) if err != nil { helpers.TrackEvent("kernel-app-delete-retry-error", nil) } else { shouldRetry = false } } } time.Sleep(10 * time.Second) } return nil }