func (instances Instances) describeASG() error { resources, err := models.ListResources(os.Getenv("RACK")) res, err := models.AutoScaling().DescribeAutoScalingGroups( &autoscaling.DescribeAutoScalingGroupsInput{ AutoScalingGroupNames: []*string{ aws.String(resources["Instances"].Id), }, }, ) if err != nil { return err } for _, i := range res.AutoScalingGroups[0].Instances { instance := instances[*i.InstanceId] instance.Id = *i.InstanceId instance.ASG = *i.LifecycleState == "InService" instances[*i.InstanceId] = instance } return nil }
func StartCluster() { var log = logger.New("ns=cluster_monitor") defer recoverWith(func(err error) { helpers.Error(log, err) }) for _ = range time.Tick(5 * time.Minute) { log.Log("tick") instances := Instances{} err := instances.describeASG() if err != nil { log.Error(err) continue } err = instances.describeECS() if err != nil { log.Error(err) continue } // TODO: Add an instances.testDocker() call to the mission critical path // Test if ASG Instance is registered and connected in ECS cluster for _, i := range instances { if !i.ASG { // TODO: Rogue instance?! Terminate? continue } if !i.ECS { // Not registered or not connected => set Unhealthy _, err := models.AutoScaling().SetInstanceHealth( &autoscaling.SetInstanceHealthInput{ HealthStatus: aws.String("Unhealthy"), InstanceId: aws.String(i.Id), ShouldRespectGracePeriod: aws.Bool(true), }, ) i.Unhealthy = true if err != nil { log.Error(err) continue } } } log.Log(instances.log()) } }
func StartCluster() { var log = logger.New("ns=cluster_monitor") defer recoverWith(func(err error) { helpers.Error(log, err) }) for range time.Tick(5 * time.Minute) { log.Logf("tick") instances := Instances{} err := instances.describeASG() if err != nil { log.Error(err) continue } err = instances.describeECS() if err != nil { log.Error(err) continue } // Test if ASG Instance is registered and connected in ECS cluster for k, i := range instances { if !i.ASG { // TODO: Rogue instance?! Terminate? continue } if !i.ECS { // Not registered or not connected => set Unhealthy _, err := models.AutoScaling().SetInstanceHealth( &autoscaling.SetInstanceHealthInput{ HealthStatus: aws.String("Unhealthy"), InstanceId: aws.String(i.Id), ShouldRespectGracePeriod: aws.Bool(true), }, ) i.Unhealthy = true instances[k] = i if err != nil { log.Error(err) continue } // log for humans fmt.Printf("who=\"convox/monitor\" what=\"marked instance %s unhealthy\" why=\"ECS reported agent disconnected\"\n", i.Id) } } log.Logf(instances.log()) } }
func (instances Instances) describeASG() error { resources, err := models.ListResources(os.Getenv("RACK")) res, err := models.AutoScaling().DescribeAutoScalingGroups( &autoscaling.DescribeAutoScalingGroupsInput{ AutoScalingGroupNames: []*string{ aws.String(resources["Instances"].Id), }, }, ) if err != nil { return err } for _, i := range res.AutoScalingGroups[0].Instances { instance := instances[*i.InstanceId] instance.Id = *i.InstanceId instance.ASG = *i.LifecycleState == "InService" instances[*i.InstanceId] = instance } // describe and log every recent ASG activity dres, err := models.AutoScaling().DescribeScalingActivities( &autoscaling.DescribeScalingActivitiesInput{ AutoScalingGroupName: aws.String(resources["Instances"].Id), }, ) if err != nil { return nil } for _, a := range dres.Activities { if lastASGActivity.Before(*a.StartTime) { fmt.Printf("who=\"EC2/ASG\" what=%q why=%q\n", *a.Description, *a.Cause) lastASGActivity = *a.StartTime } } return nil }
func InstanceTerminate(rw http.ResponseWriter, r *http.Request) *httperr.Error { rack, err := models.Provider().SystemGet() if awsError(err) == "ValidationError" { return httperr.Errorf(404, "no such stack: %s", rack) } if err != nil { return httperr.Server(err) } instanceId := mux.Vars(r)["id"] _, err = models.AutoScaling().TerminateInstanceInAutoScalingGroup(&autoscaling.TerminateInstanceInAutoScalingGroupInput{ InstanceId: aws.String(instanceId), ShouldDecrementDesiredCapacity: aws.Bool(false), }) if err != nil { return httperr.Server(err) } return RenderSuccess(rw) }
func StartCluster() { var log = logger.New("ns=cluster_monitor") defer recoverWith(func(err error) { helpers.Error(log, err) }) Tick: for _ = range time.Tick(5 * time.Minute) { log.Log("tick") // Ger Rack InstanceCount Parameter instanceCount := 0 // instanceType := "unknown" res, err := models.CloudFormation().DescribeStacks( &cloudformation.DescribeStacksInput{ StackName: aws.String(os.Getenv("RACK")), }, ) if err != nil { log.Error(err) continue } for _, p := range res.Stacks[0].Parameters { if *p.ParameterKey == "InstanceCount" { c, err := strconv.Atoi(*p.ParameterValue) if err != nil { log.Error(err) break Tick } instanceCount = c } // if *p.ParameterKey == "InstanceType" { // instanceType = *p.ParameterValue // } } // helpers.SendMixpanelEvent("kernel-cluster-monitor", fmt.Sprintf("count=%d type=%s", instanceCount, instanceType)) // List and Describe ECS Container Instances ires, err := models.ECS().ListContainerInstances( &ecs.ListContainerInstancesInput{ Cluster: aws.String(os.Getenv("CLUSTER")), }, ) if err != nil { log.Error(err) continue } dres, err := models.ECS().DescribeContainerInstances( &ecs.DescribeContainerInstancesInput{ Cluster: aws.String(os.Getenv("CLUSTER")), ContainerInstances: ires.ContainerInstanceArns, }, ) if err != nil { log.Error(err) continue } cInstanceIds := make([]string, 0) cInstanceConnections := make(map[string]bool) for _, i := range dres.ContainerInstances { cInstanceConnections[*i.Ec2InstanceId] = *i.AgentConnected if *i.AgentConnected { cInstanceIds = append(cInstanceIds, *i.Ec2InstanceId) } } // Get and Describe Rack ASG Resource resources, err := models.ListResources(os.Getenv("RACK")) ares, err := models.AutoScaling().DescribeAutoScalingGroups( &autoscaling.DescribeAutoScalingGroupsInput{ AutoScalingGroupNames: []*string{ aws.String(resources["Instances"].Id), }, }, ) if err != nil { log.Error(err) continue } // Test if ASG Instance is registered and connected in ECS cluster aInstanceIds := []string{} uInstanceIds := []string{} for _, i := range ares.AutoScalingGroups[0].Instances { if connected, exists := cInstanceConnections[*i.InstanceId]; connected && exists { aInstanceIds = append(aInstanceIds, *i.InstanceId) } else { // Not registered or not connected => set Unhealthy if *i.LifecycleState == "InService" { _, err := models.AutoScaling().SetInstanceHealth( &autoscaling.SetInstanceHealthInput{ HealthStatus: aws.String("Unhealthy"), InstanceId: aws.String(*i.InstanceId), ShouldRespectGracePeriod: aws.Bool(true), }, ) if err != nil { log.Error(err) continue } uInstanceIds = append(uInstanceIds, *i.InstanceId) } } } sort.Strings(aInstanceIds) sort.Strings(cInstanceIds) sort.Strings(uInstanceIds) // if len(uInstanceIds) > 0 { // helpers.SendMixpanelEvent("kernel-cluster-monitor-mark", strings.Join(uInstanceIds, ",")) // } log.Log("InstanceCount=%v connected='%v' healthy='%v' marked='%s'", instanceCount, strings.Join(cInstanceIds, ","), strings.Join(aInstanceIds, ","), strings.Join(uInstanceIds, ",")) } }