Exemple #1
0
func (instances Instances) describeASG() error {
	resources, err := models.ListResources(os.Getenv("RACK"))

	res, err := models.AutoScaling().DescribeAutoScalingGroups(
		&autoscaling.DescribeAutoScalingGroupsInput{
			AutoScalingGroupNames: []*string{
				aws.String(resources["Instances"].Id),
			},
		},
	)

	if err != nil {
		return err
	}

	for _, i := range res.AutoScalingGroups[0].Instances {
		instance := instances[*i.InstanceId]

		instance.Id = *i.InstanceId
		instance.ASG = *i.LifecycleState == "InService"

		instances[*i.InstanceId] = instance
	}

	return nil
}
Exemple #2
0
func StartCluster() {
	var log = logger.New("ns=cluster_monitor")

	defer recoverWith(func(err error) {
		helpers.Error(log, err)
	})

	for _ = range time.Tick(5 * time.Minute) {
		log.Log("tick")

		instances := Instances{}

		err := instances.describeASG()

		if err != nil {
			log.Error(err)
			continue
		}

		err = instances.describeECS()

		if err != nil {
			log.Error(err)
			continue
		}

		// TODO: Add an instances.testDocker() call to the mission critical path

		// Test if ASG Instance is registered and connected in ECS cluster
		for _, i := range instances {
			if !i.ASG {
				// TODO: Rogue instance?! Terminate?
				continue
			}

			if !i.ECS {
				// Not registered or not connected => set Unhealthy
				_, err := models.AutoScaling().SetInstanceHealth(
					&autoscaling.SetInstanceHealthInput{
						HealthStatus:             aws.String("Unhealthy"),
						InstanceId:               aws.String(i.Id),
						ShouldRespectGracePeriod: aws.Bool(true),
					},
				)

				i.Unhealthy = true

				if err != nil {
					log.Error(err)
					continue
				}
			}
		}

		log.Log(instances.log())
	}
}
Exemple #3
0
func StartCluster() {
	var log = logger.New("ns=cluster_monitor")

	defer recoverWith(func(err error) {
		helpers.Error(log, err)
	})

	for range time.Tick(5 * time.Minute) {
		log.Logf("tick")

		instances := Instances{}

		err := instances.describeASG()
		if err != nil {
			log.Error(err)
			continue
		}

		err = instances.describeECS()
		if err != nil {
			log.Error(err)
			continue
		}

		// Test if ASG Instance is registered and connected in ECS cluster
		for k, i := range instances {
			if !i.ASG {
				// TODO: Rogue instance?! Terminate?
				continue
			}

			if !i.ECS {
				// Not registered or not connected => set Unhealthy
				_, err := models.AutoScaling().SetInstanceHealth(
					&autoscaling.SetInstanceHealthInput{
						HealthStatus:             aws.String("Unhealthy"),
						InstanceId:               aws.String(i.Id),
						ShouldRespectGracePeriod: aws.Bool(true),
					},
				)

				i.Unhealthy = true
				instances[k] = i

				if err != nil {
					log.Error(err)
					continue
				}

				// log for humans
				fmt.Printf("who=\"convox/monitor\" what=\"marked instance %s unhealthy\" why=\"ECS reported agent disconnected\"\n", i.Id)
			}
		}

		log.Logf(instances.log())
	}
}
Exemple #4
0
func (instances Instances) describeASG() error {
	resources, err := models.ListResources(os.Getenv("RACK"))

	res, err := models.AutoScaling().DescribeAutoScalingGroups(
		&autoscaling.DescribeAutoScalingGroupsInput{
			AutoScalingGroupNames: []*string{
				aws.String(resources["Instances"].Id),
			},
		},
	)
	if err != nil {
		return err
	}

	for _, i := range res.AutoScalingGroups[0].Instances {
		instance := instances[*i.InstanceId]

		instance.Id = *i.InstanceId
		instance.ASG = *i.LifecycleState == "InService"

		instances[*i.InstanceId] = instance
	}

	// describe and log every recent ASG activity
	dres, err := models.AutoScaling().DescribeScalingActivities(
		&autoscaling.DescribeScalingActivitiesInput{
			AutoScalingGroupName: aws.String(resources["Instances"].Id),
		},
	)
	if err != nil {
		return nil
	}

	for _, a := range dres.Activities {
		if lastASGActivity.Before(*a.StartTime) {
			fmt.Printf("who=\"EC2/ASG\" what=%q why=%q\n", *a.Description, *a.Cause)
			lastASGActivity = *a.StartTime
		}
	}

	return nil
}
Exemple #5
0
func InstanceTerminate(rw http.ResponseWriter, r *http.Request) *httperr.Error {
	rack, err := models.Provider().SystemGet()

	if awsError(err) == "ValidationError" {
		return httperr.Errorf(404, "no such stack: %s", rack)
	}

	if err != nil {
		return httperr.Server(err)
	}

	instanceId := mux.Vars(r)["id"]

	_, err = models.AutoScaling().TerminateInstanceInAutoScalingGroup(&autoscaling.TerminateInstanceInAutoScalingGroupInput{
		InstanceId:                     aws.String(instanceId),
		ShouldDecrementDesiredCapacity: aws.Bool(false),
	})

	if err != nil {
		return httperr.Server(err)
	}

	return RenderSuccess(rw)
}
Exemple #6
0
func StartCluster() {
	var log = logger.New("ns=cluster_monitor")

	defer recoverWith(func(err error) {
		helpers.Error(log, err)
	})

Tick:
	for _ = range time.Tick(5 * time.Minute) {
		log.Log("tick")

		// Ger Rack InstanceCount Parameter
		instanceCount := 0
		// instanceType := "unknown"

		res, err := models.CloudFormation().DescribeStacks(
			&cloudformation.DescribeStacksInput{
				StackName: aws.String(os.Getenv("RACK")),
			},
		)

		if err != nil {
			log.Error(err)
			continue
		}

		for _, p := range res.Stacks[0].Parameters {
			if *p.ParameterKey == "InstanceCount" {
				c, err := strconv.Atoi(*p.ParameterValue)

				if err != nil {
					log.Error(err)
					break Tick
				}

				instanceCount = c
			}

			// if *p.ParameterKey == "InstanceType" {
			//   instanceType = *p.ParameterValue
			// }
		}

		// helpers.SendMixpanelEvent("kernel-cluster-monitor", fmt.Sprintf("count=%d type=%s", instanceCount, instanceType))

		// List and Describe ECS Container Instances
		ires, err := models.ECS().ListContainerInstances(
			&ecs.ListContainerInstancesInput{
				Cluster: aws.String(os.Getenv("CLUSTER")),
			},
		)

		if err != nil {
			log.Error(err)
			continue
		}

		dres, err := models.ECS().DescribeContainerInstances(
			&ecs.DescribeContainerInstancesInput{
				Cluster:            aws.String(os.Getenv("CLUSTER")),
				ContainerInstances: ires.ContainerInstanceArns,
			},
		)

		if err != nil {
			log.Error(err)
			continue
		}

		cInstanceIds := make([]string, 0)
		cInstanceConnections := make(map[string]bool)

		for _, i := range dres.ContainerInstances {
			cInstanceConnections[*i.Ec2InstanceId] = *i.AgentConnected

			if *i.AgentConnected {
				cInstanceIds = append(cInstanceIds, *i.Ec2InstanceId)
			}
		}

		// Get and Describe Rack ASG Resource
		resources, err := models.ListResources(os.Getenv("RACK"))

		ares, err := models.AutoScaling().DescribeAutoScalingGroups(
			&autoscaling.DescribeAutoScalingGroupsInput{
				AutoScalingGroupNames: []*string{
					aws.String(resources["Instances"].Id),
				},
			},
		)

		if err != nil {
			log.Error(err)
			continue
		}

		// Test if ASG Instance is registered and connected in ECS cluster

		aInstanceIds := []string{}
		uInstanceIds := []string{}

		for _, i := range ares.AutoScalingGroups[0].Instances {
			if connected, exists := cInstanceConnections[*i.InstanceId]; connected && exists {
				aInstanceIds = append(aInstanceIds, *i.InstanceId)
			} else {
				// Not registered or not connected => set Unhealthy
				if *i.LifecycleState == "InService" {
					_, err := models.AutoScaling().SetInstanceHealth(
						&autoscaling.SetInstanceHealthInput{
							HealthStatus:             aws.String("Unhealthy"),
							InstanceId:               aws.String(*i.InstanceId),
							ShouldRespectGracePeriod: aws.Bool(true),
						},
					)

					if err != nil {
						log.Error(err)
						continue
					}

					uInstanceIds = append(uInstanceIds, *i.InstanceId)
				}
			}
		}

		sort.Strings(aInstanceIds)
		sort.Strings(cInstanceIds)
		sort.Strings(uInstanceIds)

		// if len(uInstanceIds) > 0 {
		//   helpers.SendMixpanelEvent("kernel-cluster-monitor-mark", strings.Join(uInstanceIds, ","))
		// }

		log.Log("InstanceCount=%v connected='%v' healthy='%v' marked='%s'", instanceCount, strings.Join(cInstanceIds, ","), strings.Join(aInstanceIds, ","), strings.Join(uInstanceIds, ","))
	}
}