Пример #1
0
// ClusterSizeCheck verifies that the number of running RabbitMQ nodes
// meets the expected cluster size. The expectation defaults to the
// CLUSTER_SIZE_EXPECTED constant and can be overridden through the
// CLUSTER_SIZE_EXPECTED environment variable.
func ClusterSizeCheck() check.ExtensionCheckResult {
	nodes, err := nodesInfo()
	if err != nil {
		return handler.Error(fmt.Sprintf("rabbitmq: %s", err.Error()))
	}

	expected := CLUSTER_SIZE_EXPECTED

	// Read the environment variable once instead of twice.
	if v := os.Getenv("CLUSTER_SIZE_EXPECTED"); v != "" {
		e, err := strconv.Atoi(v)
		if err != nil {
			return handler.Error(fmt.Sprintf("cluster-size error: %s", err.Error()))
		}
		expected = e
	}

	runningNodes := 0
	for _, node := range nodes {
		if node.IsRunning {
			runningNodes++
		}
	}

	if runningNodes < expected {
		return handler.Error(fmt.Sprintf("Cluster too small: %d", runningNodes))
	}

	return handler.Ok(fmt.Sprintf("Cluster ok: %d", runningNodes))
}
Пример #2
0
// AWSCheck lists EC2 instance statuses and reports running instances
// (state code 16) whose system or instance status checks are not "ok".
func AWSCheck() check.ExtensionCheckResult {
	client := buildEc2Client()

	r, err := client.DescribeInstanceStatus(&ec2.DescribeInstanceStatus{}, nil)
	if err != nil {
		return handler.Error(fmt.Sprintf("aws: %s", err.Error()))
	}

	failedInstances := []string{}
	for _, status := range r.InstanceStatus {
		log.Println(status.InstanceId)

		// 16 == "running"; only running instances can fail status checks.
		if status.InstanceState.Code == 16 &&
			(status.SystemStatus.Status != "ok" ||
				status.InstanceStatus.Status != "ok") {
			failedInstances = append(failedInstances, status.InstanceId)
		}
	}

	// Early return instead of else-after-return.
	if len(failedInstances) == 0 {
		return handler.Ok("Every instances are running")
	}

	return handler.Error(fmt.Sprintf(
		"Instances dead: %s",
		strings.Join(failedInstances, ","),
	))
}
Пример #3
0
// MachineCheck cross-references the machines fleet knows about with the
// /machines directory in etcd and reports etcd entries that fleet no
// longer lists, resolved to hostnames when the hostname key still exists.
func MachineCheck() check.ExtensionCheckResult {
	etcdClient := NewEtcdClient()

	fleetClient, err := NewFleetClient()
	if err != nil {
		return handler.Error(err.Error())
	}

	machines, err := fleetClient.Machines()
	if err != nil {
		return handler.Error(err.Error())
	}

	machineIDs := []string{}
	for _, m := range machines {
		machineIDs = append(machineIDs, m.ID)
	}

	r, err := etcdClient.Get("/machines", false, false)
	if err != nil {
		return handler.Error(err.Error())
	}

	missingIDs := []string{}

	for _, n := range r.Node.Nodes {
		// The machine ID is the last path segment of the etcd key.
		keySlices := strings.Split(n.Key, "/")
		id := keySlices[len(keySlices)-1]

		found := false
		for _, mid := range machineIDs {
			if id == mid {
				found = true
				break // stop scanning once matched
			}
		}

		if found {
			continue
		}

		// Resolve the missing machine's hostname; skip it silently if
		// the hostname key is gone too (best-effort lookup).
		h, err := etcdClient.Get(
			fmt.Sprintf("/machines/%s/hostname", id),
			false,
			false,
		)
		if err == nil {
			missingIDs = append(missingIDs, h.Node.Value)
		}
	}

	if len(missingIDs) > 0 {
		// Typo fix: "Misssing" -> "Missing".
		return handler.Error(
			fmt.Sprintf("Missing nodes: %s", strings.Join(missingIDs, ",")),
		)
	}

	return handler.Ok("Every nodes are up and running")
}
Пример #4
0
// ThriftCheck reads the thrift service configurations stored under
// /sensu/services in etcd, probes every service concurrently, and
// reports the ones that do not answer. Etcd endpoints come from the
// ETCD_URL environment variable (comma separated), defaulting to the
// docker bridge address.
func ThriftCheck() check.ExtensionCheckResult {
	// Read the environment variable once instead of twice.
	var machines []string
	if urls := os.Getenv("ETCD_URL"); urls != "" {
		machines = strings.Split(urls, ",")
	} else {
		machines = []string{"http://172.17.42.1:2379"}
	}

	etcdClient := etcd.NewClient(machines)

	resp, err := etcdClient.Get("/sensu/services", false, true)
	if err != nil {
		return handler.Error(fmt.Sprintf("etcd: %s", err.Error()))
	}

	failedServices := []string{}
	var wg sync.WaitGroup
	var mu sync.Mutex // guards failedServices across the prober goroutines

	for _, node := range resp.Node.Nodes {
		var config ThriftServiceConfiguration

		if err := json.Unmarshal([]byte(node.Value), &config); err != nil {
			return handler.Error(fmt.Sprintf("json: %s", err.Error()))
		}

		wg.Add(1)

		// config and node are passed as arguments so each goroutine gets
		// its own copy (required under pre-Go-1.22 loop-variable semantics).
		go func(config ThriftServiceConfiguration, node *etcd.Node) {
			defer wg.Done()
			if !checkService(config) {
				mu.Lock()
				defer mu.Unlock()
				// The service name is the last segment of the etcd key.
				parts := strings.Split(node.Key, "/")
				failedServices = append(failedServices, parts[len(parts)-1])
			}
		}(config, node)
	}

	wg.Wait()

	if len(failedServices) == 0 {
		return handler.Ok("Every thrift services are alive")
	}

	return handler.Error(fmt.Sprintf(
		"Thrift services dead: %s",
		strings.Join(failedServices, ","),
	))
}
Пример #5
0
// Check compares each running RabbitMQ node's metric (c.Method) against
// the warning and error thresholds and returns an aggregated result.
// The warning threshold defaults to c.Warning and can be overridden via
// the <TYPE>_WARNING environment variable; the error threshold is read
// through c.readErrorThreshold.
func (c *Check) Check() check.ExtensionCheckResult {
	nodes, err := nodesInfo()
	if err != nil {
		return handler.Error(fmt.Sprintf("rabbitmq: %s", err.Error()))
	}

	warning := c.Warning

	// Build the env key once, e.g. MEMORY_WARNING for a "memory" check.
	envKey := fmt.Sprintf("%s_WARNING", strings.ToUpper(c.Type))
	if v := os.Getenv(envKey); v != "" {
		w, err := strconv.Atoi(v)
		if err != nil {
			return handler.Error(fmt.Sprintf("%s warning: %s", c.Type, err.Error()))
		}
		warning = w
	}

	// Renamed from `error`, which shadowed the builtin error type.
	errorThreshold, err := c.readErrorThreshold()
	if err != nil {
		return handler.Error(fmt.Sprintf("%s error: %s", c.Type, err.Error()))
	}

	nodeError := make(map[string]int)
	nodeWarning := make(map[string]int)

	for _, node := range nodes {
		if !node.IsRunning {
			continue
		}

		// Evaluate the metric once per node instead of up to four times.
		v := c.Method(node)

		if c.Comp(v, errorThreshold) {
			nodeError[node.Name] = v
		} else if c.Comp(v, warning) {
			nodeWarning[node.Name] = v
		}
	}

	if len(nodeError) > 0 {
		return handler.Error(buildMessage(nodeError, nodeWarning))
	} else if len(nodeWarning) > 0 {
		return handler.Warning(buildMessage(nodeError, nodeWarning))
	}

	return handler.Ok("Every node are ok")
}
Пример #6
0
// UnitsStatesCheck reports fleet units whose systemd state indicates a
// failure, ignoring units whose names match the blacklist regexp
// (DefaultBlacklist, overridable through BLACKLIST_REGEXP).
func UnitsStatesCheck() check.ExtensionCheckResult {
	client, err := NewFleetClient()
	if err != nil {
		return handler.Error(err.Error())
	}

	states, err := client.UnitStates()
	if err != nil {
		return handler.Error(err.Error())
	}

	pattern := DefaultBlacklist
	if env := os.Getenv("BLACKLIST_REGEXP"); env != "" {
		pattern = env
	}

	blacklist, err := regexp.Compile(pattern)
	if err != nil {
		return handler.Error(err.Error())
	}

	failed := []string{}
	for _, state := range states {
		// Skip units the operator explicitly blacklisted.
		if blacklist.MatchString(state.Name) {
			continue
		}

		badActive := state.SystemdActiveState == "failed" ||
			state.SystemdActiveState == "inactive"
		badSub := state.SystemdSubState == "dead" ||
			state.SystemdSubState == "failed"

		if badActive || badSub {
			failed = append(failed, state.Name)
		}
	}

	if len(failed) > 0 {
		return handler.Error(
			fmt.Sprintf(
				"Failed units: %s",
				strings.Join(failed, ","),
			),
		)
	}

	return handler.Ok("Every units are up and running")
}
// Check fetches the check's current value and maps it to an error,
// warning, or ok result via the configured thresholds and comparator.
func (c *StandardCheck) Check() check.ExtensionCheckResult {
	current, err := c.Value()
	if err != nil {
		return handler.Error(err.Error())
	}

	switch {
	case c.Comp(c.ErrorThreshold, current):
		return handler.Error(c.CheckMessage(current))
	case c.Comp(c.WarningThreshold, current):
		return handler.Warning(c.CheckMessage(current))
	default:
		return handler.Ok(c.CheckMessage(current))
	}
}
Пример #8
0
// Check fetches the check's current value and classifies it against the
// warning and error thresholds (error takes precedence).
func (c *Check) Check() check.ExtensionCheckResult {
	// Plain two-value assignment replaces the original's awkward
	// `if v, err := ...; err != nil {...} else { value = v }` pattern.
	value, err := c.fetchValue()
	if err != nil {
		return handler.Error(fmt.Sprintf("%s: %s", c.Name, err.Error()))
	}

	message := fmt.Sprintf("%s: %s", c.Name, c.displayValue(value))

	if value > c.errorThreshold {
		return handler.Error(message)
	}
	if value > c.warningThreshold {
		return handler.Warning(message)
	}

	return handler.Ok(message)
}
Пример #9
0
// RestartCheck reports RabbitMQ nodes whose metric (c.Method) crosses
// the error threshold and therefore need a restart, mapping each node
// hostname to its fleet unit name.
func (c *Check) RestartCheck() check.ExtensionCheckResult {
	nodes, err := nodesInfo()
	if err != nil {
		return handler.Error(fmt.Sprintf("rabbitmq: %s", err.Error()))
	}

	threshold, err := c.readErrorThreshold()
	if err != nil {
		return handler.Error(fmt.Sprintf("%s error: %s", c.Type, err.Error()))
	}

	unitsByHost, err := nodesHostsToUnits()
	if err != nil {
		return handler.Error(fmt.Sprintf("%s error: %s", c.Type, err.Error()))
	}

	toRestart := []string{}
	for _, node := range nodes {
		// Stopped nodes cannot be measured, so they are skipped.
		if !node.IsRunning {
			continue
		}
		if !c.Comp(c.Method(node), threshold) {
			continue
		}
		// Only nodes with a known fleet unit can be restarted.
		if unit, ok := unitsByHost[node.Name]; ok {
			toRestart = append(toRestart, unit)
		}
	}

	if len(toRestart) > 0 {
		return handler.Error(c.toRestartList(toRestart))
	}

	return handler.Ok("No rmq node needs restart")
}
Пример #10
0
// UnitsCheck reports non-global fleet units whose current state differs
// from their desired state (or whose desired state is "inactive").
func UnitsCheck() check.ExtensionCheckResult {
	cl, err := NewFleetClient()
	if err != nil {
		return handler.Error(err.Error())
	}

	units, err := cl.Units()
	if err != nil {
		return handler.Error(err.Error())
	}

	wrongStates := []string{}
	for _, u := range units {
		if u.DesiredState != u.CurrentState || u.DesiredState == "inactive" {
			// Global units run everywhere and report state differently,
			// so they are excluded from this comparison.
			ju := job.Unit{Unit: *schema.MapSchemaUnitOptionsToUnitFile(u.Options)}
			if !ju.IsGlobal() {
				wrongStates = append(wrongStates, u.Name)
			}
		}
	}

	if len(wrongStates) == 0 {
		// Typo fix: "untis" -> "units".
		return handler.Ok("Every units are in their desired states")
	}

	return handler.Error(
		fmt.Sprintf(
			"Units in an incoherent state: %s",
			strings.Join(wrongStates, ","),
		),
	)
}
Пример #11
0
// EtcdGlobalCheck probes the etcd HTTP endpoint (port 2379) on every
// "core-" EC2 instance in parallel and reports instances whose
// connection was dropped ("use of closed network connection").
func EtcdGlobalCheck() check.ExtensionCheckResult {
	client := buildEc2Client()

	r, err := client.Instances([]string{}, nil)
	if err != nil {
		return handler.Error(fmt.Sprintf("aws: %s", err.Error()))
	}

	failedInstances := []string{}

	var mu sync.Mutex // guards failedInstances across the prober goroutines
	var wg sync.WaitGroup

	for _, reservation := range r.Reservations {
		for _, instance := range reservation.Instances {
			name := ""
			for _, tag := range instance.Tags {
				if tag.Key == "Name" {
					name = tag.Value
				}
			}

			if !strings.HasPrefix(name, "core-") {
				continue
			}

			log.Println(name)

			wg.Add(1)

			// BUG FIX: the original closure captured the loop variables
			// `instance` and `name`, so under pre-Go-1.22 semantics every
			// goroutine could observe the last iteration's values. Pass
			// them as arguments instead.
			go func(name, addr, instanceID string) {
				defer wg.Done()

				timeout := 15 * time.Second
				probe := http.Client{Timeout: timeout}
				resp, err := probe.Get(
					fmt.Sprintf("http://%s:2379/v2/keys", addr),
				)
				if err == nil {
					// Close the body so the connection is not leaked.
					resp.Body.Close()
					return
				}

				log.Printf("%s: %s", name, err.Error())

				// Sad, https://github.com/golang/go/issues/4373
				if strings.HasSuffix(
					err.Error(),
					"use of closed network connection",
				) {
					mu.Lock()
					defer mu.Unlock()

					failedInstances = append(failedInstances, instanceID)
				}
			}(name, instance.PrivateIpAddress, instance.InstanceId)
		}
	}

	wg.Wait()

	if len(failedInstances) == 0 {
		return handler.Ok("Every instances are running")
	}

	return handler.Error(fmt.Sprintf(
		"Instances dead: %s",
		strings.Join(failedInstances, ","),
	))
}
Пример #12
0
// EtcdCheck verifies the health of an etcd cluster: endpoints must sync,
// the leader must answer reads, the raft term must stay stable while the
// raft index advances, and every follower must keep making progress
// between two samples one second apart. Peer URLs come from the
// ETCD_PEER_URLS environment variable (comma separated).
func EtcdCheck() check.ExtensionCheckResult {
	// BUG FIX: strings.Split("", ",") returns [""], so the original
	// len(peers) == 0 guard never fired. Check the raw variable instead.
	peerURLs := os.Getenv("ETCD_PEER_URLS")
	if peerURLs == "" {
		return handler.Error("No peers provided")
	}
	peers := strings.Split(peerURLs, ",")

	client := etcd.NewClient(peers)

	if ok := client.SyncCluster(); !ok {
		return handler.Error("Cannot sync the cluster with given endpoints")
	}

	leader, stats0, err := fetchClusterStats(client.GetCluster())
	if err != nil {
		return handler.Error("Cannot reach cluster leader")
	}

	// Talk to the leader directly for the raft progress probes.
	client = etcd.NewClient([]string{leader})

	resp, err := client.Get("/", false, false)
	if err != nil {
		return handler.Error("Cannot read etcd from leader")
	}

	rt0, ri0 := resp.RaftTerm, resp.RaftIndex

	// Give raft a moment so the index has a chance to advance.
	time.Sleep(time.Second)

	resp, err = client.Get("/", false, false)
	if err != nil {
		return handler.Error("Cannot read etcd from leader")
	}

	rt1, ri1 := resp.RaftTerm, resp.RaftIndex

	// A term change means a leader election happened during the probe.
	if rt0 != rt1 {
		return handler.Error("Raft is unstable")
	}

	if ri1 == ri0 {
		return handler.Error("Raft does not make any progress")
	}

	_, stats1, err := fetchClusterStats([]string{leader})
	if err != nil {
		return handler.Error("Cannot read etcd from cluster")
	}

	// A follower is unhealthy when its success counter did not grow
	// between the two samples. (Renamed from snake_case unhealthy_nodes.)
	unhealthyNodes := []string{}
	for name, fs0 := range stats0.Followers {
		fs1 := stats1.Followers[name]
		if fs1.Counts.Success <= fs0.Counts.Success {
			unhealthyNodes = append(unhealthyNodes, name)
		}
	}

	if len(unhealthyNodes) > 0 {
		// BUG FIX: the original built this result but never returned it,
		// so unhealthy members were reported as "All members are healthy".
		return handler.Error(
			fmt.Sprintf("Members %s are unhealthy",
				strings.Join(unhealthyNodes, ",")))
	}

	return handler.Ok("All members are healthy")
}