func ClusterSizeCheck() check.ExtensionCheckResult {
	nodes, err := nodesInfo()
	if err != nil {
		return handler.Error(fmt.Sprintf("rabbitmq: %s", err.Error()))
	}

	expected := CLUSTER_SIZE_EXPECTED
	if os.Getenv("CLUSTER_SIZE_EXPECTED") != "" {
		e, err := strconv.Atoi(os.Getenv("CLUSTER_SIZE_EXPECTED"))
		if err != nil {
			return handler.Error(fmt.Sprintf("cluster-size error: %s", err.Error()))
		}
		expected = e
	}

	runningNodes := 0
	for _, node := range nodes {
		if node.IsRunning {
			runningNodes++
		}
	}

	if runningNodes < expected {
		return handler.Error(fmt.Sprintf("Cluster too small: %d", runningNodes))
	}

	return handler.Ok(fmt.Sprintf("Cluster ok: %d", runningNodes))
}
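// nodesInfo is defined elsewhere in the package. Below is a minimal sketch of
// one way it could look, assuming the RabbitMQ management API is reachable at
// RABBITMQ_MGMT_URL and that the node struct only needs a name and a running
// flag. The env var name, the struct shape, and the use of net/http plus
// encoding/json are assumptions, not the repo's actual implementation.
// Authentication against the management API is omitted in this sketch.
type nodeInfo struct {
	Name      string `json:"name"`
	IsRunning bool   `json:"running"`
	// The real struct presumably also carries the metrics consumed by the
	// threshold checks (memory, file descriptors, sockets, ...).
}

func nodesInfo() ([]nodeInfo, error) {
	base := os.Getenv("RABBITMQ_MGMT_URL") // assumption: management API base URL
	if base == "" {
		return nil, fmt.Errorf("RABBITMQ_MGMT_URL is not set")
	}

	client := http.Client{Timeout: 10 * time.Second}

	resp, err := client.Get(fmt.Sprintf("%s/api/nodes", base))
	if err != nil {
		return nil, err
	}
	defer resp.Body.Close()

	var nodes []nodeInfo
	if err := json.NewDecoder(resp.Body).Decode(&nodes); err != nil {
		return nil, err
	}

	return nodes, nil
}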
func AWSCheck() check.ExtensionCheckResult {
	client := buildEc2Client()

	r, err := client.DescribeInstanceStatus(&ec2.DescribeInstanceStatus{}, nil)
	if err != nil {
		return handler.Error(fmt.Sprintf("aws: %s", err.Error()))
	}

	failedInstances := []string{}

	for _, status := range r.InstanceStatus {
		log.Println(status.InstanceId)

		// Code 16 means the instance is running; flag it if either status
		// check is not "ok".
		if status.InstanceState.Code == 16 &&
			(status.SystemStatus.Status != "ok" ||
				status.InstanceStatus.Status != "ok") {
			failedInstances = append(failedInstances, status.InstanceId)
		}
	}

	if len(failedInstances) == 0 {
		return handler.Ok("All instances are running")
	}

	return handler.Error(fmt.Sprintf(
		"Instances dead: %s",
		strings.Join(failedInstances, ","),
	))
}
func MachineCheck() check.ExtensionCheckResult {
	etcdClient := NewEtcdClient()

	fleetClient, err := NewFleetClient()
	if err != nil {
		return handler.Error(err.Error())
	}

	machines, err := fleetClient.Machines()
	if err != nil {
		return handler.Error(err.Error())
	}

	machineIDs := []string{}
	for _, m := range machines {
		machineIDs = append(machineIDs, m.ID)
	}

	r, err := etcdClient.Get("/machines", false, false)
	if err != nil {
		return handler.Error(err.Error())
	}

	missingIDs := []string{}

	for _, n := range r.Node.Nodes {
		keySlices := strings.Split(n.Key, "/")
		id := keySlices[len(keySlices)-1]

		found := false
		for _, mid := range machineIDs {
			if id == mid {
				found = true
				break
			}
		}

		if found {
			continue
		}

		// The machine is registered in etcd but unknown to fleet; report its
		// hostname if it can still be resolved.
		h, err := etcdClient.Get(
			fmt.Sprintf("/machines/%s/hostname", id),
			false,
			false,
		)
		if err == nil {
			missingIDs = append(missingIDs, h.Node.Value)
		}
	}

	if len(missingIDs) > 0 {
		return handler.Error(
			fmt.Sprintf("Missing nodes: %s", strings.Join(missingIDs, ",")),
		)
	}

	return handler.Ok("All nodes are up and running")
}
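// NewEtcdClient is defined elsewhere. A plausible sketch, mirroring the
// ETCD_URL handling used by ThriftCheck below (same default endpoint, same
// comma-separated override), would be:
func NewEtcdClient() *etcd.Client {
	machines := []string{"http://172.17.42.1:2379"}

	if v := os.Getenv("ETCD_URL"); v != "" {
		machines = strings.Split(v, ",")
	}

	return etcd.NewClient(machines)
}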
func ThriftCheck() check.ExtensionCheckResult {
	machines := []string{}

	if os.Getenv("ETCD_URL") == "" {
		machines = append(machines, "http://172.17.42.1:2379")
	} else {
		machines = strings.Split(os.Getenv("ETCD_URL"), ",")
	}

	etcdClient := etcd.NewClient(machines)

	resp, err := etcdClient.Get("/sensu/services", false, true)
	if err != nil {
		return handler.Error(fmt.Sprintf("etcd: %s", err.Error()))
	}

	failedServices := []string{}

	var wg sync.WaitGroup
	var mu sync.Mutex

	for _, node := range resp.Node.Nodes {
		var config ThriftServiceConfiguration

		if err := json.Unmarshal([]byte(node.Value), &config); err != nil {
			return handler.Error(fmt.Sprintf("json: %s", err.Error()))
		}

		wg.Add(1)

		go func(config ThriftServiceConfiguration, node *etcd.Node) {
			defer wg.Done()

			if !checkService(config) {
				mu.Lock()
				defer mu.Unlock()

				parts := strings.Split(node.Key, "/")
				failedServices = append(failedServices, parts[len(parts)-1])
			}
		}(config, node)
	}

	wg.Wait()

	if len(failedServices) == 0 {
		return handler.Ok("All thrift services are alive")
	}

	return handler.Error(fmt.Sprintf(
		"Thrift services dead: %s",
		strings.Join(failedServices, ","),
	))
}
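// ThriftServiceConfiguration and checkService are defined elsewhere. A minimal
// sketch follows, under the assumption that each /sensu/services value stores
// a host/port pair and that "alive" simply means the TCP endpoint accepts a
// connection. The field names and the 5-second timeout are assumptions.
type ThriftServiceConfiguration struct {
	Host string `json:"host"`
	Port int    `json:"port"`
}

func checkService(config ThriftServiceConfiguration) bool {
	addr := fmt.Sprintf("%s:%d", config.Host, config.Port)

	conn, err := net.DialTimeout("tcp", addr, 5*time.Second)
	if err != nil {
		return false
	}
	conn.Close()

	return true
}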
func (c *Check) Check() check.ExtensionCheckResult {
	nodes, err := nodesInfo()
	if err != nil {
		return handler.Error(fmt.Sprintf("rabbitmq: %s", err.Error()))
	}

	warning := c.Warning
	if os.Getenv(fmt.Sprintf("%s_WARNING", strings.ToUpper(c.Type))) != "" {
		w, err := strconv.Atoi(
			os.Getenv(fmt.Sprintf("%s_WARNING", strings.ToUpper(c.Type))),
		)
		if err != nil {
			return handler.Error(fmt.Sprintf("%s warning: %s", c.Type, err.Error()))
		}
		warning = w
	}

	// Named errorThreshold rather than "error" to avoid shadowing the builtin
	// error type.
	errorThreshold, err := c.readErrorThreshold()
	if err != nil {
		return handler.Error(fmt.Sprintf("%s error: %s", c.Type, err.Error()))
	}

	nodeError := make(map[string]int)
	nodeWarning := make(map[string]int)

	for _, node := range nodes {
		if !node.IsRunning {
			continue
		}

		value := c.Method(node)

		if c.Comp(value, errorThreshold) {
			nodeError[node.Name] = value
			continue
		}

		if c.Comp(value, warning) {
			nodeWarning[node.Name] = value
			continue
		}
	}

	if len(nodeError) > 0 {
		return handler.Error(buildMessage(nodeError, nodeWarning))
	} else if len(nodeWarning) > 0 {
		return handler.Warning(buildMessage(nodeError, nodeWarning))
	}

	return handler.Ok("All nodes are ok")
}
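// buildMessage is not shown here. A plausible sketch that lists the offending
// nodes with their values, errors first, is below; the exact wording and
// format are assumptions.
func buildMessage(nodeError, nodeWarning map[string]int) string {
	parts := []string{}

	for name, v := range nodeError {
		parts = append(parts, fmt.Sprintf("%s: %d (error)", name, v))
	}
	for name, v := range nodeWarning {
		parts = append(parts, fmt.Sprintf("%s: %d (warning)", name, v))
	}

	return strings.Join(parts, ", ")
}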
func UnitsStatesCheck() check.ExtensionCheckResult {
	cl, err := NewFleetClient()
	if err != nil {
		return handler.Error(err.Error())
	}

	units, err := cl.UnitStates()
	if err != nil {
		return handler.Error(err.Error())
	}

	blackListRegexp := DefaultBlacklist
	if v := os.Getenv("BLACKLIST_REGEXP"); v != "" {
		blackListRegexp = v
	}

	reg, err := regexp.Compile(blackListRegexp)
	if err != nil {
		return handler.Error(err.Error())
	}

	wrongStates := []string{}

	for _, u := range units {
		if reg.MatchString(u.Name) {
			continue
		}

		if u.SystemdActiveState == "failed" ||
			u.SystemdActiveState == "inactive" ||
			u.SystemdSubState == "dead" ||
			u.SystemdSubState == "failed" {
			wrongStates = append(wrongStates, u.Name)
		}
	}

	if len(wrongStates) == 0 {
		return handler.Ok("All units are up and running")
	}

	return handler.Error(
		fmt.Sprintf(
			"Failed units: %s",
			strings.Join(wrongStates, ","),
		),
	)
}
func (c *StandardCheck) Check() check.ExtensionCheckResult {
	v, err := c.Value()
	if err != nil {
		return handler.Error(err.Error())
	}

	if c.Comp(c.ErrorThreshold, v) {
		return handler.Error(c.CheckMessage(v))
	} else if c.Comp(c.WarningThreshold, v) {
		return handler.Warning(c.CheckMessage(v))
	}

	return handler.Ok(c.CheckMessage(v))
}
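// The StandardCheck type itself is defined elsewhere. One plausible shape,
// consistent with how Check() uses it, is sketched below together with a
// hypothetical instantiation; the field types, the comparator semantics, and
// the fetchLoadAverage helper are assumptions.
type StandardCheck struct {
	ErrorThreshold   float64
	WarningThreshold float64
	Comp             func(threshold, value float64) bool
	Value            func() (float64, error)
	CheckMessage     func(value float64) string
}

// Hypothetical usage: warn above 4.0, alert above 8.0.
var loadCheck = &StandardCheck{
	ErrorThreshold:   8.0,
	WarningThreshold: 4.0,
	Comp:             func(threshold, value float64) bool { return value > threshold },
	Value:            fetchLoadAverage, // hypothetical helper returning (float64, error)
	CheckMessage:     func(v float64) string { return fmt.Sprintf("load average: %.2f", v) },
}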
func (c *Check) Check() check.ExtensionCheckResult {
	value, err := c.fetchValue()
	if err != nil {
		return handler.Error(fmt.Sprintf("%s: %s", c.Name, err.Error()))
	}

	message := fmt.Sprintf("%s: %s", c.Name, c.displayValue(value))

	if value > c.errorThreshold {
		return handler.Error(message)
	} else if value > c.warningThreshold {
		return handler.Warning(message)
	}

	return handler.Ok(message)
}
func (c *Check) RestartCheck() check.ExtensionCheckResult {
	nodes, err := nodesInfo()
	if err != nil {
		return handler.Error(fmt.Sprintf("rabbitmq: %s", err.Error()))
	}

	errorThreshold, err := c.readErrorThreshold()
	if err != nil {
		return handler.Error(fmt.Sprintf("%s error: %s", c.Type, err.Error()))
	}

	failedNodes := []string{}

	hostsToUnits, err := nodesHostsToUnits()
	if err != nil {
		return handler.Error(fmt.Sprintf("%s error: %s", c.Type, err.Error()))
	}

	for _, node := range nodes {
		if !node.IsRunning {
			continue
		}

		if c.Comp(c.Method(node), errorThreshold) {
			if nodeUnit, ok := hostsToUnits[node.Name]; ok {
				failedNodes = append(failedNodes, nodeUnit)
			}
		}
	}

	if len(failedNodes) > 0 {
		return handler.Error(c.toRestartList(failedNodes))
	}

	return handler.Ok("No rmq node needs restart")
}
func UnitsCheck() check.ExtensionCheckResult {
	cl, err := NewFleetClient()
	if err != nil {
		return handler.Error(err.Error())
	}

	units, err := cl.Units()
	if err != nil {
		return handler.Error(err.Error())
	}

	wrongStates := []string{}

	for _, u := range units {
		if u.DesiredState != u.CurrentState || u.DesiredState == "inactive" {
			ju := job.Unit{Unit: *schema.MapSchemaUnitOptionsToUnitFile(u.Options)}

			// Global units are intentionally ignored here.
			if !ju.IsGlobal() {
				wrongStates = append(wrongStates, u.Name)
			}
		}
	}

	if len(wrongStates) == 0 {
		return handler.Ok("All units are in their desired states")
	}

	return handler.Error(
		fmt.Sprintf(
			"Units in an incoherent state: %s",
			strings.Join(wrongStates, ","),
		),
	)
}
func EtcdGlobalCheck() check.ExtensionCheckResult {
	client := buildEc2Client()

	r, err := client.Instances([]string{}, nil)
	if err != nil {
		return handler.Error(fmt.Sprintf("aws: %s", err.Error()))
	}

	failedInstances := []string{}

	var mu sync.Mutex
	var wg sync.WaitGroup

	for _, reservation := range r.Reservations {
		for _, instance := range reservation.Instances {
			name := ""
			for _, tag := range instance.Tags {
				if tag.Key == "Name" {
					name = tag.Value
				}
			}

			if !strings.HasPrefix(name, "core-") {
				continue
			}

			log.Println(name)

			// Copy the range variable so each goroutine checks its own
			// instance instead of whichever one the loop ends on.
			instance := instance

			wg.Add(1)

			go func() {
				defer wg.Done()

				timeout := 15 * time.Second
				client := http.Client{Timeout: timeout}

				_, err := client.Get(
					fmt.Sprintf("http://%s:2379/v2/keys", instance.PrivateIpAddress),
				)
				if err != nil {
					log.Printf("%s: %s", name, err.Error())

					// Sad, https://github.com/golang/go/issues/4373
					if strings.HasSuffix(
						err.Error(),
						"use of closed network connection",
					) {
						mu.Lock()
						defer mu.Unlock()

						failedInstances = append(failedInstances, instance.InstanceId)
					}
				}
			}()
		}
	}

	wg.Wait()

	if len(failedInstances) == 0 {
		return handler.Ok("All instances are running")
	}

	return handler.Error(fmt.Sprintf(
		"Instances dead: %s",
		strings.Join(failedInstances, ","),
	))
}
func EtcdCheck() check.ExtensionCheckResult {
	// strings.Split("") returns [""], so the env var itself is checked before
	// splitting.
	peerURLs := os.Getenv("ETCD_PEER_URLS")
	if peerURLs == "" {
		return handler.Error("No peers provided")
	}

	peers := strings.Split(peerURLs, ",")

	client := etcd.NewClient(peers)
	if ok := client.SyncCluster(); !ok {
		return handler.Error("Cannot sync the cluster with given endpoints")
	}

	leader, stats0, err := fetchClusterStats(client.GetCluster())
	if err != nil {
		return handler.Error("Cannot reach cluster leader")
	}

	client = etcd.NewClient([]string{leader})

	resp, err := client.Get("/", false, false)
	if err != nil {
		return handler.Error("Cannot read etcd from leader")
	}
	rt0, ri0 := resp.RaftTerm, resp.RaftIndex

	time.Sleep(time.Second)

	resp, err = client.Get("/", false, false)
	if err != nil {
		return handler.Error("Cannot read etcd from leader")
	}
	rt1, ri1 := resp.RaftTerm, resp.RaftIndex

	if rt0 != rt1 {
		return handler.Error("Raft is unstable")
	}

	if ri1 == ri0 {
		return handler.Error("Raft does not make any progress")
	}

	_, stats1, err := fetchClusterStats([]string{leader})
	if err != nil {
		return handler.Error("Cannot read etcd from cluster")
	}

	unhealthyNodes := []string{}

	for name, fs0 := range stats0.Followers {
		fs1 := stats1.Followers[name]

		if fs1.Counts.Success <= fs0.Counts.Success {
			unhealthyNodes = append(unhealthyNodes, name)
		}
	}

	if len(unhealthyNodes) > 0 {
		return handler.Error(
			fmt.Sprintf("Members %s are unhealthy", strings.Join(unhealthyNodes, ",")),
		)
	}

	return handler.Ok("All members are healthy")
}
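// fetchClusterStats is not shown here. A sketch of one way it could work is
// below, assuming the etcd v2 stats endpoints (/v2/stats/self and
// /v2/stats/leader) and that the returned leader string is the leader's
// client URL. The struct fields mirror the v2 stats JSON; the 5-second
// timeout is an assumption.
type followerCounts struct {
	Success uint64 `json:"success"`
	Fail    uint64 `json:"fail"`
}

type followerStats struct {
	Counts followerCounts `json:"counts"`
}

type leaderStats struct {
	Leader    string                   `json:"leader"`
	Followers map[string]followerStats `json:"followers"`
}

type selfStats struct {
	State string `json:"state"`
}

// fetchClusterStats looks for the member that reports itself as leader and
// returns its client URL together with its /v2/stats/leader payload.
func fetchClusterStats(endpoints []string) (string, *leaderStats, error) {
	client := http.Client{Timeout: 5 * time.Second}

	for _, endpoint := range endpoints {
		resp, err := client.Get(endpoint + "/v2/stats/self")
		if err != nil {
			continue
		}

		var self selfStats
		err = json.NewDecoder(resp.Body).Decode(&self)
		resp.Body.Close()
		if err != nil || self.State != "StateLeader" {
			continue
		}

		resp, err = client.Get(endpoint + "/v2/stats/leader")
		if err != nil {
			continue
		}

		var stats leaderStats
		err = json.NewDecoder(resp.Body).Decode(&stats)
		resp.Body.Close()
		if err != nil {
			continue
		}

		return endpoint, &stats, nil
	}

	return "", nil, fmt.Errorf("no leader found among %d endpoints", len(endpoints))
}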