// Called on running checks, to determine if they have finished // running. // // If the Check has not finished executing, returns false. // // If the Check has been running for longer than its Timeout, // a SIGTERM (and failing that a SIGKILL) is issued to forcibly // terminate the rogue Check process. In either case, this returns // as if the check has not yet finished, and Reap() will need to be // called again to fully reap the Check // // If the Check has finished execution (on its own, or via forced // termination), it will return true. // // Once complete, some additional meta-stats for the check execution // are appended to the check output, to be submit up to bolo func (self *Check) Reap() bool { pid := self.process.Process.Pid var ws syscall.WaitStatus status, err := syscall.Wait4(pid, &ws, syscall.WNOHANG, nil) if err != nil { log.Error("Error waiting on check %s[%d]: %s", self.Name, pid, err.Error()) return false } if status == 0 { // self to see if we need to sigkill due to failed sigterm if time.Now().After(self.started_at.Add(time.Duration(self.Timeout+2) * time.Second)) { log.Warn("Check %s[%d] has been running too long, sending SIGKILL", self.Name, pid) if err := syscall.Kill(pid, syscall.SIGKILL); err != nil { log.Error("Error sending SIGKILL to check %s[%d]: %s", self.Name, pid, err.Error()) } self.sig_kill = true } // self to see if we need to sigterm due to self timeout expiry if !self.sig_kill && time.Now().After(self.started_at.Add(time.Duration(self.Timeout)*time.Second)) { log.Warn("Check %s[%d] has been running too long, sending SIGTERM", self.Name, pid) if err := syscall.Kill(pid, syscall.SIGTERM); err != nil { log.Error("Error sending SIGTERM to check %s[%d]: %s", self.Name, pid, err.Error()) } self.sig_term = true } return false } self.ended_at = time.Now() self.running = false self.duration = time.Since(self.started_at) self.latency = self.started_at.Sub(self.next_run) self.output = string(self.stdout.Bytes()) self.err_msg = string(self.stderr.Bytes()) if ws.Exited() { self.rc = ws.ExitStatus() } else { log.Debug("Check %s[%d] exited abnormally (signaled/stopped). Setting rc to UNKNOWN", self.Name, pid) self.rc = UNKNOWN } if self.rc > UNKNOWN { log.Debug("Check %s[%d] returned with an invalid exit code. Setting rc to UNKOWN", self.Name, pid) self.rc = UNKNOWN } self.reschedule() if self.ended_at.After(self.next_run) { timeout_triggered := "not reached" if self.sig_term || self.sig_kill { timeout_triggered = "reached" } log.Warn("Check %s[%d] took %0.3f seconds to run, at interval %d (timeout of %d was %s)", self.Name, pid, self.duration.Seconds(), self.Every, self.Timeout, timeout_triggered) } return true }
// Performs heuristics to determine the hostname of the current host. // Tries os.Hostname(), and if that isn't fully qualified (contains a '.'), // Fails over to finding the first hostname for the first IP of the host // that contains a '.'. If none do, fails back to the unqualified hostname. func hostname() string { h, err := os_hostname() if err != nil { log.Error("Couldn't get hostname for current host: %s", err.Error()) return "unknown" } if strings.ContainsRune(h, '.') { return h } addrs, err := net_lookuphost(h) if err != nil { log.Warn("Couldn't resolve FQDN of host: %s", err.Error()) return h } if len(addrs) > 0 { names, err := net_lookupaddr(addrs[0]) if err != nil { log.Warn("Couldn't resolve FQDN of host: %s", err.Error()) return h } for _, name := range names { if strings.ContainsRune(name, '.') { return name } } } log.Warn("No FQDN resolvable, defaulting to unqualified hostname") return h }
func (self *Check) Fail(failure error) error { log.Error("Error running check \"%s\": %s", self.Name, failure.Error()) var err error self.rc = 3 self.reschedule() if self.Report == "true" { if self.Bulk == "true" || self.attempts >= self.Retries { msg := fmt.Sprintf("STATE %d %s:bmad:%s %d %s", time.Now().Unix(), cfg.Host, self.Name, self.rc, "failed to exec: "+failure.Error()) err = SendToBolo(msg) } } return err }
// Loads a YAML config file specified by cfg_file, and returns // a Config object representing that config. Config reloads are // auto-detected and handled seemlessly. func LoadConfig(cfg_file string) (*Config, error) { new_cfg := default_config() source, err := ioutil.ReadFile(cfg_file) if err != nil { return cfg, err } err = goyaml.Unmarshal(source, &new_cfg) if err != nil { return cfg, err } if new_cfg.Include_dir != "" { log.Debug("Loading auxillary configs from %s", new_cfg.Include_dir) files, err := filepath.Glob(new_cfg.Include_dir + "/*.conf") if err != nil { log.Warn("Couldn't find include files: %s", err.Error()) } else { for _, file := range files { log.Debug("Loading auxillary config: %s", file) source, err := ioutil.ReadFile(file) if err != nil { log.Warn("Couldn't read %q: %s", file, err.Error()) continue } checks := map[string]*Check{} err = goyaml.Unmarshal(source, &checks) if err != nil { log.Warn("Could not parse yaml from %q: %s", file, err.Error()) continue } for name, check := range checks { if _, exists := new_cfg.Checks[name]; exists { log.Warn("Check %q defined in multiple config files, ignoring definition in %s", name, file) continue } new_cfg.Checks[name] = check } } } } for name, check := range new_cfg.Checks { if err := initialize_check(name, check, new_cfg); err != nil { log.Error("Invalid check config for %s: %s (skipping)", name, err.Error()) delete(new_cfg.Checks, name) continue } if cfg != nil { if val, ok := cfg.Checks[check.Name]; ok { merge_checks(check, val) } } log.Debug("Check %s defined as %#v", check.Name, check) } cfg = new_cfg log.SetupLogging(cfg.Log) log.Debug("Config successfully loaded as: %#v", cfg) return cfg, nil }