// Called on running checks, to determine if they have finished // running. // // If the Check has not finished executing, returns false. // // If the Check has been running for longer than its Timeout, // a SIGTERM (and failing that a SIGKILL) is issued to forcibly // terminate the rogue Check process. In either case, this returns // as if the check has not yet finished, and Reap() will need to be // called again to fully reap the Check // // If the Check has finished execution (on its own, or via forced // termination), it will return true. // // Once complete, some additional meta-stats for the check execution // are appended to the check output, to be submit up to bolo func (self *Check) Reap() bool { pid := self.process.Process.Pid var ws syscall.WaitStatus status, err := syscall.Wait4(pid, &ws, syscall.WNOHANG, nil) if err != nil { log.Error("Error waiting on check %s[%d]: %s", self.Name, pid, err.Error()) return false } if status == 0 { // self to see if we need to sigkill due to failed sigterm if time.Now().After(self.started_at.Add(time.Duration(self.Timeout+2) * time.Second)) { log.Warn("Check %s[%d] has been running too long, sending SIGKILL", self.Name, pid) if err := syscall.Kill(pid, syscall.SIGKILL); err != nil { log.Error("Error sending SIGKILL to check %s[%d]: %s", self.Name, pid, err.Error()) } self.sig_kill = true } // self to see if we need to sigterm due to self timeout expiry if !self.sig_kill && time.Now().After(self.started_at.Add(time.Duration(self.Timeout)*time.Second)) { log.Warn("Check %s[%d] has been running too long, sending SIGTERM", self.Name, pid) if err := syscall.Kill(pid, syscall.SIGTERM); err != nil { log.Error("Error sending SIGTERM to check %s[%d]: %s", self.Name, pid, err.Error()) } self.sig_term = true } return false } self.ended_at = time.Now() self.running = false self.duration = time.Since(self.started_at) self.latency = self.started_at.Sub(self.next_run) self.output = string(self.stdout.Bytes()) self.err_msg = string(self.stderr.Bytes()) if ws.Exited() { self.rc = ws.ExitStatus() } else { log.Debug("Check %s[%d] exited abnormally (signaled/stopped). Setting rc to UNKNOWN", self.Name, pid) self.rc = UNKNOWN } if self.rc > UNKNOWN { log.Debug("Check %s[%d] returned with an invalid exit code. Setting rc to UNKOWN", self.Name, pid) self.rc = UNKNOWN } self.reschedule() if self.ended_at.After(self.next_run) { timeout_triggered := "not reached" if self.sig_term || self.sig_kill { timeout_triggered = "reached" } log.Warn("Check %s[%d] took %0.3f seconds to run, at interval %d (timeout of %d was %s)", self.Name, pid, self.duration.Seconds(), self.Every, self.Timeout, timeout_triggered) } return true }
// Does the needful to kick off a check. This will set the // environment variables, pwd, effective user/group, hook // up buffers for grabbing check output, run the process, // and fill out accounting data for the check. func (self *Check) Spawn() error { if self.running { return errors.New(fmt.Sprintf("check %s[%d] is already running", self.Name, self.process.Process.Pid)) } process := exec.Command(self.cmd_args[0], self.cmd_args[1:]...) process.Env = self.environment() process.Dir = "/" var o bytes.Buffer var e bytes.Buffer process.Stdout = &o process.Stderr = &e self.output = "" self.err_msg = "" // Reset started_at as soon as possible after determining there isn't // a check already running. This way, if there are errors we can // back-off rescheduling, rather than try every tick, for relatively // long-term fixes (user creation, file creation/renames/permissions) self.started_at = time.Now() if self.Run_as != "" { u, err := user.Lookup(self.Run_as) if err != nil { return err } uid, err := strconv.ParseUint(u.Uid, 10, 32) if err != nil { return err } gid, err := strconv.ParseUint(u.Gid, 10, 32) if err != nil { return err } log.Debug("Running check %s as %q", self.Name, self.Run_as) process.SysProcAttr = &syscall.SysProcAttr{ Credential: &syscall.Credential{Uid: uint32(uid), Gid: uint32(gid)}, } } if err := process.Start(); err != nil { return err } log.Debug("Spawned check %s[%d]", self.Name, process.Process.Pid) self.running = true self.process = process self.stdout = &o self.stderr = &e self.sig_term = false self.sig_kill = false self.ended_at = time.Time{} self.duration = 0 return nil }
// Submits check results to bolo. This will append meta-stats // to the checks as well, for bmad (like checks run, execution // time, check latency). If the check has Bulk and Report both set // to "true", it will report a STATE for the bulk check's execution. // If the bulk check failed, any output to stderr will be included // in the status message. // // If full_stats is set to false, the latency, and count of checks run // will *NOT* be reported. This is primarily used internally // for reporting stats differently for run-once mode vs daemonized. func (self *Check) Submit(full_stats bool) error { // Add meta-stats for bmad var meta string var msg string if self.Bulk == "true" && self.Report == "true" { // check-specific state (for bulk data-submitter checks) if self.rc == OK { msg = self.Name + " completed successfully!" } else { msg = strings.Replace(self.err_msg, "\n", " ", -1) } meta = fmt.Sprintf("STATE %d %s:bmad:%s %d %s", time.Now().Unix(), cfg.Host, self.Name, self.rc, msg) } // check-specific runtime meta = fmt.Sprintf("%s\nSAMPLE %d %s:bmad:%s:exec-time %0.4f", meta, time.Now().Unix(), cfg.Host, self.Name, self.duration.Seconds()) // bmad avg check runtime meta = fmt.Sprintf("%s\nSAMPLE %d %s:bmad:exec-time %0.4f", meta, time.Now().Unix(), cfg.Host, self.duration.Seconds()) if full_stats { // bmad avg check latency meta = fmt.Sprintf("%s\nSAMPLE %d %s:bmad:latency %0.4f", meta, time.Now().Unix(), cfg.Host, self.latency.Seconds()) // bmad overall check throughput measurement meta = fmt.Sprintf("%s\nCOUNTER %d %s:bmad:checks", meta, time.Now().Unix(), cfg.Host) } meta = meta + "\n" log.Debug("%s output: %s", self.Name, self.output) var err error if self.Bulk == "true" || self.attempts >= self.Retries { err = SendToBolo(fmt.Sprintf("%s\n%s", self.output, meta)) } else { log.Debug("%s not yet at max attempts, suppressing output submission", self.Name) err = SendToBolo(meta) } if err != nil { return err } return nil }
// Disconnects from bolo (terminates the send_bolo process) // If send_bolo is no longer running, does nothing. func DisconnectFromBolo() { if send2bolo == nil { log.Warn("Bolo disconnect requested, but send_bolo is not running") return } pid := send2bolo.Process.Pid if err := syscall.Kill(pid, syscall.SIGTERM); err != nil { log.Debug("send_bolo[%d] already terminated", pid) } send2bolo = nil }
// Launches a child process to hold open a ZMQ connection // to the upstream bolo server (send_bolo should take care // of the configuration for how to connect). Upon termination // this process will be respawned. // // Currently, if the send_bolo configuration directive for bmad // is updated on a config reload, the send_bolo process will not // be respawned. A full-daemon restart is required to make use // of the new send_bolo configuration func ConnectToBolo() error { args, err := shellwords.Parse(cfg.Send_bolo) if err != nil { return err } log.Debug("Spawning bolo submitter: %#v", args) send2bolo = exec.Command(args[0], args[1:]...) r, w, err := os.Pipe() if err != nil { return err } send2bolo.Stdin = r writer = w err = send2bolo.Start() if err != nil { writer = nil send2bolo = nil return err } log.Debug("send_bolo: %#v", send2bolo) go func() { send2bolo.Wait(); send2bolo = nil }() return nil }
// Loads a YAML config file specified by cfg_file, and returns // a Config object representing that config. Config reloads are // auto-detected and handled seemlessly. func LoadConfig(cfg_file string) (*Config, error) { new_cfg := default_config() source, err := ioutil.ReadFile(cfg_file) if err != nil { return cfg, err } err = goyaml.Unmarshal(source, &new_cfg) if err != nil { return cfg, err } if new_cfg.Include_dir != "" { log.Debug("Loading auxillary configs from %s", new_cfg.Include_dir) files, err := filepath.Glob(new_cfg.Include_dir + "/*.conf") if err != nil { log.Warn("Couldn't find include files: %s", err.Error()) } else { for _, file := range files { log.Debug("Loading auxillary config: %s", file) source, err := ioutil.ReadFile(file) if err != nil { log.Warn("Couldn't read %q: %s", file, err.Error()) continue } checks := map[string]*Check{} err = goyaml.Unmarshal(source, &checks) if err != nil { log.Warn("Could not parse yaml from %q: %s", file, err.Error()) continue } for name, check := range checks { if _, exists := new_cfg.Checks[name]; exists { log.Warn("Check %q defined in multiple config files, ignoring definition in %s", name, file) continue } new_cfg.Checks[name] = check } } } } for name, check := range new_cfg.Checks { if err := initialize_check(name, check, new_cfg); err != nil { log.Error("Invalid check config for %s: %s (skipping)", name, err.Error()) delete(new_cfg.Checks, name) continue } if cfg != nil { if val, ok := cfg.Checks[check.Name]; ok { merge_checks(check, val) } } log.Debug("Check %s defined as %#v", check.Name, check) } cfg = new_cfg log.SetupLogging(cfg.Log) log.Debug("Config successfully loaded as: %#v", cfg) return cfg, nil }