Exemplo n.º 1
0
Arquivo: checks.go Projeto: qanx/bmad
// Reap is called on running checks to determine whether they have
// finished running.
//
// The check's process is polled with a non-blocking wait4(2):
//
// If the wait itself fails, the error is logged and false is returned.
//
// If the process has not finished executing, false is returned. Once
// the check has been running for longer than Timeout seconds, a single
// SIGTERM is sent to it. Once it has been running for longer than
// Timeout+2 seconds, a SIGKILL is sent on every call until the process
// has gone away. In either case this returns as if the check has not
// yet finished, and Reap() needs to be called again to fully reap it.
//
// If the process has finished (on its own, or via forced termination),
// the accounting fields of the check (ended_at, duration, latency,
// output, err_msg and rc) are filled in, reschedule() is called, and
// true is returned. Abnormal terminations and exit codes greater than
// UNKNOWN are both recorded as UNKNOWN.
func (self *Check) Reap() bool {
	pid := self.process.Process.Pid

	var ws syscall.WaitStatus
	status, err := syscall.Wait4(pid, &ws, syscall.WNOHANG, nil)
	if err != nil {
		log.Error("Error waiting on check %s[%d]: %s", self.Name, pid, err.Error())
		return false
	}
	if status == 0 {
		// With WNOHANG, a zero pid means the child has not changed state.
		// Sample the clock once so both deadlines are judged consistently.
		now := time.Now()
		term_deadline := self.started_at.Add(time.Duration(self.Timeout) * time.Second)
		kill_deadline := self.started_at.Add(time.Duration(self.Timeout+2) * time.Second)

		switch {
		case now.After(kill_deadline):
			// SIGTERM was not enough; keep sending SIGKILL until reaped.
			log.Warn("Check %s[%d] has been running too long, sending SIGKILL", self.Name, pid)
			if err := syscall.Kill(pid, syscall.SIGKILL); err != nil {
				log.Error("Error sending SIGKILL to check %s[%d]: %s", self.Name, pid, err.Error())
			}
			self.sig_kill = true
		case !self.sig_term && !self.sig_kill && now.After(term_deadline):
			// Only ask politely once; repeating SIGTERM on every call just
			// spams the log and re-interrupts the check's own cleanup.
			log.Warn("Check %s[%d] has been running too long, sending SIGTERM", self.Name, pid)
			if err := syscall.Kill(pid, syscall.SIGTERM); err != nil {
				log.Error("Error sending SIGTERM to check %s[%d]: %s", self.Name, pid, err.Error())
			}
			self.sig_term = true
		}
		return false
	}

	self.ended_at = time.Now()
	self.running = false
	self.duration = time.Since(self.started_at)
	// Latency is how late the check started relative to its scheduled run.
	self.latency = self.started_at.Sub(self.next_run)
	self.output = self.stdout.String()
	self.err_msg = self.stderr.String()

	if ws.Exited() {
		self.rc = ws.ExitStatus()
	} else {
		log.Debug("Check %s[%d] exited abnormally (signaled/stopped). Setting rc to UNKNOWN", self.Name, pid)
		self.rc = UNKNOWN
	}
	if self.rc > UNKNOWN {
		log.Debug("Check %s[%d] returned with an invalid exit code. Setting rc to UNKOWN", self.Name, pid)
		self.rc = UNKNOWN
	}

	self.reschedule()

	// Compared against next_run as it stands after reschedule().
	if self.ended_at.After(self.next_run) {
		timeout_triggered := "not reached"
		if self.sig_term || self.sig_kill {
			timeout_triggered = "reached"
		}
		log.Warn("Check %s[%d] took %0.3f seconds to run, at interval %d (timeout of %d was %s)",
			self.Name, pid, self.duration.Seconds(), self.Every, self.Timeout, timeout_triggered)
	}
	return true
}
Exemplo n.º 2
0
Arquivo: checks.go Projeto: qanx/bmad
// Spawn starts a run of the check. It builds the command from cmd_args
// with the environment from environment() and "/" as working directory,
// attaches fresh buffers to the command's stdout and stderr, optionally
// switches to the uid/gid of the Run_as user, starts the process, and
// resets the check's accounting fields for the new run.
//
// An error is returned if the check is already running, if the Run_as
// user cannot be looked up or has a non-numeric uid/gid, or if the
// process cannot be started.
func (self *Check) Spawn() error {
	if self.running {
		msg := fmt.Sprintf("check %s[%d] is already running", self.Name, self.process.Process.Pid)
		return errors.New(msg)
	}

	stdout_buf := new(bytes.Buffer)
	stderr_buf := new(bytes.Buffer)

	cmd := exec.Command(self.cmd_args[0], self.cmd_args[1:]...)
	cmd.Env = self.environment()
	cmd.Dir = "/"
	cmd.Stdout = stdout_buf
	cmd.Stderr = stderr_buf

	// Clear the results of the previous run.
	self.output = ""
	self.err_msg = ""

	// started_at is reset before anything below can fail, so that a
	// failing spawn (missing user, missing file, bad permissions, ...)
	// backs off like a normal run instead of being retried every tick.
	self.started_at = time.Now()

	if self.Run_as != "" {
		account, err := user.Lookup(self.Run_as)
		if err != nil {
			return err
		}
		uid, err := strconv.ParseUint(account.Uid, 10, 32)
		if err != nil {
			return err
		}
		gid, err := strconv.ParseUint(account.Gid, 10, 32)
		if err != nil {
			return err
		}
		log.Debug("Running check %s as %q", self.Name, self.Run_as)
		creds := &syscall.Credential{Uid: uint32(uid), Gid: uint32(gid)}
		cmd.SysProcAttr = &syscall.SysProcAttr{
			Credential: creds,
		}
	}

	err := cmd.Start()
	if err != nil {
		return err
	}
	log.Debug("Spawned check %s[%d]", self.Name, cmd.Process.Pid)

	// The process is up; record it and reset per-run accounting.
	self.process = cmd
	self.stdout = stdout_buf
	self.stderr = stderr_buf
	self.running = true
	self.sig_term, self.sig_kill = false, false
	self.ended_at = time.Time{}
	self.duration = 0

	return nil
}
Exemplo n.º 3
0
Arquivo: checks.go Projeto: qanx/bmad
// Submit sends the results of the check to bolo via SendToBolo,
// together with meta-stats about the run itself.
//
// The meta-stats always contain two exec-time SAMPLEs (one for this
// check, one for bmad overall). If the check has Bulk and Report both
// set to "true", a STATE line for the bulk check's execution precedes
// them; its message is a success notice when rc is OK, and otherwise
// the check's stderr with newlines replaced by spaces.
//
// If full_stats is true, a latency SAMPLE and a checks COUNTER are
// added as well; if it is false they are *NOT* reported.
//
// The check's own output is only included for bulk checks, or once
// attempts has reached Retries; otherwise just the meta-stats are sent.
// The error from SendToBolo, if any, is returned.
func (self *Check) Submit(full_stats bool) error {
	is_bulk := self.Bulk == "true"

	// Add meta-stats for bmad
	meta := ""
	if is_bulk && self.Report == "true" {
		// check-specific state (for bulk data-submitter checks)
		msg := strings.Replace(self.err_msg, "\n", " ", -1)
		if self.rc == OK {
			msg = self.Name + " completed successfully!"
		}
		meta = fmt.Sprintf("STATE %d %s:bmad:%s %d %s",
			time.Now().Unix(), cfg.Host, self.Name, self.rc, msg)
	}

	// check-specific runtime
	meta = fmt.Sprintf("%s\nSAMPLE %d %s:bmad:%s:exec-time %0.4f",
		meta, time.Now().Unix(), cfg.Host, self.Name, self.duration.Seconds())
	// bmad avg check runtime
	meta = fmt.Sprintf("%s\nSAMPLE %d %s:bmad:exec-time %0.4f",
		meta, time.Now().Unix(), cfg.Host, self.duration.Seconds())

	if full_stats {
		// bmad avg check latency
		meta = fmt.Sprintf("%s\nSAMPLE %d %s:bmad:latency %0.4f",
			meta, time.Now().Unix(), cfg.Host, self.latency.Seconds())
		// bmad overall check throughput measurement
		meta = fmt.Sprintf("%s\nCOUNTER %d %s:bmad:checks",
			meta, time.Now().Unix(), cfg.Host)
	}
	meta += "\n"

	log.Debug("%s output: %s", self.Name, self.output)

	if is_bulk || self.attempts >= self.Retries {
		return SendToBolo(fmt.Sprintf("%s\n%s", self.output, meta))
	}

	log.Debug("%s not yet at max attempts, suppressing output submission", self.Name)
	return SendToBolo(meta)
}
Exemplo n.º 4
0
// DisconnectFromBolo disconnects from bolo by sending SIGTERM to the
// send_bolo process and dropping the reference to it. If no send_bolo
// process is being tracked, it only logs a warning.
func DisconnectFromBolo() {
	if send2bolo != nil {
		child := send2bolo.Process.Pid
		err := syscall.Kill(child, syscall.SIGTERM)
		if err != nil {
			// Any failure to signal is treated as the process being gone.
			log.Debug("send_bolo[%d] already terminated", child)
		}
		send2bolo = nil
		return
	}

	log.Warn("Bolo disconnect requested, but send_bolo is not running")
}
Exemplo n.º 5
0
// ConnectToBolo launches the send_bolo child process described by the
// cfg.Send_bolo directive (send_bolo should take care of the
// configuration for how to connect to the upstream bolo server). The
// child's stdin is the read end of a pipe, whose write end is stored
// in writer. A background goroutine waits for the child and clears
// send2bolo once it has exited.
//
// An error is returned if the directive cannot be parsed, is empty,
// if the pipe cannot be created, or if the process fails to start.
// send2bolo is only set once the process has actually been started;
// on a failed start both send2bolo and writer are reset to nil.
//
// Currently, if the send_bolo configuration directive for bmad
// is updated on a config reload, the send_bolo process will not
// be respawned. A full-daemon restart is required to make use
// of the new send_bolo configuration
func ConnectToBolo() error {
	args, err := shellwords.Parse(cfg.Send_bolo)
	if err != nil {
		return err
	}
	if len(args) == 0 {
		// An empty directive would otherwise panic on args[0] below.
		return &exec.Error{Name: cfg.Send_bolo, Err: exec.ErrNotFound}
	}
	log.Debug("Spawning bolo submitter:  %#v", args)

	r, w, err := os.Pipe()
	if err != nil {
		return err
	}

	// Work on a local handle, and publish it only after a successful
	// start, so the global never points at a process that is not running.
	cmd := exec.Command(args[0], args[1:]...)
	cmd.Stdin = r
	if err := cmd.Start(); err != nil {
		// Nothing will ever use this pipe; do not leak its descriptors.
		r.Close()
		w.Close()
		writer = nil
		send2bolo = nil
		return err
	}
	// The child has its own copy of the read end. Closing ours releases
	// the descriptor, and makes writes fail once the child is gone rather
	// than silently filling the pipe.
	r.Close()

	send2bolo = cmd
	writer = w
	log.Debug("send_bolo: %#v", send2bolo)

	go func() {
		// Wait on the command this goroutine was started for, not on
		// whatever the global happens to point at later.
		cmd.Wait()
		// Leave the global alone if a newer process has replaced ours.
		if send2bolo == cmd {
			send2bolo = nil
		}
	}()
	return nil
}
Exemplo n.º 6
0
Arquivo: config.go Projeto: qanx/bmad
// LoadConfig reads the YAML config file cfg_file on top of the values
// from default_config(), and installs the result as the active
// configuration (cfg), which is also returned.
//
// If the config names an Include_dir, every *.conf file in it is read
// as a YAML map of check name to check definition and added to the
// config's checks. Files that cannot be read or parsed are skipped
// with a warning, as are checks whose name is already defined.
//
// Each check is then passed to initialize_check(); checks it rejects
// are logged and dropped. On a reload (a previous cfg exists), a check
// that was already known under the same name is handed to
// merge_checks() along with its previous definition.
//
// If the main config file cannot be read or parsed, the previously
// active configuration is returned unchanged along with the error.
func LoadConfig(cfg_file string) (*Config, error) {
	new_cfg := default_config()

	raw, err := ioutil.ReadFile(cfg_file)
	if err != nil {
		return cfg, err
	}
	if err = goyaml.Unmarshal(raw, &new_cfg); err != nil {
		return cfg, err
	}

	if include_dir := new_cfg.Include_dir; include_dir != "" {
		log.Debug("Loading auxillary configs from %s", include_dir)
		conf_files, glob_err := filepath.Glob(include_dir + "/*.conf")
		if glob_err != nil {
			log.Warn("Couldn't find include files: %s", glob_err.Error())
			conf_files = nil
		}

		for _, conf_file := range conf_files {
			log.Debug("Loading auxillary config: %s", conf_file)
			contents, read_err := ioutil.ReadFile(conf_file)
			if read_err != nil {
				log.Warn("Couldn't read %q: %s", conf_file, read_err.Error())
				continue
			}

			extra := map[string]*Check{}
			if parse_err := goyaml.Unmarshal(contents, &extra); parse_err != nil {
				log.Warn("Could not parse yaml from %q: %s", conf_file, parse_err.Error())
				continue
			}

			// The first definition of a check name wins.
			for name, check := range extra {
				if _, dup := new_cfg.Checks[name]; !dup {
					new_cfg.Checks[name] = check
					continue
				}
				log.Warn("Check %q defined in multiple config files, ignoring definition in %s", name, conf_file)
			}
		}
	}

	reloading := cfg != nil
	for name, check := range new_cfg.Checks {
		init_err := initialize_check(name, check, new_cfg)
		if init_err != nil {
			log.Error("Invalid check config for %s: %s (skipping)", name, init_err.Error())
			delete(new_cfg.Checks, name)
			continue
		}

		if reloading {
			if previous, known := cfg.Checks[check.Name]; known {
				merge_checks(check, previous)
			}
		}
		log.Debug("Check %s defined as %#v", check.Name, check)
	}

	cfg = new_cfg
	log.SetupLogging(cfg.Log)
	log.Debug("Config successfully loaded as: %#v", cfg)
	return cfg, nil
}