Пример #1
0
// Reap polls a running check's process, without blocking, to determine
// whether it has finished.
//
// If the process has not finished, Reap returns false.
//
// If the check has been running for longer than its Timeout (in seconds),
// a SIGTERM is sent to the process; once it is more than two seconds past
// the Timeout a SIGKILL is sent instead. In either case Reap still returns
// false, and must be called again to actually collect the terminated
// process.
//
// If waiting on the process fails, the error is logged and Reap returns
// false.
//
// Once the process has been collected (having finished on its own, or via
// forced termination), Reap records the end time, duration, latency,
// stdout/stderr contents and return code on the check, reschedules it, and
// returns true. A process that did not exit normally, or that exited with
// a code greater than UNKNOWN, is recorded with a return code of UNKNOWN.
func (self *Check) Reap() bool {
	pid := self.process.Process.Pid

	var ws syscall.WaitStatus
	// With WNOHANG, Wait4 returns 0 rather than blocking while the child
	// is still running.
	reaped, err := syscall.Wait4(pid, &ws, syscall.WNOHANG, nil)
	if err != nil {
		// NOTE(review): if Wait4 keeps failing (e.g. the child was already
		// collected elsewhere), this check is never marked as finished --
		// confirm that callers can cope with that.
		log.Error("Error waiting on check %s[%d]: %s", self.Name, pid, err.Error())
		return false
	}

	if reaped == 0 {
		// Still running. Read the clock once so that both deadlines are
		// judged against the same instant.
		now := time.Now()
		term_deadline := self.started_at.Add(time.Duration(self.Timeout) * time.Second)
		kill_deadline := self.started_at.Add(time.Duration(self.Timeout+2) * time.Second)

		switch {
		case now.After(kill_deadline):
			// check to see if we need to sigkill due to failed sigterm
			log.Warn("Check %s[%d] has been running too long, sending SIGKILL", self.Name, pid)
			if err := syscall.Kill(pid, syscall.SIGKILL); err != nil {
				log.Error("Error sending SIGKILL to check %s[%d]: %s", self.Name, pid, err.Error())
			}
			self.sig_kill = true
		case !self.sig_kill && now.After(term_deadline):
			// check to see if we need to sigterm due to check timeout expiry
			log.Warn("Check %s[%d] has been running too long, sending SIGTERM", self.Name, pid)
			if err := syscall.Kill(pid, syscall.SIGTERM); err != nil {
				log.Error("Error sending SIGTERM to check %s[%d]: %s", self.Name, pid, err.Error())
			}
			self.sig_term = true
		}
		return false
	}

	// The process has been collected; record the results of this run.
	self.ended_at = time.Now()
	self.running = false
	// Derive the duration from ended_at so the two always agree.
	self.duration = self.ended_at.Sub(self.started_at)
	self.latency = self.started_at.Sub(self.next_run)
	self.output = string(self.stdout.Bytes())
	self.err_msg = string(self.stderr.Bytes())

	if ws.Exited() {
		self.rc = ws.ExitStatus()
	} else {
		log.Debug("Check %s[%d] exited abnormally (signaled/stopped). Setting rc to UNKNOWN", self.Name, pid)
		self.rc = UNKNOWN
	}
	if self.rc > UNKNOWN {
		log.Debug("Check %s[%d] returned with an invalid exit code. Setting rc to UNKNOWN", self.Name, pid)
		self.rc = UNKNOWN
	}

	self.reschedule()

	// Warn when the run ended after next_run.
	// NOTE(review): presumably reschedule() has just advanced next_run, so
	// this flags checks that overran their interval -- confirm.
	if self.ended_at.After(self.next_run) {
		timeout_triggered := "not reached"
		if self.sig_term || self.sig_kill {
			timeout_triggered = "reached"
		}
		log.Warn("Check %s[%d] took %0.3f seconds to run, at interval %d (timeout of %d was %s)",
			self.Name, pid, self.duration.Seconds(), self.Every, self.Timeout, timeout_triggered)
	}
	return true
}
Пример #2
0
// hostname heuristically determines the name of the current host.
//
// It starts with os_hostname(). If that name contains a '.', it is treated
// as fully qualified and returned as-is. Otherwise the name is resolved
// with net_lookuphost(), the first resulting address is reverse-resolved
// with net_lookupaddr(), and the first reverse name that contains a '.' is
// returned. If no such name is found, or either lookup fails, it falls back
// to the unqualified hostname.
//
// Reverse lookups commonly return rooted names with a trailing '.' (for
// example "host.example.com." or "localhost."). That root dot is stripped
// before a name is tested or returned, so an unqualified "localhost." is
// not mistaken for an FQDN and results never carry a trailing dot.
//
// Returns "unknown" if os_hostname() itself fails.
func hostname() string {
	h, err := os_hostname()
	if err != nil {
		log.Error("Couldn't get hostname for current host: %s", err.Error())
		return "unknown"
	}
	if strings.ContainsRune(h, '.') {
		return h
	}

	addrs, err := net_lookuphost(h)
	if err != nil {
		log.Warn("Couldn't resolve FQDN of host: %s", err.Error())
		return h
	}
	if len(addrs) > 0 {
		// Only the first address is consulted.
		names, err := net_lookupaddr(addrs[0])
		if err != nil {
			log.Warn("Couldn't resolve FQDN of host: %s", err.Error())
			return h
		}
		for _, name := range names {
			// Drop the DNS root dot so it does not count as a domain
			// separator.
			fqdn := strings.TrimSuffix(name, ".")
			if strings.ContainsRune(fqdn, '.') {
				return fqdn
			}
		}
	}

	log.Warn("No FQDN resolvable, defaulting to unqualified hostname")
	return h
}
Пример #3
0
// Fail records that the check could not be executed.
//
// The failure is logged, the check's return code is set to 3, and the
// check is rescheduled. When reporting is enabled (Report == "true") and
// either the check is a bulk check (Bulk == "true") or its attempts have
// reached Retries, a STATE message describing the failure is handed to
// SendToBolo and the result of that call is returned. In every other case
// Fail returns nil.
func (self *Check) Fail(failure error) error {
	log.Error("Error running check \"%s\": %s", self.Name, failure.Error())

	// NOTE(review): 3 is presumably the UNKNOWN state -- confirm before
	// replacing the literal with the constant.
	self.rc = 3
	self.reschedule()

	// Nothing to submit unless reporting is switched on.
	if self.Report != "true" {
		return nil
	}
	// Non-bulk checks only report once their retries are used up.
	if self.Bulk != "true" && self.attempts < self.Retries {
		return nil
	}

	state := fmt.Sprintf("STATE %d %s:bmad:%s %d %s",
		time.Now().Unix(), cfg.Host, self.Name, self.rc, "failed to exec: "+failure.Error())
	return SendToBolo(state)
}
Пример #4
0
// LoadConfig loads the YAML config file named by cfg_file on top of
// default_config(), installs the result as the package-level cfg, and
// returns it. Config reloads are handled seamlessly: checks that also exist
// in the previously loaded config are passed to merge_checks() together
// with their previous definition.
//
// If the config sets Include_dir, every *.conf file in that directory is
// parsed as a YAML map of check name to check definition and merged into
// the config. Include files that cannot be read or parsed are skipped with
// a warning, and a check name that is already defined keeps its existing
// definition.
//
// Every check is then run through initialize_check(); checks that are empty
// or fail initialization are logged and dropped from the config.
//
// If the main config file cannot be read or parsed, the previously loaded
// config (nil if there is none) is returned together with the error, and
// the active config is left untouched.
func LoadConfig(cfg_file string) (*Config, error) {
	new_cfg := default_config()

	source, err := ioutil.ReadFile(cfg_file)
	if err != nil {
		return cfg, err
	}

	err = goyaml.Unmarshal(source, &new_cfg)
	if err != nil {
		return cfg, err
	}

	if new_cfg.Include_dir != "" {
		log.Debug("Loading auxillary configs from %s", new_cfg.Include_dir)

		// The main config may leave Checks nil (e.g. a missing or null
		// checks section); writing into a nil map would panic.
		if new_cfg.Checks == nil {
			new_cfg.Checks = map[string]*Check{}
		}

		files, err := filepath.Glob(filepath.Join(new_cfg.Include_dir, "*.conf"))
		if err != nil {
			log.Warn("Couldn't find include files: %s", err.Error())
		}
		// files is empty when Glob failed, so the loop is skipped.
		for _, file := range files {
			log.Debug("Loading auxillary config: %s", file)
			source, err := ioutil.ReadFile(file)
			if err != nil {
				log.Warn("Couldn't read %q: %s", file, err.Error())
				continue
			}

			checks := map[string]*Check{}
			err = goyaml.Unmarshal(source, &checks)
			if err != nil {
				log.Warn("Could not parse yaml from %q: %s", file, err.Error())
				continue
			}

			for name, check := range checks {
				// First definition wins; later duplicates are ignored.
				if _, exists := new_cfg.Checks[name]; exists {
					log.Warn("Check %q defined in multiple config files, ignoring definition in %s", name, file)
					continue
				}
				new_cfg.Checks[name] = check
			}
		}
	}

	// Deleting entries while ranging over a map is safe in Go.
	for name, check := range new_cfg.Checks {
		// A null definition in YAML decodes to a nil *Check.
		if check == nil {
			log.Error("Invalid check config for %s: %s (skipping)", name, "empty check definition")
			delete(new_cfg.Checks, name)
			continue
		}

		if err := initialize_check(name, check, new_cfg); err != nil {
			log.Error("Invalid check config for %s: %s (skipping)", name, err.Error())
			delete(new_cfg.Checks, name)
			continue
		}

		// On reload, hand the previous definition of this check to
		// merge_checks().
		if cfg != nil {
			if val, ok := cfg.Checks[check.Name]; ok {
				merge_checks(check, val)
			}
		}
		log.Debug("Check %s defined as %#v", check.Name, check)
	}

	cfg = new_cfg
	log.SetupLogging(cfg.Log)
	log.Debug("Config successfully loaded as: %#v", cfg)
	return cfg, nil
}