func (i *Initd) LookupService(serviceName string) (*ProcessStatus, error) { path := i.ctlPath + serviceName result, _ := util.FileExists(path) if !result { // service script does not exist in etc/init.d, not under // init.d control return nil, &ServiceError{i.Name(), serviceName, ErrServiceNotFound} } // First try to find the PID file with same name in /var/run. paths := []string{ i.varrunPath + serviceName + ".pid", i.varrunPath + serviceName + "/" + serviceName + ".pid", } for _, pidpath := range paths { st, err := i.readPidFile(pidpath) if err != nil { util.Info("Error processing PID file %s: %s", pidpath, err.Error()) continue } else if st != nil { return st, nil } else { util.Info("No such pidfile %s", pidpath) } } return &ProcessStatus{0, Down}, nil }
func detectInitd(root string) (InitSystem, error) { ctlpath := root + "etc/init.d/" result, err := util.FileExists(ctlpath) if err != nil { return nil, err } if !result { util.Debug("init.d not detected in " + ctlpath) return nil, nil } matches, err := filepath.Glob(ctlpath + "*") if err != nil { return nil, err } if !result { util.Debug("init.d not detected in " + ctlpath) return nil, nil } if len(matches) > 0 { util.Info("Detected init.d in " + ctlpath) return &Initd{ctlpath, root + "var/run/", pidForString}, nil } util.Info(ctlpath + " exists but appears to be empty") return nil, nil }
func (ps *processStorage) AddSource(name string, config map[string]string) (Source, error) { for _, x := range ps.daemonSpecific { if x.Name() == name { return x, nil } } builder := Sources[name] if builder == nil { return nil, nil } util.Info("Activating metrics for %s", name) src, err := builder(config) if err != nil { return nil, err } m, ok := src.(MandatorySource) if ok && m.Mandatory() { util.Debug("Registering all metrics for %s", name) descs := src.ValidMetrics() for _, d := range descs { if d.MetricType == Counter { ps.DeclareCounter(name, d.Name, nil, d.Display) } else { ps.DeclareGauge(name, d.Name, d.Display) } } } ps.daemonSpecific = append(ps.daemonSpecific, src) return src, nil }
func reload(i *Inspeqtor) { util.Info(Name + " reloading") newi, err := New(i.RootDir, i.SocketPath) if err != nil { util.Warn("Unable to reload: %s", err.Error()) return } err = newi.Parse() if err != nil { util.Warn("Unable to reload: %s", err.Error()) return } // we're reloading and newcopy will become the new // singleton. Pro hooks into this to reload its features too. for _, callback := range Reloaders { err := callback(i, newi) if err != nil { util.Warn("Unable to reload: %s", err.Error()) return } } // TODO proper reloading would not throw away the existing metric data // in i but defining new metrics can change the storage tree. Implement // deep metric tree ring buffer sync if possible in basicReloader? i.Shutdown() newi.Start() }
func main() { cli.SetupLogging() options := cli.ParseArguments() ins, err := inspeqtor.New(options.ConfigDirectory, options.SocketPath) if err != nil { log.Fatalln(err) } err = ins.Parse() if err != nil { log.Fatalln(err) } if options.TestConfig { util.Info("Configuration parsed ok.") os.Exit(0) } else if options.TestAlertRoutes { ins.TestAlertRoutes() } else { // Fire up the Inspeqtor singleton ins.Start() // Install the global signal handlers // This method never returns. inspeqtor.HandleSignals() } }
func check(jobs map[string]*Job) time.Duration { min := time.Hour for _, j := range jobs { now := time.Now() due := j.LastRun.Add(j.Interval) if due.After(now) && min > due.Sub(now) { // calculate the delay time until the next job check min = due.Sub(now) } if due.Before(now) && j.state == inspeqtor.Ok { util.Warn("Recurring job \"%s\" is overdue", j.JobName) j.state = inspeqtor.Triggered err := j.alert(JobOverdue) if err != nil { util.Warn(fmt.Sprintf("Error firing cron job alert: %s", err.Error())) } } if !due.Before(now) && j.state == inspeqtor.Triggered { util.Info("Recurring job \"%s\" has recovered", j.JobName) err := j.alert(JobRan) if err != nil { util.Warn(fmt.Sprintf("Error firing cron job alert: %s", err.Error())) } j.state = inspeqtor.Ok } } return min }
/* Resolve each defined service to its managing init system. Called only at startup, this is what maps services to init and fires ProcessDoesNotExist events. */ func (svc *Service) Resolve(mgrs []services.InitSystem) error { for _, sm := range mgrs { // TODO There's a bizarre race condition here. Figure out // why this is necessary. We shouldn't be multi-threaded yet. if sm == nil { continue } ps, err := sm.LookupService(svc.Name()) if err != nil { serr := err.(*services.ServiceError) if serr.Err == services.ErrServiceNotFound { util.Debug(sm.Name() + " doesn't have " + svc.Name()) continue } return err } util.Info("Found %s/%s with status %s", sm.Name(), svc.Name(), ps) svc.Manager = sm svc.Transition(ps, func(et EventType) { counters.Add("events", 1) err = svc.EventHandler.Trigger(&Event{et, svc, nil}) if err != nil { util.Warn("Error firing event: %s", err.Error()) } }) break } if svc.Manager == nil { return fmt.Errorf("Could not find service %s, did you misspell it?", svc.Name()) } return nil }
func startDeploy(i *Inspeqtor, args []string, resp io.Writer) { length := time.Duration(i.GlobalConfig.DeployLength) * time.Second i.SilenceUntil = time.Now().Add(length) counters.Get("deploy").(*expvar.Int).Set(1) util.Info("Starting deploy") io.WriteString(resp, "Starting deploy, now silenced\n") }
func triggeredHandler(rule *Rule, tripped bool) *Event { if !tripped { util.Info("%s[%s] recovered.", rule.EntityName(), rule.Metric()) rule.State = Recovered return nil } util.Debug("%s[%s] still triggered. Current: %.1f, Threshold: %.1f", rule.EntityName(), rule.Metric(), rule.CurrentValue, rule.Threshold) return nil }
func finishDeploy(i *Inspeqtor, args []string, resp io.Writer) { // silence for a cycle, give processes a little time to // settle before alerting again. We don't want a restart // during a deploy to send email for those events. i.SilenceUntil = time.Now().Add(time.Duration(i.GlobalConfig.CycleTime) * time.Second) counters.Get("deploy").(*expvar.Int).Set(0) util.Info("Finished deploy") io.WriteString(resp, "Finished deploy, volume turned to 11\n") }
func main() { inspeqtor.Name = "Inspeqtor Pro" cli.StartupInfo = func() { } cli.SetupLogging() options := cli.ParseArguments() _, err := verifyLicense(options.ConfigDirectory) if err != nil { util.Warn("Error verifying license file: %s", err) os.Exit(127) } ins, err := inspeqtor.New(options.ConfigDirectory, options.SocketPath) if err != nil { log.Fatalln(err) } err = ins.Parse() if err != nil { log.Fatalln(err) } err = bootstrapJobs(ins, options.ConfigDirectory) if err != nil { log.Fatalln(err) } err = bootstrapStatsd(ins, options.ConfigDirectory) if err != nil { log.Fatalln(err) } err = expose.Bootstrap(ins) if err != nil { log.Fatalln(err) } if options.TestConfig { util.Info("Configuration parsed ok.") os.Exit(0) } else if options.TestAlertRoutes { ins.TestAlertRoutes() } else { ins.Start() inspeqtor.HandleSignals() } }
func Parse(global *inspeqtor.ConfigFile, confDir string) (map[string]*Job, error) { inspeqtor.CommandHandlers["job_done"] = jobDone parsedJobs, err := parseJobs(global, confDir) if err != nil { return nil, err } if len(parsedJobs) == 0 { return nil, nil } jobs = parsedJobs util.Info("Watching for %d recurring jobs", len(parsedJobs)) return parsedJobs, nil }
func ParseGlobal(rootDir string) (*ConfigFile, error) { path := rootDir + "/inspeqtor.conf" exists, err := util.FileExists(path) if err != nil { return nil, err } if exists { util.Debug("Parsing " + path) data, err := ioutil.ReadFile(path) if err != nil { return nil, err } s := lexer.NewLexer([]byte(data)) p := parser.NewParser() obj, err := p.Parse(s) if err != nil { return nil, err } ast := obj.(ast.Config) config := ConfigFile{Defaults, map[string]*AlertRoute{}} config.Variables = ast.Variables if val, has := ast.Variables["log_level"]; has { util.SetLogLevel(val) } parseValue(ast, &config.CycleTime, "cycle_time", 15) parseValue(ast, &config.DeployLength, "deploy_length", 300) parseValue(ast, &config.ExposePort, "expose_port", 4677) for _, v := range ast.Routes { ar, err := ValidateChannel(v.Name, v.Channel, v.Config) if err != nil { return nil, err } if _, ok := config.AlertRoutes[v.Name]; ok { return nil, fmt.Errorf("Duplicate alert config for '%s'", v.Name) } config.AlertRoutes[v.Name] = ar } return &config, nil } util.Info("No configuration file found at " + rootDir + "/inspector.conf") return &ConfigFile{Defaults, nil}, nil }
func detectLaunchd(rootDir string) (InitSystem, error) { if !util.Darwin() { return nil, nil } util.Info("Detected OSX, using launchd") usr, err := user.Current() if err != nil { return nil, err } dir := usr.HomeDir paths := []string{ dir + "/Library/LaunchAgents", "/Library/LaunchAgents", "/Library/LaunchDaemons", "/System/Library/LaunchDaemons", } return &Launchd{paths}, nil }
func (i *Inspeqtor) Start() { util.Debug("Starting command socket") err := i.openSocket(i.SocketPath) if err != nil { util.Warn("Could not create Unix socket: %s", err.Error()) exit(i) } go func() { for { if !i.safelyAccept() { util.Debug("Shutting down command socket") return } } }() // if expose_port is 0, disable the feature altogether if i.GlobalConfig.ExposePort != 0 { sock, err := net.Listen("tcp", fmt.Sprintf("localhost:%d", i.GlobalConfig.ExposePort)) if err != nil { util.Warn("Could not listen on port %d: %s", i.GlobalConfig.ExposePort, err.Error()) exit(i) } i.Expose = sock go func() { // TODO How do we error handling here? util.Info("Expose now available at port %d", i.GlobalConfig.ExposePort) err := http.Serve(i.Expose, nil) // Don't log an "error" when we shut down normally and close the socket if err != nil && !strings.Contains(err.Error(), "use of closed network") { util.Warn("HTTP server error: %s", err.Error()) } }() } util.Debug("Starting main run loop") go i.runLoop() Singleton = i }
func statsdReload(_ *inspeqtor.Inspeqtor, newi *inspeqtor.Inspeqtor) error { val, ok := newi.GlobalConfig.Variables["statsd_location"] if !ok { util.Debug("No statsd_location configured, skipping...") return nil } util.Info("Pushing metrics to statsd at %s", val) conn, err := statsd.Dial(val) if err != nil { return err } newi.Listen("cycleComplete", func(ins *inspeqtor.Inspeqtor) error { return statsd.Export(conn, ins) }) newi.Listen("shutdown", func(ins *inspeqtor.Inspeqtor) error { return conn.Close() }) return nil }
func (hs *hostStorage) collectCPU() error { ok, err := util.FileExists(hs.path + "/stat") if err != nil { return err } if ok { contents, err := ioutil.ReadFile(hs.path + "/stat") if err != nil { return err } lines := strings.Split(string(contents), "\n") line := lines[0] fields := strings.Fields(line) user, _ := strconv.ParseInt(fields[1], 10, 64) nice, _ := strconv.ParseInt(fields[2], 10, 64) system, _ := strconv.ParseInt(fields[3], 10, 64) iowait, _ := strconv.ParseInt(fields[5], 10, 64) irq, _ := strconv.ParseInt(fields[6], 10, 64) softIrq, _ := strconv.ParseInt(fields[7], 10, 64) steal, _ := strconv.ParseInt(fields[8], 10, 64) total := user + nice + system + iowait + irq + softIrq + steal // These are the five I can envision writing rules against. // Open an issue if you want access to the other values. hs.Save("cpu", "", float64(total)) hs.Save("cpu", "user", float64(user)) hs.Save("cpu", "system", float64(system)) hs.Save("cpu", "iowait", float64(iowait)) hs.Save("cpu", "steal", float64(steal)) } else { // TODO util.Info("Cannot collect host CPU metrics, not implemented on this platform") } return nil }
func (rs *nginxSource) runCli() (metrics.Map, error) { sout, err := rs.client(rs.Hostname, rs.Port, rs.Endpoint) if err != nil { return nil, err } if sout[0] != 0x41 { // first char should be 'A' util.Warn(string(sout)) return nil, errors.New("Unknown nginx status output") } values := map[string]float64{} results := digits.FindAllStringSubmatch(string(sout), 7) if results == nil || len(results) != 7 { return nil, errors.New("Unknown nginx input") } for idx, met := range nginxMetrics { if !rs.metrics[met.Name] { continue } val, err := strconv.ParseInt(results[idx][0], 10, 64) if err != nil { return nil, err } values[met.Name] = float64(val) } if len(rs.metrics) > len(values) { for k := range rs.metrics { if _, ok := values[k]; !ok { util.Info("Could not find metric %s(%s), did you spell it right?", rs.Name(), k) } } } return values, nil }
func (i *Inspeqtor) acceptCommand() bool { c, err := i.Socket.Accept() if err != nil { select { case <-i.Stopping: // we're stopping or reloading, no big deal... default: util.Warn("%v", err) } return false } defer c.Close() c.SetDeadline(time.Now().Add(2 * time.Second)) reader := bufio.NewReader(c) line, err := reader.ReadString('\n') if err != nil { util.Info("Did not receive command line in time: %s", err.Error()) return true } fields := strings.Fields(line) if len(fields) == 0 { showHelp(i, []string{}, c) return true } funk := CommandHandlers[fields[0]] if funk == nil { util.Warn("Unknown command: %s", strings.TrimSpace(line)) io.WriteString(c, "Unknown command: "+line) return true } funk(i, fields[1:], c) return true }
func (i *Inspeqtor) TestAlertRoutes() int { bad := 0 util.Info("Testing alert routes") for _, route := range i.GlobalConfig.AlertRoutes { nm := route.Name if nm == "" { nm = "default" } util.Debug("Creating notification for %s/%s", route.Channel, nm) notifier, err := Actions["alert"](i.Host, route) if err != nil { bad++ util.Warn("Error creating %s/%s route: %s", route.Channel, nm, err.Error()) continue } util.Debug("Triggering notification for %s/%s", route.Channel, nm) err = notifier.Trigger(&Event{RuleFailed, i.Host, i.Host.Rules()[0]}) if err != nil { bad++ util.Warn("Error firing %s/%s route: %s", route.Channel, nm, err.Error()) } } return bad }
func (rs *memcachedSource) runCli(funk executor) (metrics.Map, error) { sout, err := funk("nc", []string{rs.Hostname, rs.Port}, []byte("stats\n")) if err != nil { return nil, err } lines, err := util.ReadLines(sout) if err != nil { return nil, err } values := map[string]float64{} for _, line := range lines { if line == "" || line[0] != 'S' { continue } parts := strings.Fields(line) if rs.metrics[parts[1]] { val, err := strconv.ParseFloat(parts[2], 64) if err != nil { return nil, errors.New("Invalid metric input for '" + line + "': " + err.Error()) } values[parts[1]] = val } } if len(rs.metrics) > len(values) { for k := range rs.metrics { if _, ok := values[k]; !ok { util.Info("Could not find metric %s(%s), did you spell it right?", rs.Name(), k) } } } return values, nil }
func detectUpstart(path string) (InitSystem, error) { result, err := util.FileExists(path) if err != nil { return nil, err } if !result { util.Debug("upstart not detected, no " + path) return nil, nil } matches, err := filepath.Glob(path + "/*.conf") if err != nil { return nil, err } if len(matches) > 0 { util.Info("Detected upstart in " + path) return &Upstart{path, nil}, nil } util.Debug("upstart not detected, empty " + path) return nil, nil }
func detectSystemd(path string) (InitSystem, error) { result, err := util.FileExists(path) if err != nil { return nil, err } if !result { util.Debug("systemd not detected, no " + path) return nil, nil } matches, err := filepath.Glob(path + "/*.conf") if err != nil { return nil, err } if len(matches) > 0 { util.Info("Detected systemd in " + path) return &Systemd{path, "", ""}, nil } util.Debug("systemd not detected, empty " + path) return nil, nil }
func exit(i *Inspeqtor) { util.Info(Name + " exiting") i.Shutdown() os.Exit(0) }
/* Called for each service each cycle, in parallel. This method must be thread-safe. Since this method executes in a goroutine, errors must be handled/logged here and not just returned. Each cycle we need to: 1. verify service is Up and running. 2. capture process metrics 3. run rules 4. trigger any necessary actions */ func (svc *Service) Collect(silenced bool, completeCallback func(Checkable)) { defer completeCallback(svc) if svc.Manager == nil { // Couldn't resolve it when we started up so we can't collect it. return } if svc.Process.Status != services.Up { status, err := svc.Manager.LookupService(svc.Name()) if err != nil { util.Warn("%s", err) } else { svc.Transition(status, func(et EventType) { if !silenced { counters.Add("events", 1) err = svc.EventHandler.Trigger(&Event{et, svc, nil}) if err != nil { util.Warn("Error firing event: %s", err.Error()) } } }) } } if svc.Process.Status == services.Up { merr := svc.Metrics().Collect(svc.Process.Pid) if merr != nil { err := syscall.Kill(svc.Process.Pid, syscall.Signal(0)) if err != nil { // Process disappeared in the last cycle, mark it as Down. util.Info("Service %s with process %d does not exist: %s", svc.Name(), svc.Process.Pid, err) svc.Transition(services.WithStatus(0, services.Down), func(et EventType) { if !silenced { counters.Add("events", 1) err = svc.EventHandler.Trigger(&Event{et, svc, nil}) if err != nil { util.Warn("Error firing event: %s", err.Error()) } } }) // Immediately try to find the replacement PID so we don't have // to wait for another cycle to mark it as Up. status, err := svc.Manager.LookupService(svc.Name()) if err != nil { util.Warn("%s", err) } else { svc.Transition(status, func(et EventType) { if !silenced { counters.Add("events", 1) err = svc.EventHandler.Trigger(&Event{et, svc, nil}) if err != nil { util.Warn("Error firing event: %s", err.Error()) } } }) } } else { util.Warn("Error capturing metrics for process %d: %s", svc.Process.Pid, merr) } } } }