// AddMeta adds a metadata entry to memory, which is queued for later sending.
func AddMeta(metric string, tags opentsdb.TagSet, name string, value interface{}, setHost bool) {
	if tags == nil {
		tags = make(opentsdb.TagSet)
	}
	if _, present := tags["host"]; setHost && !present {
		tags["host"] = util.Hostname
	}
	if err := tags.Clean(); err != nil {
		slog.Error(err)
		return
	}
	ts := tags.Tags()
	metalock.Lock()
	defer metalock.Unlock()
	prev, present := metadata[Metakey{metric, ts, name}]
	if present && !reflect.DeepEqual(prev, value) {
		slog.Infof("metadata changed for %s/%s/%s: %v to %v", metric, ts, name, prev, value)
		go sendMetadata([]Metasend{{
			Metric: metric,
			Tags:   tags,
			Name:   name,
			Value:  value,
		}})
	} else if metadebug {
		slog.Infof("AddMeta for %s/%s/%s: %v", metric, ts, name, value)
	}
	metadata[Metakey{metric, ts, name}] = value
}
// MakeSaveCommandHook takes a command name and returns a SaveHook that runs that command on save,
// passing files, user, message, args... as arguments to the command. The returned SaveHook returns
// an error if the command fails to execute or exits with a non-zero status.
func MakeSaveCommandHook(cmdName string) (f SaveHook, err error) {
	_, err = exec.LookPath(cmdName)
	if err != nil {
		return f, fmt.Errorf("command %v not found, failed to create save hook: %v", cmdName, err)
	}
	f = func(files, user, message string, args ...string) error {
		cArgs := []string{files, user, message}
		cArgs = append(cArgs, args...)
		slog.Infof("executing save hook %v\n", cmdName)
		c := exec.Command(cmdName, cArgs...)
		var cOut bytes.Buffer
		var cErr bytes.Buffer
		c.Stdout = &cOut
		c.Stderr = &cErr
		err := c.Start()
		if err != nil {
			return err
		}
		err = c.Wait()
		if err != nil {
			slog.Warning(cErr.String())
			return err
		}
		slog.Infof("save hook output: %v\n", cOut.String())
		return nil
	}
	return
}
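A brief usage sketch for the hook above; it is illustrative only and not part of the original source. It assumes the SaveHook signature used in the closure (files, user, message string, plus variadic args), and the "git" command and file names are placeholders.

// saveWithHook is a hypothetical caller showing how the returned hook is invoked.
func saveWithHook() error {
	hook, err := MakeSaveCommandHook("git") // "git" is a placeholder command name
	if err != nil {
		return err
	}
	// Arguments are passed as files, user, message, then any extra args for the command.
	return hook("rules.conf", "alice", "raise cpu threshold")
}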
func (s *Schedule) CheckAlert(T miniprofiler.Timer, r *RunHistory, a *conf.Alert) {
	slog.Infof("check alert %v start", a.Name)
	start := utcNow()
	for _, ak := range s.findUnknownAlerts(r.Start, a.Name) {
		r.Events[ak] = &models.Event{Status: models.StUnknown}
	}
	var warns, crits models.AlertKeys
	d, err := s.executeExpr(T, r, a, a.Depends)
	var deps expr.ResultSlice
	if err == nil {
		deps = filterDependencyResults(d)
		crits, err = s.CheckExpr(T, r, a, a.Crit, models.StCritical, nil)
		if err == nil {
			warns, err = s.CheckExpr(T, r, a, a.Warn, models.StWarning, crits)
		}
	}
	unevalCount, unknownCount := markDependenciesUnevaluated(r.Events, deps, a.Name)
	if err != nil {
		slog.Errorf("Error checking alert %s: %s", a.Name, err.Error())
		removeUnknownEvents(r.Events, a.Name)
		s.markAlertError(a.Name, err)
	} else {
		s.markAlertSuccessful(a.Name)
	}
	collect.Put("check.duration", opentsdb.TagSet{"name": a.Name}, time.Since(start).Seconds())
	slog.Infof("check alert %v done (%s): %v crits, %v warns, %v unevaluated, %v unknown", a.Name, time.Since(start), len(crits), len(warns), unevalCount, unknownCount)
}
func runService(name string, isDebug bool) {
	errFix := fixEventMessageFile(name) // Temp fix. Remove after a few weeks.
	if errFix != nil {
		slog.Errorf("%s fixEventMessageFile failed: %v", name, errFix)
		return
	}
	if isDebug {
		slog.SetEventLog(debug.New(name), 1)
	} else {
		elog, err := eventlog.Open(name)
		if err != nil {
			return
		}
		slog.SetEventLog(elog, 1)
		defer elog.Close()
	}
	slog.Infof("starting %s service version %v (%v)", name, version.Version, version.VersionSHA)
	run := svc.Run
	if isDebug {
		run = debug.Run
	}
	err := run(name, &s{})
	if err != nil {
		slog.Errorf("%s service failed: %v", name, err)
		return
	}
	slog.Infof("%s service stopped", name)
	os.Exit(0)
}
func LogComputations(r *Results) {
	slice := r.Results
	for _, result := range slice {
		slog.Infof("Group tags %v\n", result.Group)
		for _, z := range result.Computations {
			slog.Infof("%v = %v \n", z.Text, z.Value)
		}
	}
}
func (s *Schedule) executeTemplates(state *State, event *Event, a *conf.Alert, r *RunHistory) {
	state.Subject = ""
	state.Body = ""
	state.EmailBody = nil
	state.EmailSubject = nil
	state.Attachments = nil
	if event.Status != StUnknown {
		metric := "template.render"
		//Render subject
		endTiming := collect.StartTimer(metric, opentsdb.TagSet{"alert": a.Name, "type": "subject"})
		subject, serr := s.ExecuteSubject(r, a, state, false)
		if serr != nil {
			slog.Infof("%s: %v", state.AlertKey(), serr)
		}
		endTiming()
		//Render body
		endTiming = collect.StartTimer(metric, opentsdb.TagSet{"alert": a.Name, "type": "body"})
		body, _, berr := s.ExecuteBody(r, a, state, false)
		if berr != nil {
			slog.Infof("%s: %v", state.AlertKey(), berr)
		}
		endTiming()
		//Render email body
		endTiming = collect.StartTimer(metric, opentsdb.TagSet{"alert": a.Name, "type": "emailbody"})
		emailbody, attachments, merr := s.ExecuteBody(r, a, state, true)
		if merr != nil {
			slog.Infof("%s: %v", state.AlertKey(), merr)
		}
		endTiming()
		//Render email subject
		endTiming = collect.StartTimer(metric, opentsdb.TagSet{"alert": a.Name, "type": "emailsubject"})
		emailsubject, eserr := s.ExecuteSubject(r, a, state, true)
		endTiming()
		if serr != nil || berr != nil || merr != nil || eserr != nil {
			var err error
			endTiming = collect.StartTimer(metric, opentsdb.TagSet{"alert": a.Name, "type": "bad"})
			subject, body, err = s.ExecuteBadTemplate(serr, berr, r, a, state)
			endTiming()
			if err != nil {
				subject = []byte(fmt.Sprintf("unable to create template error notification: %v", err))
			}
			emailbody = body
			attachments = nil
		}
		state.Subject = string(subject)
		state.Body = string(body)
		state.EmailBody = emailbody
		state.EmailSubject = emailsubject
		state.Attachments = attachments
	}
}
func (s *Schedule) CheckAlert(T miniprofiler.Timer, r *RunHistory, a *conf.Alert) (cancelled bool) {
	slog.Infof("check alert %v start", a.Name)
	start := utcNow()
	for _, ak := range s.findUnknownAlerts(r.Start, a.Name) {
		r.Events[ak] = &models.Event{Status: models.StUnknown}
	}
	var warns, crits models.AlertKeys
	type res struct {
		results *expr.Results
		error   error
	}
	// Buffered channel so the goroutine that runs executeExpr won't leak if the check is
	// cancelled by the closing of the schedule.
	rc := make(chan res, 1)
	var d *expr.Results
	var err error
	go func() {
		d, err := s.executeExpr(T, r, a, a.Depends)
		rc <- res{d, err} // this would hang forever if the channel weren't buffered, since nothing will ever receive from rc
	}()
	select {
	case res := <-rc:
		d = res.results
		err = res.error
	// If the schedule closes before the expression has finished executing, we abandon the
	// execution of the expression.
	case <-s.runnerContext.Done():
		return true
	}
	var deps expr.ResultSlice
	if err == nil {
		deps = filterDependencyResults(d)
		crits, err, cancelled = s.CheckExpr(T, r, a, a.Crit, models.StCritical, nil)
		if err == nil && !cancelled {
			warns, err, cancelled = s.CheckExpr(T, r, a, a.Warn, models.StWarning, crits)
		}
	}
	if cancelled {
		return true
	}
	unevalCount, unknownCount := markDependenciesUnevaluated(r.Events, deps, a.Name)
	if err != nil {
		slog.Errorf("Error checking alert %s: %s", a.Name, err.Error())
		removeUnknownEvents(r.Events, a.Name)
		s.markAlertError(a.Name, err)
	} else {
		s.markAlertSuccessful(a.Name)
	}
	collect.Put("check.duration", opentsdb.TagSet{"name": a.Name}, time.Since(start).Seconds())
	slog.Infof("check alert %v done (%s): %v crits, %v warns, %v unevaluated, %v unknown", a.Name, time.Since(start), len(crits), len(warns), unevalCount, unknownCount)
	return false
}
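The buffered-channel cancellation pattern above can be hard to see inside the full check. Below is a stripped-down, illustrative sketch of the same idea; the names cancellableRun and work are generic, not from the source, and it assumes the standard context package.

// cancellableRun runs work in a goroutine, delivers its result on a buffered
// channel so the goroutine can always finish (no leak), and abandons the
// result if the context is cancelled first.
func cancellableRun(ctx context.Context, work func() (int, error)) (int, error, bool) {
	type result struct {
		v   int
		err error
	}
	rc := make(chan result, 1) // buffered: the send never blocks, even if nobody receives
	go func() {
		v, err := work()
		rc <- result{v, err}
	}()
	select {
	case r := <-rc:
		return r.v, r.err, false
	case <-ctx.Done():
		return 0, nil, true // cancelled; the goroutine still completes and exits on its own
	}
}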
// errRecover is the handler that turns panics into returns from the top
// level of Parse.
func errRecover(errp *error) {
	e := recover()
	if e != nil {
		switch err := e.(type) {
		case runtime.Error:
			slog.Infof("%s: %s", e, debug.Stack())
			panic(e)
		case error:
			*errp = err
		default:
			slog.Infof("%s: %s", e, debug.Stack())
			panic(e)
		}
	}
}
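errRecover is meant to be installed with defer at the top of a parsing function. The sketch below is hypothetical (parseExample is not the real parser) and assumes the standard errors package; it shows an error panic becoming an ordinary return value, while runtime errors would be logged and re-panicked.

// parseExample converts error panics raised in its body into a returned error
// by deferring errRecover with the address of its named return value.
func parseExample(input string) (err error) {
	defer errRecover(&err)
	if input == "" {
		// Recovered by errRecover and returned as err.
		panic(errors.New("empty input"))
	}
	return nil
}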
func init() {
	err := slog.SetSyslog("scollector")
	if err != nil {
		slog.Error(err)
	}
	slog.Infof("starting %s", version.GetVersionInfo("scollector"))
}
func (s *Schedule) save() {
	if s.db == nil {
		return
	}
	s.Lock("Save")
	store := map[string]interface{}{
		dbMetric:        s.Search.Read.Metric,
		dbTagk:          s.Search.Read.Tagk,
		dbTagv:          s.Search.Read.Tagv,
		dbMetricTags:    s.Search.Read.MetricTags,
		dbNotifications: s.Notifications,
		dbSilence:       s.Silence,
		dbStatus:        s.status,
		dbMetadata:      s.Metadata,
		dbIncidents:     s.Incidents,
	}
	tostore := make(map[string][]byte)
	for name, data := range store {
		f := new(bytes.Buffer)
		gz := gzip.NewWriter(f)
		cw := &counterWriter{w: gz}
		enc := gob.NewEncoder(cw)
		if err := enc.Encode(data); err != nil {
			slog.Errorf("error saving %s: %v", name, err)
			s.Unlock()
			return
		}
		if err := gz.Flush(); err != nil {
			slog.Errorf("gzip flush error saving %s: %v", name, err)
		}
		if err := gz.Close(); err != nil {
			slog.Errorf("gzip close error saving %s: %v", name, err)
		}
		tostore[name] = f.Bytes()
		slog.Infof("wrote %s: %v", name, conf.ByteSize(cw.written))
		collect.Put("statefile.size", opentsdb.TagSet{"object": name}, cw.written)
	}
	s.Unlock()
	err := s.db.Update(func(tx *bolt.Tx) error {
		b, err := tx.CreateBucketIfNotExists([]byte(dbBucket))
		if err != nil {
			return err
		}
		for name, data := range tostore {
			if err := b.Put([]byte(name), data); err != nil {
				return err
			}
		}
		return nil
	})
	if err != nil {
		slog.Errorf("save db update error: %v", err)
		return
	}
	fi, err := os.Stat(s.Conf.StateFile)
	if err == nil {
		collect.Put("statefile.size", opentsdb.TagSet{"object": "total"}, fi.Size())
	}
	slog.Infoln("save to db complete")
}
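The encode path above (gob through a gzip writer into an in-memory buffer) can be exercised in isolation. A minimal sketch using only the standard library; the helper name is hypothetical and the value must be gob-encodable.

// encodeGzippedGob gob-encodes v through a gzip writer into a byte slice.
func encodeGzippedGob(v interface{}) ([]byte, error) {
	var buf bytes.Buffer
	gz := gzip.NewWriter(&buf)
	if err := gob.NewEncoder(gz).Encode(v); err != nil {
		return nil, err
	}
	// Close flushes any remaining compressed data to the buffer.
	if err := gz.Close(); err != nil {
		return nil, err
	}
	return buf.Bytes(), nil
}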
// Command executes the named program with the given arguments. If it does not
// exit within timeout, it is sent SIGINT (if supported by Go). After
// another timeout, it is killed.
func Command(timeout time.Duration, stdin io.Reader, name string, arg ...string) (io.Reader, error) {
	if _, err := exec.LookPath(name); err != nil {
		return nil, ErrPath
	}
	if Debug {
		slog.Infof("executing command: %v %v", name, arg)
	}
	c := exec.Command(name, arg...)
	var b bytes.Buffer
	c.Stdout = &b
	c.Stdin = stdin
	done := make(chan error, 1)
	go func() {
		done <- c.Run()
	}()
	interrupt := time.After(timeout)
	kill := time.After(timeout * 2)
	for {
		select {
		case err := <-done:
			return &b, err
		case <-interrupt:
			c.Process.Signal(os.Interrupt)
		case <-kill:
			// todo: figure out if this can leave the done chan hanging open
			c.Process.Kill()
			return nil, ErrTimeout
		}
	}
}
// Command executes the named program with the given arguments. If it does not
// exit within timeout, it is sent SIGINT (if supported by Go). After
// another timeout, it is killed.
func Command(timeout time.Duration, stdin io.Reader, name string, arg ...string) (io.Reader, error) {
	if _, err := exec.LookPath(name); err != nil {
		return nil, ErrPath
	}
	if Debug {
		slog.Infof("executing command: %v %v", name, arg)
	}
	c := exec.Command(name, arg...)
	b := &bytes.Buffer{}
	c.Stdout = b
	c.Stdin = stdin
	if err := c.Start(); err != nil {
		return nil, err
	}
	timedOut := false
	intTimer := time.AfterFunc(timeout, func() {
		slog.Errorf("Process taking too long. Interrupting: %s %s", name, strings.Join(arg, " "))
		c.Process.Signal(os.Interrupt)
		timedOut = true
	})
	killTimer := time.AfterFunc(timeout*2, func() {
		slog.Errorf("Process taking too long. Killing: %s %s", name, strings.Join(arg, " "))
		c.Process.Signal(os.Kill)
		timedOut = true
	})
	err := c.Wait()
	intTimer.Stop()
	killTimer.Stop()
	if timedOut {
		return nil, ErrTimeout
	}
	return b, err
}
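A short usage sketch for Command; the command, arguments, and timeout here are hypothetical, and it assumes io/ioutil and time are imported.

// runUptime runs a command with a five-second timeout and logs its stdout.
func runUptime() error {
	out, err := Command(5*time.Second, nil, "uptime")
	if err != nil {
		return err // may be ErrPath, ErrTimeout, or the command's own error
	}
	b, err := ioutil.ReadAll(out)
	if err != nil {
		return err
	}
	slog.Info(string(b))
	return nil
}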
func (s *Schedule) checkAlert(a *conf.Alert) {
	checkTime := s.ctx.runTime
	checkCache := s.ctx.checkCache
	rh := s.NewRunHistory(checkTime, checkCache)
	s.CheckAlert(nil, rh, a)
	start := utcNow()
	s.RunHistory(rh)
	slog.Infof("runHistory on %s took %v\n", a.Name, time.Since(start))
}
func sendBatch(batch []*opentsdb.DataPoint) {
	if Print {
		for _, d := range batch {
			j, err := d.MarshalJSON()
			if err != nil {
				slog.Error(err)
			}
			slog.Info(string(j))
		}
		recordSent(len(batch))
		return
	}
	now := time.Now()
	resp, err := SendDataPoints(batch, tsdbURLs[currentTsdbURL])
	if err == nil {
		defer resp.Body.Close()
	}
	d := time.Since(now).Nanoseconds() / 1e6
	Sample("collect.post.duration", Tags, float64(d))
	Add("collect.post.total_duration", Tags, d)
	Add("collect.post.count", Tags, 1)
	// Some problem with connecting to the server; retry later.
	if err != nil || (resp.StatusCode != http.StatusNoContent && resp.StatusCode != http.StatusOK) {
		if err != nil {
			Add("collect.post.error", Tags, 1)
			slog.Error(err)
			// Switch endpoint if possible
			currentTsdbURL = (currentTsdbURL + 1) % len(tsdbURLs)
		} else if resp.StatusCode != http.StatusNoContent && resp.StatusCode != http.StatusOK {
			Add("collect.post.bad_status", Tags, 1)
			slog.Errorln(resp.Status)
			body, err := ioutil.ReadAll(resp.Body)
			if err != nil {
				slog.Error(err)
			}
			if len(body) > 0 {
				slog.Error(string(body))
			}
			// Switch endpoint if possible
			currentTsdbURL = (currentTsdbURL + 1) % len(tsdbURLs)
		}
		restored := 0
		for _, msg := range batch {
			restored++
			tchan <- msg
		}
		d := time.Second * 5
		Add("collect.post.restore", Tags, int64(restored))
		slog.Infof("restored %d, sleeping %s", restored, d)
		time.Sleep(d)
		return
	}
	recordSent(len(batch))
}
func (n *Notification) DoGet(ak string) {
	resp, err := http.Get(n.Get.String())
	if err != nil {
		slog.Error(err)
		return
	}
	if resp.StatusCode >= 300 {
		slog.Error("bad response on notification get:", resp.Status)
	} else {
		slog.Infof("get notification successful for alert %s. Response code %d.", ak, resp.StatusCode)
	}
}
func (s *Schedule) checkAlert(a *conf.Alert) {
	checkTime := s.ctx.runTime
	checkCache := s.ctx.checkCache
	rh := s.NewRunHistory(checkTime, checkCache)
	// s.CheckAlert will return early if the schedule has been closed
	cancelled := s.CheckAlert(nil, rh, a)
	if cancelled {
		// Don't run RunHistory for the alert if expression evaluation has been cancelled
		return
	}
	start := utcNow()
	s.RunHistory(rh)
	slog.Infof("runHistory on %s took %v\n", a.Name, time.Since(start))
}
func runService(name string, isDebug bool) {
	if isDebug {
		slog.SetEventLog(debug.New(name), 1)
	} else {
		elog, err := eventlog.Open(name)
		if err != nil {
			return
		}
		slog.SetEventLog(elog, 1)
		defer elog.Close()
	}
	slog.Infof("starting service %s%s", name, version.GetVersionInfo(""))
	run := svc.Run
	if isDebug {
		run = debug.Run
	}
	err := run(name, &s{})
	if err != nil {
		slog.Errorf("%s service failed: %v", name, err)
		return
	}
	slog.Infof("%s service stopped", name)
	os.Exit(0)
}
// ReadCommandTimeout is the same as ReadCommand with a specifiable timeout.
// It can also take a []byte as input (useful for chaining commands).
func ReadCommandTimeout(timeout time.Duration, line func(string) error, stdin io.Reader, name string, arg ...string) error {
	b, err := Command(timeout, stdin, name, arg...)
	if err != nil {
		return err
	}
	scanner := bufio.NewScanner(b)
	for scanner.Scan() {
		if err := line(scanner.Text()); err != nil {
			return err
		}
	}
	if err := scanner.Err(); err != nil {
		slog.Infof("%v: %v\n", name, err)
	}
	return nil
}
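A usage sketch for the line-callback style of ReadCommandTimeout; the command and timeout are hypothetical.

// countOutputLines counts the lines a command prints within a ten-second timeout.
func countOutputLines() (int, error) {
	n := 0
	err := ReadCommandTimeout(10*time.Second, func(line string) error {
		n++ // returning a non-nil error here would stop the scan early
		return nil
	}, nil, "ps", "aux")
	return n, err
}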
// Locks the queue and sends all datapoints. Intended to be used as scollector exits.
func Flush() {
	qlock.Lock()
	for len(queue) > 0 {
		i := len(queue)
		if i > BatchSize {
			i = BatchSize
		}
		sending := queue[:i]
		queue = queue[i:]
		if Debug {
			slog.Infof("sending: %d, remaining: %d", i, len(queue))
		}
		sendBatch(sending)
	}
	qlock.Unlock()
}
func send() {
	for {
		qlock.Lock()
		if i := len(queue); i > 0 {
			if i > BatchSize {
				i = BatchSize
			}
			sending := queue[:i]
			queue = queue[i:]
			if Debug {
				slog.Infof("sending: %d, remaining: %d", i, len(queue))
			}
			qlock.Unlock()
			sendBatch(sending)
		} else {
			qlock.Unlock()
			time.Sleep(time.Second)
		}
	}
}
func (n *Notification) DoEmail(subject, body []byte, c *Conf, ak string, attachments ...*Attachment) {
	e := email.NewEmail()
	e.From = c.EmailFrom
	for _, a := range n.Email {
		e.To = append(e.To, a.Address)
	}
	e.Subject = string(subject)
	e.HTML = body
	for _, a := range attachments {
		e.Attach(bytes.NewBuffer(a.Data), a.Filename, a.ContentType)
	}
	e.Headers.Add("X-Bosun-Server", util.Hostname)
	if err := Send(e, c.SMTPHost, c.SMTPUsername, c.SMTPPassword); err != nil {
		collect.Add("email.sent_failed", nil, 1)
		slog.Errorf("failed to send alert %v to %v %v\n", ak, e.To, err)
		return
	}
	collect.Add("email.sent", nil, 1)
	slog.Infof("relayed alert %v to %v successfully. Subject: %d bytes. Body: %d bytes.", ak, e.To, len(subject), len(body))
}
func send() {
	for {
		qlock.Lock()
		if i := len(queue); i > 0 {
			if i > BatchSize {
				i = BatchSize
			}
			sending := queue[:i]
			queue = queue[i:]
			if Debug {
				slog.Infof("sending: %d, remaining: %d", i, len(queue))
			}
			qlock.Unlock()
			Sample("collect.post.batchsize", Tags, float64(len(sending)))
			sendBatch(sending)
		} else {
			qlock.Unlock()
			time.Sleep(time.Second)
		}
	}
}
func (n *Notification) DoPost(payload []byte, ak string) {
	if n.Body != nil {
		buf := new(bytes.Buffer)
		if err := n.Body.Execute(buf, string(payload)); err != nil {
			slog.Errorln(err)
			return
		}
		payload = buf.Bytes()
	}
	resp, err := http.Post(n.Post.String(), n.ContentType, bytes.NewBuffer(payload))
	if resp != nil && resp.Body != nil {
		defer resp.Body.Close()
	}
	if err != nil {
		slog.Error(err)
		return
	}
	if resp.StatusCode >= 300 {
		slog.Errorln("bad response on notification post:", resp.Status)
	} else {
		slog.Infof("post notification successful for alert %s. Response code %d.", ak, resp.StatusCode)
	}
}
func (s *Schedule) sendNotifications(silenced SilenceTester) {
	if s.quiet {
		slog.Infoln("quiet mode prevented", len(s.pendingNotifications), "notifications")
		return
	}
	for n, states := range s.pendingNotifications {
		for _, st := range states {
			ak := st.AlertKey
			alert := s.RuleConf.GetAlert(ak.Name())
			if alert == nil {
				continue
			}
			silenced := silenced(ak) != nil
			if st.CurrentStatus == models.StUnknown {
				if silenced {
					slog.Infoln("silencing unknown", ak)
					continue
				}
				s.pendingUnknowns[n] = append(s.pendingUnknowns[n], st)
			} else if silenced {
				slog.Infof("silencing %s", ak)
				continue
			} else if !alert.Log && (!st.Open || !st.NeedAck) {
				slog.Errorf("Cannot notify acked or closed alert %s. Clearing.", ak)
				if err := s.DataAccess.Notifications().ClearNotifications(ak); err != nil {
					slog.Error(err)
				}
				continue
			} else {
				s.notify(st, n)
			}
			if n.Next != nil {
				s.QueueNotification(ak, n.Next, utcNow())
			}
		}
	}
}
func (s *Schedule) RunAlert(a *conf.Alert) {
	// Add to the waitgroup for running alert checks.
	s.checksRunning.Add(1)
	// Ensure that when an alert is done it is removed from the wait group.
	defer s.checksRunning.Done()
	for {
		// Calculate runEvery based on the system default, overridden if the alert has a
		// custom runEvery.
		runEvery := s.SystemConf.GetDefaultRunEvery()
		if a.RunEvery != 0 {
			runEvery = a.RunEvery
		}
		wait := time.After(s.SystemConf.GetCheckFrequency() * time.Duration(runEvery))
		s.checkAlert(a)
		s.LastCheck = utcNow()
		select {
		case <-wait:
		case <-s.runnerContext.Done():
			// If an alert is waiting, we cancel it.
			slog.Infof("Stopping alert routine for %v\n", a.Name)
			return
		}
	}
}
func c_awsBilling(accessKey, secretKey, region, productCodes, bucketName, bucketPath string, purgeDays int) (opentsdb.MultiDataPoint, error) {
	creds := credentials.NewStaticCredentials(accessKey, secretKey, "")
	conf := &aws.Config{
		Credentials: creds,
		Region:      &region,
	}
	awsBilling := awsBillingConfig{
		bucketName: bucketName,
		bucketPath: bucketPath,
	}
	regCompiled, err := regexp.Compile(productCodes)
	if err != nil {
		return nil, err
	}
	awsBilling.prodCodesReg = regCompiled
	awsBilling.s3svc = s3.New(session.New(conf)) // Connect to S3
	if awsBilling.s3svc == nil {
		return nil, fmt.Errorf("unable to connect to S3")
	}
	awsBilling.r53svc = route53.New(session.New(conf)) // Connect to R53
	if awsBilling.r53svc == nil {
		return nil, fmt.Errorf("unable to connect to Route 53")
	}
	awsBilling.downloader = s3manager.NewDownloader(session.New(conf)) // Gimmie a downloader
	if awsBilling.downloader == nil {
		return nil, fmt.Errorf("unable to create S3 downloader")
	}
	if purgeDays == 0 {
		slog.Infof("S3 purging of objects is disabled")
		awsBilling.purgeOlderThan = time.Date(2999, 12, 31, 23, 59, 59, 0, time.UTC)
	} else {
		purgeHours := time.Duration(-1 * 24 * purgeDays)
		awsBilling.purgeOlderThan = time.Now().Add(purgeHours * time.Hour)
	}
	return awsBilling.Check()
}
func (s *Schedule) executeTemplates(state *models.IncidentState, event *models.Event, a *conf.Alert, r *RunHistory) {
	if event.Status != models.StUnknown {
		var errs []error
		metric := "template.render"
		//Render subject
		endTiming := collect.StartTimer(metric, opentsdb.TagSet{"alert": a.Name, "type": "subject"})
		subject, err := s.ExecuteSubject(r, a, state, false)
		if err != nil {
			slog.Infof("%s: %v", state.AlertKey, err)
			errs = append(errs, err)
		} else if subject == nil {
			err = fmt.Errorf("Empty subject on %s", state.AlertKey)
			slog.Error(err)
			errs = append(errs, err)
		}
		endTiming()
		//Render body
		endTiming = collect.StartTimer(metric, opentsdb.TagSet{"alert": a.Name, "type": "body"})
		body, _, err := s.ExecuteBody(r, a, state, false)
		if err != nil {
			slog.Infof("%s: %v", state.AlertKey, err)
			errs = append(errs, err)
		} else if body == nil {
			err = fmt.Errorf("Empty body on %s", state.AlertKey)
			slog.Error(err)
			errs = append(errs, err)
		}
		endTiming()
		//Render email body
		endTiming = collect.StartTimer(metric, opentsdb.TagSet{"alert": a.Name, "type": "emailbody"})
		emailbody, attachments, err := s.ExecuteBody(r, a, state, true)
		if err != nil {
			slog.Infof("%s: %v", state.AlertKey, err)
			errs = append(errs, err)
		} else if emailbody == nil {
			err = fmt.Errorf("Empty email body on %s", state.AlertKey)
			slog.Error(err)
			errs = append(errs, err)
		}
		endTiming()
		//Render email subject
		endTiming = collect.StartTimer(metric, opentsdb.TagSet{"alert": a.Name, "type": "emailsubject"})
		emailsubject, err := s.ExecuteSubject(r, a, state, true)
		if err != nil {
			slog.Infof("%s: %v", state.AlertKey, err)
			errs = append(errs, err)
		} else if emailsubject == nil {
			err = fmt.Errorf("Empty email subject on %s", state.AlertKey)
			slog.Error(err)
			errs = append(errs, err)
		}
		endTiming()
		if errs != nil {
			endTiming = collect.StartTimer(metric, opentsdb.TagSet{"alert": a.Name, "type": "bad"})
			subject, body, err = s.ExecuteBadTemplate(errs, r, a, state)
			endTiming()
			if err != nil {
				subject = []byte(fmt.Sprintf("unable to create template error notification: %v", err))
			}
			emailbody = body
			attachments = nil
		}
		state.Subject = string(subject)
		state.Body = string(body)
		// don't save email separately if they are identical
		if string(state.EmailBody) != state.Body {
			state.EmailBody = emailbody
		}
		if string(state.EmailSubject) != state.Subject {
			state.EmailSubject = emailsubject
		}
		state.Attachments = attachments
	}
}
// RunHistory for a single alert key. Returns true if notifications were altered.
func (s *Schedule) runHistory(r *RunHistory, ak models.AlertKey, event *models.Event, silenced SilenceTester) (checkNotify bool, err error) {
	event.Time = r.Start
	a := s.Conf.Alerts[ak.Name()]
	if a.UnknownsNormal && event.Status == models.StUnknown {
		event.Status = models.StNormal
	}
	data := s.DataAccess.State()
	err = data.TouchAlertKey(ak, utcNow())
	if err != nil {
		return
	}
	si := silenced(ak)
	// get existing open incident if exists
	var incident *models.IncidentState
	incident, err = data.GetOpenIncident(ak)
	if err != nil {
		return
	}
	defer func() {
		// save unless incident is new and closed (log alert)
		if incident != nil && (incident.Id != 0 || incident.Open) {
			_, err = data.UpdateIncidentState(incident)
		} else {
			err = data.SetUnevaluated(ak, event.Unevaluated) // if nothing to save, at least store the unevaluated state
		}
	}()
	// If nothing is out of the ordinary we are done
	if event.Status <= models.StNormal && incident == nil {
		return
	}
	// if event is unevaluated, we are done also.
	if incident != nil {
		incident.Unevaluated = event.Unevaluated
	}
	if event.Unevaluated {
		return
	}
	shouldNotify := false
	newIncident := false
	if incident == nil {
		incident = NewIncident(ak)
		newIncident = true
		shouldNotify = true
	}
	// VICTOROPS INTEGRATION: Enables notification of incidents which have returned to normal (sends normNotification defined in config)
	if event.Status <= models.StNormal && (incident.CurrentStatus == models.StWarning || incident.CurrentStatus == models.StCritical) {
		slog.Infof("TRIGGER_RESOLVED: from %s to %s", incident.CurrentStatus, event.Status)
		shouldNotify = true
	}
	// VICTOROPS INTEGRATION: Enables notification of incidents which have returned to normal but are now back to warning or critical, i.e. enables flapping.
	if incident.CurrentStatus == models.StNormal && (event.Status == models.StCritical || event.Status == models.StWarning) {
		slog.Infof("TRIGGER_REALERT: from %s to %s", incident.CurrentStatus, event.Status)
		shouldNotify = true
	}
	// set state.Result according to event result
	if event.Status == models.StCritical {
		incident.Result = event.Crit
	} else if event.Status == models.StWarning {
		incident.Result = event.Warn
	}
	if event.Status > models.StNormal {
		incident.LastAbnormalStatus = event.Status
		incident.LastAbnormalTime = event.Time.UTC().Unix()
	}
	if event.Status > incident.WorstStatus {
		incident.WorstStatus = event.Status
		shouldNotify = true
	}
	if event.Status != incident.CurrentStatus {
		incident.Events = append(incident.Events, *event)
	}
	incident.CurrentStatus = event.Status
	// run a preliminary save on new incidents to get an id
	if newIncident {
		if a.Log || silencedOrIgnored(a, event, si) {
			// a log or silenced/ignored alert will not need to be saved
		} else {
			incident.Id, err = s.DataAccess.State().UpdateIncidentState(incident)
			if err != nil {
				return
			}
		}
	}
	// render templates and open alert key if abnormal
	if event.Status > models.StNormal {
		s.executeTemplates(incident, event, a, r)
		incident.Open = true
		if a.Log {
			incident.Open = false
		}
	}
	// On state increase, clear old notifications and notify current.
	// Do nothing if state did not change.
	notify := func(ns *conf.Notifications) {
		if a.Log {
			lastLogTime := s.lastLogTimes[ak]
			now := utcNow()
			if now.Before(lastLogTime.Add(a.MaxLogFrequency)) {
				return
			}
			s.lastLogTimes[ak] = now
		}
		nots := ns.Get(s.Conf, incident.AlertKey.Group())
		for _, n := range nots {
			s.Notify(incident, n)
			checkNotify = true
		}
	}
	notifyCurrent := func() {
		// Auto close ignoreUnknowns for new incident.
		if silencedOrIgnored(a, event, si) {
			incident.Open = false
			return
		}
		// VICTOROPS INTEGRATION
		incident.NeedAck = false
		switch event.Status {
		case models.StCritical, models.StUnknown:
			notify(a.CritNotification)
		case models.StWarning:
			notify(a.WarnNotification)
		case models.StNormal:
			// VICTOROPS INTEGRATION
			incident.NeedAck = false
			notify(a.NormNotification)
		}
	}
	// lock while we change notifications.
	s.Lock("RunHistory")
	if shouldNotify {
		incident.NeedAck = false
		if err = s.DataAccess.Notifications().ClearNotifications(ak); err != nil {
			return
		}
		notifyCurrent()
	}
	// finally close an open alert with silence once it goes back to normal.
	if si := silenced(ak); si != nil && event.Status == models.StNormal {
		go func(ak models.AlertKey) {
			slog.Infof("auto close %s because was silenced", ak)
			err := s.Action("bosun", "Auto close because was silenced.", models.ActionClose, ak)
			if err != nil {
				slog.Errorln(err)
			}
		}(ak)
	}
	s.Unlock()
	return checkNotify, nil
}
func main() {
	flag.Parse()
	if *flagToToml != "" {
		toToml(*flagToToml)
		fmt.Println("toml conversion complete; remove all empty values by hand (empty strings, 0)")
		return
	}
	if *flagPrint || *flagDebug {
		slog.Set(&slog.StdLog{Log: log.New(os.Stdout, "", log.LstdFlags)})
	}
	if *flagVersion {
		fmt.Println(version.GetVersionInfo("scollector"))
		os.Exit(0)
	}
	for _, m := range mains {
		m()
	}
	conf := readConf()
	if *flagHost != "" {
		conf.Host = *flagHost
	}
	if *flagFilter != "" {
		conf.Filter = strings.Split(*flagFilter, ",")
	}
	if !conf.Tags.Valid() {
		slog.Fatalf("invalid tags: %v", conf.Tags)
	} else if conf.Tags["host"] != "" {
		slog.Fatalf("host not supported in custom tags, use Hostname instead")
	}
	if conf.PProf != "" {
		go func() {
			slog.Infof("Starting pprof at http://%s/debug/pprof/", conf.PProf)
			slog.Fatal(http.ListenAndServe(conf.PProf, nil))
		}()
	}
	collectors.AddTags = conf.Tags
	util.FullHostname = conf.FullHost
	util.Set()
	if conf.Hostname != "" {
		util.Hostname = conf.Hostname
	}
	if err := collect.SetHostname(util.Hostname); err != nil {
		slog.Fatal(err)
	}
	if conf.ColDir != "" {
		collectors.InitPrograms(conf.ColDir)
	}
	var err error
	check := func(e error) {
		if e != nil {
			err = e
		}
	}
	collectors.Init(conf)
	for _, r := range conf.MetricFilters {
		check(collectors.AddMetricFilters(r))
	}
	for _, rmq := range conf.RabbitMQ {
		check(collectors.RabbitMQ(rmq.URL))
	}
	for _, cfg := range conf.SNMP {
		check(collectors.SNMP(cfg, conf.MIBS))
	}
	for _, i := range conf.ICMP {
		check(collectors.ICMP(i.Host))
	}
	for _, a := range conf.AWS {
		check(collectors.AWS(a.AccessKey, a.SecretKey, a.Region))
	}
	for _, v := range conf.Vsphere {
		check(collectors.Vsphere(v.User, v.Password, v.Host))
	}
	for _, p := range conf.Process {
		check(collectors.AddProcessConfig(p))
	}
	for _, p := range conf.ProcessDotNet {
		check(collectors.AddProcessDotNetConfig(p))
	}
	for _, h := range conf.HTTPUnit {
		if h.TOML != "" {
			check(collectors.HTTPUnitTOML(h.TOML))
		}
		if h.Hiera != "" {
			check(collectors.HTTPUnitHiera(h.Hiera))
		}
	}
	for _, r := range conf.Riak {
		check(collectors.Riak(r.URL))
	}
	for _, x := range conf.ExtraHop {
		check(collectors.ExtraHop(x.Host, x.APIKey, x.FilterBy, x.FilterPercent))
	}
	if err != nil {
		slog.Fatal(err)
	}
	collectors.KeepalivedCommunity = conf.KeepalivedCommunity
	// Add all process collectors. This is platform specific.
	collectors.WatchProcesses()
	collectors.WatchProcessesDotNet()
	if *flagFake > 0 {
		collectors.InitFake(*flagFake)
	}
	collect.Debug = *flagDebug
	util.Debug = *flagDebug
	collect.DisableDefaultCollectors = conf.DisableSelf
	c := collectors.Search(conf.Filter)
	if len(c) == 0 {
		slog.Fatalf("Filter %v matches no collectors.", conf.Filter)
	}
	for _, col := range c {
		col.Init()
	}
	u, err := parseHost(conf.Host)
	if *flagList {
		list(c)
		return
	} else if *flagPrint {
		u = &url.URL{Scheme: "http", Host: "localhost:0"}
	} else if err != nil {
		slog.Fatalf("invalid host %v: %v", conf.Host, err)
	}
	freq := time.Second * time.Duration(conf.Freq)
	if freq <= 0 {
		slog.Fatal("freq must be > 0")
	}
	collectors.DefaultFreq = freq
	collect.Freq = freq
	if conf.BatchSize < 0 {
		slog.Fatal("BatchSize must be > 0")
	}
	if conf.BatchSize != 0 {
		collect.BatchSize = conf.BatchSize
	}
	collect.Tags = conf.Tags.Copy().Merge(opentsdb.TagSet{"os": runtime.GOOS})
	if *flagPrint {
		collect.Print = true
	}
	if !*flagDisableMetadata {
		if err := metadata.Init(u, *flagDebug); err != nil {
			slog.Fatal(err)
		}
	}
	cdp, cquit := collectors.Run(c)
	if u != nil {
		slog.Infoln("OpenTSDB host:", u)
	}
	if err := collect.InitChan(u, "scollector", cdp); err != nil {
		slog.Fatal(err)
	}
	if version.VersionDate != "" {
		v, err := strconv.ParseInt(version.VersionDate, 10, 64)
		if err == nil {
			go func() {
				metadata.AddMetricMeta("scollector.version", metadata.Gauge, metadata.None,
					"Scollector version number, which indicates when scollector was built.")
				for {
					if err := collect.Put("version", collect.Tags, v); err != nil {
						slog.Error(err)
					}
					time.Sleep(time.Hour)
				}
			}()
		}
	}
	if *flagBatchSize > 0 {
		collect.BatchSize = *flagBatchSize
	}
	go func() {
		const maxMem = 500 * 1024 * 1024 // 500MB
		var m runtime.MemStats
		for range time.Tick(time.Minute) {
			runtime.ReadMemStats(&m)
			if m.Alloc > maxMem {
				panic("memory max reached")
			}
		}
	}()
	sChan := make(chan os.Signal)
	signal.Notify(sChan, os.Interrupt)
	<-sChan
	close(cquit)
	// Try to flush all datapoints on interrupt, but quit after 5 seconds no matter what.
	time.AfterFunc(5*time.Second, func() {
		os.Exit(0)
	})
	collect.Flush()
}
// RunHistory for a single alert key. Returns true if notifications were altered.
func (s *Schedule) runHistory(r *RunHistory, ak models.AlertKey, event *Event, silenced map[models.AlertKey]models.Silence) bool {
	checkNotify := false
	// get existing state object for alert key. add to schedule status if doesn't already exist
	state := s.GetStatus(ak)
	if state == nil {
		state = NewStatus(ak)
		s.SetStatus(ak, state)
	}
	defer s.SetStatus(ak, state)
	// make sure we always touch the state.
	state.Touched = r.Start
	// set state.Result according to event result
	if event.Crit != nil {
		state.Result = event.Crit
	} else if event.Warn != nil {
		state.Result = event.Warn
	}
	// if event is unevaluated, we are done.
	state.Unevaluated = event.Unevaluated
	if event.Unevaluated {
		return checkNotify
	}
	// assign incident id to new event if applicable
	prev := state.Last()
	worst := StNormal
	event.Time = r.Start
	if prev.IncidentId != 0 {
		// If last event has incident id and is not closed, we continue it.
		incident, err := s.DataAccess.Incidents().GetIncident(prev.IncidentId)
		if err != nil {
			slog.Error(err)
		} else if incident.End == nil {
			event.IncidentId = prev.IncidentId
			worst = state.WorstThisIncident()
		}
	}
	if event.IncidentId == 0 && event.Status != StNormal {
		incident, err := s.createIncident(ak, event.Time)
		if err != nil {
			slog.Error("Error creating incident", err)
		} else {
			event.IncidentId = incident.Id
		}
	}
	state.Append(event)
	a := s.Conf.Alerts[ak.Name()]
	// render templates and open alert key if abnormal
	if event.Status > StNormal {
		s.executeTemplates(state, event, a, r)
		state.Open = true
		if a.Log {
			worst = StNormal
			state.Open = false
		}
	}
	// On state increase, clear old notifications and notify current.
	// If the old alert was not acknowledged, do nothing.
	// Do nothing if state did not change.
	notify := func(ns *conf.Notifications) {
		if a.Log {
			lastLogTime := state.LastLogTime
			now := time.Now()
			if now.Before(lastLogTime.Add(a.MaxLogFrequency)) {
				return
			}
			state.LastLogTime = now
		}
		nots := ns.Get(s.Conf, state.Group)
		for _, n := range nots {
			s.Notify(state, n)
			checkNotify = true
		}
	}
	notifyCurrent := func() {
		// Auto close ignoreUnknowns.
		if a.IgnoreUnknown && event.Status == StUnknown {
			state.Open = false
			state.Forgotten = true
			state.NeedAck = false
			state.Action("bosun", "Auto close because alert has ignoreUnknown.", ActionClose, event.Time)
			slog.Infof("auto close %s because alert has ignoreUnknown", ak)
			return
		} else if silenced[ak].Forget && event.Status == StUnknown {
			state.Open = false
			state.Forgotten = true
			state.NeedAck = false
			state.Action("bosun", "Auto close because alert is silenced and marked auto forget.", ActionClose, event.Time)
			slog.Infof("auto close %s because alert is silenced and marked auto forget", ak)
			return
		}
		state.NeedAck = true
		switch event.Status {
		case StCritical, StUnknown:
			notify(a.CritNotification)
		case StWarning:
			notify(a.WarnNotification)
		}
	}
	clearOld := func() {
		state.NeedAck = false
		delete(s.Notifications, ak)
	}
	// lock while we change notifications.
	s.Lock("RunHistory")
	if event.Status > worst {
		clearOld()
		notifyCurrent()
	} else if _, ok := silenced[ak]; ok && event.Status == StNormal {
		go func(ak models.AlertKey) {
			slog.Infof("auto close %s because was silenced", ak)
			err := s.Action("bosun", "Auto close because was silenced.", ActionClose, ak)
			if err != nil {
				slog.Errorln(err)
			}
		}(ak)
	}
	s.Unlock()
	return checkNotify
}