func (s *Schedule) notify(st *models.IncidentState, n *conf.Notification) { if len(st.EmailSubject) == 0 { st.EmailSubject = []byte(st.Subject) } if len(st.EmailBody) == 0 { st.EmailBody = []byte(st.Body) } n.Notify(st.Subject, st.Body, st.EmailSubject, st.EmailBody, s.SystemConf, string(st.AlertKey), st.Attachments...) }
func Status(t miniprofiler.Timer, w http.ResponseWriter, r *http.Request) (interface{}, error) { r.ParseForm() type ExtStatus struct { AlertName string *models.IncidentState } m := make(map[string]ExtStatus) for _, k := range r.Form["ak"] { ak, err := models.ParseAlertKey(k) if err != nil { return nil, err } var state *models.IncidentState if r.FormValue("all") != "" { allInc, err := schedule.DataAccess.State().GetAllIncidents(ak) if err != nil { return nil, err } if len(allInc) == 0 { return nil, fmt.Errorf("No incidents for alert key") } state = allInc[0] allEvents := models.EventsByTime{} for _, inc := range allInc { for _, e := range inc.Events { allEvents = append(allEvents, e) } } sort.Sort(allEvents) state.Events = allEvents } else { state, err = schedule.DataAccess.State().GetLatestIncident(ak) if err != nil { return nil, err } } st := ExtStatus{IncidentState: state} if st.IncidentState == nil { return nil, fmt.Errorf("unknown alert key: %v", k) } st.AlertName = ak.Name() m[k] = st } return m, nil }
func (s *Schedule) action(user, message string, t models.ActionType, st *models.IncidentState) (ak models.AlertKey, e error) { if err := collect.Add("actions", opentsdb.TagSet{"user": user, "alert": st.AlertKey.Name(), "type": t.String()}, 1); err != nil { slog.Errorln(err) } defer func() { if e == nil { if err := collect.Add("actions", opentsdb.TagSet{"user": user, "alert": st.AlertKey.Name(), "type": t.String()}, 1); err != nil { slog.Errorln(err) } if err := s.DataAccess.Notifications().ClearNotifications(st.AlertKey); err != nil { e = err } } }() isUnknown := st.LastAbnormalStatus == models.StUnknown timestamp := utcNow() switch t { case models.ActionAcknowledge: if !st.NeedAck { return "", fmt.Errorf("alert already acknowledged") } if !st.Open { return "", fmt.Errorf("cannot acknowledge closed alert") } st.NeedAck = false case models.ActionClose: if st.IsActive() { return "", fmt.Errorf("cannot close active alert") } fallthrough case models.ActionForceClose: st.Open = false st.End = ×tamp case models.ActionForget: if !isUnknown { return "", fmt.Errorf("can only forget unknowns") } fallthrough case models.ActionPurge: return st.AlertKey, s.DataAccess.State().Forget(st.AlertKey) case models.ActionNote: // pass default: return "", fmt.Errorf("unknown action type: %v", t) } st.Actions = append(st.Actions, models.Action{ Message: message, Time: timestamp, Type: t, User: user, }) _, err := s.DataAccess.State().UpdateIncidentState(st) return st.AlertKey, err }
func (s *Schedule) executeTemplates(state *models.IncidentState, event *models.Event, a *conf.Alert, r *RunHistory) { if event.Status != models.StUnknown { var errs []error metric := "template.render" //Render subject endTiming := collect.StartTimer(metric, opentsdb.TagSet{"alert": a.Name, "type": "subject"}) subject, err := s.ExecuteSubject(r, a, state, false) if err != nil { slog.Infof("%s: %v", state.AlertKey, err) errs = append(errs, err) } else if subject == nil { err = fmt.Errorf("Empty subject on %s", state.AlertKey) slog.Error(err) errs = append(errs, err) } endTiming() //Render body endTiming = collect.StartTimer(metric, opentsdb.TagSet{"alert": a.Name, "type": "body"}) body, _, err := s.ExecuteBody(r, a, state, false) if err != nil { slog.Infof("%s: %v", state.AlertKey, err) errs = append(errs, err) } else if subject == nil { err = fmt.Errorf("Empty body on %s", state.AlertKey) slog.Error(err) errs = append(errs, err) } endTiming() //Render email body endTiming = collect.StartTimer(metric, opentsdb.TagSet{"alert": a.Name, "type": "emailbody"}) emailbody, attachments, err := s.ExecuteBody(r, a, state, true) if err != nil { slog.Infof("%s: %v", state.AlertKey, err) errs = append(errs, err) } else if subject == nil { err = fmt.Errorf("Empty email body on %s", state.AlertKey) slog.Error(err) errs = append(errs, err) } endTiming() //Render email subject endTiming = collect.StartTimer(metric, opentsdb.TagSet{"alert": a.Name, "type": "emailsubject"}) emailsubject, err := s.ExecuteSubject(r, a, state, true) if err != nil { slog.Infof("%s: %v", state.AlertKey, err) errs = append(errs, err) } else if subject == nil { err = fmt.Errorf("Empty email subject on %s", state.AlertKey) slog.Error(err) errs = append(errs, err) } endTiming() if errs != nil { endTiming = collect.StartTimer(metric, opentsdb.TagSet{"alert": a.Name, "type": "bad"}) subject, body, err = s.ExecuteBadTemplate(errs, r, a, state) endTiming() if err != nil { subject = []byte(fmt.Sprintf("unable to create template error notification: %v", err)) } emailbody = body attachments = nil } state.Subject = string(subject) state.Body = string(body) //don't save email seperately if they are identical if string(state.EmailBody) != state.Body { state.EmailBody = emailbody } if string(state.EmailSubject) != state.Subject { state.EmailSubject = emailsubject } state.Attachments = attachments } }
// RunHistory for a single alert key. Returns true if notifications were altered. func (s *Schedule) runHistory(r *RunHistory, ak models.AlertKey, event *models.Event, silenced SilenceTester) (checkNotify bool, err error) { event.Time = r.Start a := s.Conf.Alerts[ak.Name()] if a.UnknownsNormal && event.Status == models.StUnknown { event.Status = models.StNormal } data := s.DataAccess.State() err = data.TouchAlertKey(ak, utcNow()) if err != nil { return } si := silenced(ak) // get existing open incident if exists var incident *models.IncidentState incident, err = data.GetOpenIncident(ak) if err != nil { return } defer func() { // save unless incident is new and closed (log alert) if incident != nil && (incident.Id != 0 || incident.Open) { _, err = data.UpdateIncidentState(incident) } else { err = data.SetUnevaluated(ak, event.Unevaluated) // if nothing to save, at least store the unevaluated state } }() // If nothing is out of the ordinary we are done if event.Status <= models.StNormal && incident == nil { return } // if event is unevaluated, we are done also. if incident != nil { incident.Unevaluated = event.Unevaluated } if event.Unevaluated { return } shouldNotify := false newIncident := false if incident == nil { incident = NewIncident(ak) newIncident = true shouldNotify = true } // VICTOROPS INTEGRATION: Enables notification of incidents which have returned to normal (Sends normNotification defined in config) if event.Status <= models.StNormal && (incident.CurrentStatus == models.StWarning || incident.CurrentStatus == models.StCritical) { slog.Infof("TRIGGER_RESOLVED: from %s to %s", incident.CurrentStatus, event.Status) shouldNotify = true } // VICTOROPS INTEGRATION: Enables notification of Incidents which have returned to normal but are now back to warning or critical. i.e. enable Flapping if incident.CurrentStatus == models.StNormal && (event.Status == models.StCritical || event.Status == models.StWarning) { slog.Infof("TRIGGER_REALERT: from %s to %s", incident.CurrentStatus, event.Status) shouldNotify = true } // set state.Result according to event result if event.Status == models.StCritical { incident.Result = event.Crit } else if event.Status == models.StWarning { incident.Result = event.Warn } if event.Status > models.StNormal { incident.LastAbnormalStatus = event.Status incident.LastAbnormalTime = event.Time.UTC().Unix() } if event.Status > incident.WorstStatus { incident.WorstStatus = event.Status shouldNotify = true } if event.Status != incident.CurrentStatus { incident.Events = append(incident.Events, *event) } incident.CurrentStatus = event.Status //run a preliminary save on new incidents to get an id if newIncident { if a.Log || silencedOrIgnored(a, event, si) { //a log or silenced/ignored alert will not need to be saved } else { incident.Id, err = s.DataAccess.State().UpdateIncidentState(incident) if err != nil { return } } } //render templates and open alert key if abnormal if event.Status > models.StNormal { s.executeTemplates(incident, event, a, r) incident.Open = true if a.Log { incident.Open = false } } // On state increase, clear old notifications and notify current. // Do nothing if state did not change. notify := func(ns *conf.Notifications) { if a.Log { lastLogTime := s.lastLogTimes[ak] now := utcNow() if now.Before(lastLogTime.Add(a.MaxLogFrequency)) { return } s.lastLogTimes[ak] = now } nots := ns.Get(s.Conf, incident.AlertKey.Group()) for _, n := range nots { s.Notify(incident, n) checkNotify = true } } notifyCurrent := func() { //Auto close ignoreUnknowns for new incident. if silencedOrIgnored(a, event, si) { incident.Open = false return } // VICTOROPS INTEGRATION incident.NeedAck = false switch event.Status { case models.StCritical, models.StUnknown: notify(a.CritNotification) case models.StWarning: notify(a.WarnNotification) case models.StNormal: // VICTOROPS INTEGRATION incident.NeedAck = false notify(a.NormNotification) } } // lock while we change notifications. s.Lock("RunHistory") if shouldNotify { incident.NeedAck = false if err = s.DataAccess.Notifications().ClearNotifications(ak); err != nil { return } notifyCurrent() } // finally close an open alert with silence once it goes back to normal. if si := silenced(ak); si != nil && event.Status == models.StNormal { go func(ak models.AlertKey) { slog.Infof("auto close %s because was silenced", ak) err := s.Action("bosun", "Auto close because was silenced.", models.ActionClose, ak) if err != nil { slog.Errorln(err) } }(ak) } s.Unlock() return checkNotify, nil }
func procRule(t miniprofiler.Timer, c *conf.Conf, a *conf.Alert, now time.Time, summary bool, email string, template_group string) (*ruleResult, error) { s := &sched.Schedule{} s.DataAccess = schedule.DataAccess s.Search = schedule.Search if err := s.Init(c); err != nil { return nil, err } rh := s.NewRunHistory(now, cacheObj) if _, err := s.CheckExpr(t, rh, a, a.Warn, models.StWarning, nil); err != nil { return nil, err } if _, err := s.CheckExpr(t, rh, a, a.Crit, models.StCritical, nil); err != nil { return nil, err } keys := make(models.AlertKeys, len(rh.Events)) criticals, warnings, normals := make([]models.AlertKey, 0), make([]models.AlertKey, 0), make([]models.AlertKey, 0) i := 0 for k, v := range rh.Events { v.Time = now keys[i] = k i++ switch v.Status { case models.StNormal: normals = append(normals, k) case models.StWarning: warnings = append(warnings, k) case models.StCritical: criticals = append(criticals, k) default: return nil, fmt.Errorf("unknown state type %v", v.Status) } } sort.Sort(keys) var subject, body []byte var data interface{} warning := make([]string, 0) if !summary && len(keys) > 0 { var primaryIncident *models.IncidentState if template_group != "" { ts, err := opentsdb.ParseTags(template_group) if err != nil { return nil, err } for _, ak := range keys { if ak.Group().Subset(ts) { primaryIncident = sched.NewIncident(ak) primaryIncident.Events = []models.Event{*rh.Events[ak]} break } } } if primaryIncident == nil { primaryIncident = sched.NewIncident(keys[0]) primaryIncident.Events = []models.Event{*rh.Events[keys[0]]} if template_group != "" { warning = append(warning, fmt.Sprintf("template group %s was not a subset of any result", template_group)) } } if e := primaryIncident.Events[0]; e.Crit != nil { primaryIncident.Result = e.Crit } else if e.Warn != nil { primaryIncident.Result = e.Warn } var b_err, s_err error func() { defer func() { if err := recover(); err != nil { s := fmt.Sprint(err) warning = append(warning, s) b_err = fmt.Errorf(s) } }() if body, _, b_err = s.ExecuteBody(rh, a, primaryIncident, false); b_err != nil { warning = append(warning, b_err.Error()) } }() func() { defer func() { if err := recover(); err != nil { s := fmt.Sprint(err) warning = append(warning, s) s_err = fmt.Errorf(s) } }() subject, s_err = s.ExecuteSubject(rh, a, primaryIncident, false) if s_err != nil { warning = append(warning, s_err.Error()) } }() if s_err != nil || b_err != nil { var err error subject, body, err = s.ExecuteBadTemplate([]error{s_err, b_err}, rh, a, primaryIncident) if err != nil { subject = []byte(fmt.Sprintf("unable to create tempalate error notification: %v", err)) } } else if email != "" { m, err := mail.ParseAddress(email) if err != nil { return nil, err } n := conf.Notification{ Email: []*mail.Address{m}, } email, attachments, b_err := s.ExecuteBody(rh, a, primaryIncident, true) email_subject, s_err := s.ExecuteSubject(rh, a, primaryIncident, true) if b_err != nil { warning = append(warning, b_err.Error()) } else if s_err != nil { warning = append(warning, s_err.Error()) } else { n.DoEmail(email_subject, email, schedule.Conf, string(primaryIncident.AlertKey), attachments...) } } data = s.Data(rh, primaryIncident, a, false) } return &ruleResult{ criticals, warnings, normals, now, string(body), string(subject), data, rh.Events, warning, }, nil }
func (d *dataAccess) save(s *models.IncidentState, isImport bool) error { defer collect.StartTimer("redis", opentsdb.TagSet{"op": "UpdateIncident"})() conn := d.GetConnection() defer conn.Close() isNew := false //if id is still zero, assign new id. if s.Id == 0 { id, err := redis.Int64(conn.Do("INCR", "maxIncidentId")) if err != nil { return slog.Wrap(err) } s.Id = id isNew = true } else if isImport { max, err := redis.Int64(conn.Do("GET", "maxIncidentId")) if err != nil { max = 0 } if max < s.Id { if _, err = conn.Do("SET", "maxIncidentId", s.Id); err != nil { return slog.Wrap(err) } } isNew = true } return d.transact(conn, func() error { if isNew { // add to list for alert key if _, err := conn.Do("LPUSH", incidentsForAlertKeyKey(s.AlertKey), s.Id); err != nil { return slog.Wrap(err) } dat := fmt.Sprintf("%d:%d:%s", s.Id, s.Start.UTC().Unix(), s.AlertKey) if _, err := conn.Do("LPUSH", "allIncidents", dat); err != nil { return slog.Wrap(err) } } // store the incident json data, err := json.Marshal(s) if err != nil { return slog.Wrap(err) } _, err = conn.Do("SET", incidentStateKey(s.Id), data) addRem := func(b bool) string { if b { return "SADD" } return "SREM" } // appropriately add or remove it from the "open" set if s.Open { if _, err = conn.Do("HSET", statesOpenIncidentsKey, s.AlertKey, s.Id); err != nil { return slog.Wrap(err) } } else { if _, err = conn.Do("HDEL", statesOpenIncidentsKey, s.AlertKey); err != nil { return slog.Wrap(err) } } //appropriately add or remove from unknown and uneval sets if _, err = conn.Do(addRem(s.CurrentStatus == models.StUnknown), statesUnknownKey(s.Alert), s.AlertKey); err != nil { return slog.Wrap(err) } if _, err = conn.Do(addRem(s.Unevaluated), statesUnevalKey(s.Alert), s.AlertKey); err != nil { return slog.Wrap(err) } return nil }) }