func (d *dataAccess) TouchAlertKey(ak models.AlertKey, t time.Time) error { conn := d.Get() defer conn.Close() _, err := conn.Do("ZADD", statesLastTouchedKey(ak.Name()), t.UTC().Unix(), string(ak)) return slog.Wrap(err) }
func NewStatus(ak models.AlertKey) *State { g := ak.Group() return &State{ Alert: ak.Name(), Tags: g.Tags(), Group: g, } }
func (d *dataAccess) TouchAlertKey(ak models.AlertKey, t time.Time) error { defer collect.StartTimer("redis", opentsdb.TagSet{"op": "TouchAlertKey"})() conn := d.GetConnection() defer conn.Close() _, err := conn.Do("ZADD", statesLastTouchedKey(ak.Name()), t.UTC().Unix(), string(ak)) return slog.Wrap(err) }
func NewIncident(ak models.AlertKey) *models.IncidentState { s := &models.IncidentState{} s.Start = utcNow() s.AlertKey = ak s.Alert = ak.Name() s.Tags = ak.Group().Tags() s.Result = &models.Result{} return s }
func (s *Schedule) Action(user, message string, t models.ActionType, ak models.AlertKey) error { if err := collect.Add("actions", opentsdb.TagSet{"user": user, "alert": ak.Name(), "type": t.String()}, 1); err != nil { slog.Errorln(err) } st, err := s.DataAccess.State().GetLatestIncident(ak) if err != nil { return err } if st == nil { return fmt.Errorf("no such alert key: %v", ak) } isUnknown := st.LastAbnormalStatus == models.StUnknown timestamp := utcNow() switch t { case models.ActionAcknowledge: if !st.NeedAck { return fmt.Errorf("alert already acknowledged") } if !st.Open { return fmt.Errorf("cannot acknowledge closed alert") } st.NeedAck = false if err := s.DataAccess.Notifications().ClearNotifications(ak); err != nil { return err } case models.ActionClose: if st.IsActive() { return fmt.Errorf("cannot close active alert") } fallthrough case models.ActionForceClose: st.Open = false st.End = ×tamp case models.ActionForget: if !isUnknown { return fmt.Errorf("can only forget unknowns") } fallthrough case models.ActionPurge: return s.DataAccess.State().Forget(ak) default: return fmt.Errorf("unknown action type: %v", t) } // Would like to also track the alert group, but I believe this is impossible because any character // that could be used as a delimiter could also be a valid tag key or tag value character if err := collect.Add("actions", opentsdb.TagSet{"user": user, "alert": ak.Name(), "type": t.String()}, 1); err != nil { slog.Errorln(err) } st.Actions = append(st.Actions, models.Action{ Message: message, Time: timestamp, Type: t, User: user, }) _, err = s.DataAccess.State().UpdateIncidentState(st) return err }
func (d *dataAccess) SetUnevaluated(ak models.AlertKey, uneval bool) error { conn := d.Get() defer conn.Close() op := "SREM" if uneval { op = "SADD" } _, err := conn.Do(op, statesUnevalKey(ak.Name()), ak) return slog.Wrap(err) }
func (d *dataAccess) SetUnevaluated(ak models.AlertKey, uneval bool) error { defer collect.StartTimer("redis", opentsdb.TagSet{"op": "SetUnevaluated"})() conn := d.GetConnection() defer conn.Close() op := "SREM" if uneval { op = "SADD" } _, err := conn.Do(op, statesUnevalKey(ak.Name()), ak) return slog.Wrap(err) }
// The nucular option. Delete all we know about this alert key func (d *dataAccess) Forget(ak models.AlertKey) error { defer collect.StartTimer("redis", opentsdb.TagSet{"op": "Forget"})() conn := d.GetConnection() defer conn.Close() alert := ak.Name() return d.transact(conn, func() error { // last touched. if _, err := conn.Do("HDEL", statesLastTouchedKey(alert), ak); err != nil { return slog.Wrap(err) } // unknown/uneval sets if _, err := conn.Do("SREM", statesUnknownKey(alert), ak); err != nil { return slog.Wrap(err) } if _, err := conn.Do("SREM", statesUnevalKey(alert), ak); err != nil { return slog.Wrap(err) } //open set if _, err := conn.Do("HDEL", statesOpenIncidentsKey, ak); err != nil { return slog.Wrap(err) } //all incidents ids, err := int64s(conn.Do("LRANGE", incidentsForAlertKeyKey(ak), 0, -1)) if err != nil { return slog.Wrap(err) } if _, err = conn.Do("HDEL", statesOpenIncidentsKey, ak); err != nil { return slog.Wrap(err) } for _, id := range ids { if _, err = conn.Do("DEL", incidentStateKey(id)); err != nil { return slog.Wrap(err) } } if _, err := conn.Do(d.LCLEAR(), incidentsForAlertKeyKey(ak)); err != nil { return slog.Wrap(err) } return nil }) }
// RunHistory for a single alert key. Returns true if notifications were altered. func (s *Schedule) runHistory(r *RunHistory, ak models.AlertKey, event *models.Event, silenced SilenceTester) (checkNotify bool, err error) { event.Time = r.Start a := s.Conf.Alerts[ak.Name()] if a.UnknownsNormal && event.Status == models.StUnknown { event.Status = models.StNormal } data := s.DataAccess.State() err = data.TouchAlertKey(ak, utcNow()) if err != nil { return } si := silenced(ak) // get existing open incident if exists var incident *models.IncidentState incident, err = data.GetOpenIncident(ak) if err != nil { return } defer func() { // save unless incident is new and closed (log alert) if incident != nil && (incident.Id != 0 || incident.Open) { _, err = data.UpdateIncidentState(incident) } else { err = data.SetUnevaluated(ak, event.Unevaluated) // if nothing to save, at least store the unevaluated state } }() // If nothing is out of the ordinary we are done if event.Status <= models.StNormal && incident == nil { return } // if event is unevaluated, we are done also. if incident != nil { incident.Unevaluated = event.Unevaluated } if event.Unevaluated { return } shouldNotify := false newIncident := false if incident == nil { incident = NewIncident(ak) newIncident = true shouldNotify = true } // VICTOROPS INTEGRATION: Enables notification of incidents which have returned to normal (Sends normNotification defined in config) if event.Status <= models.StNormal && (incident.CurrentStatus == models.StWarning || incident.CurrentStatus == models.StCritical) { slog.Infof("TRIGGER_RESOLVED: from %s to %s", incident.CurrentStatus, event.Status) shouldNotify = true } // VICTOROPS INTEGRATION: Enables notification of Incidents which have returned to normal but are now back to warning or critical. i.e. 
enable Flapping if incident.CurrentStatus == models.StNormal && (event.Status == models.StCritical || event.Status == models.StWarning) { slog.Infof("TRIGGER_REALERT: from %s to %s", incident.CurrentStatus, event.Status) shouldNotify = true } // set state.Result according to event result if event.Status == models.StCritical { incident.Result = event.Crit } else if event.Status == models.StWarning { incident.Result = event.Warn } if event.Status > models.StNormal { incident.LastAbnormalStatus = event.Status incident.LastAbnormalTime = event.Time.UTC().Unix() } if event.Status > incident.WorstStatus { incident.WorstStatus = event.Status shouldNotify = true } if event.Status != incident.CurrentStatus { incident.Events = append(incident.Events, *event) } incident.CurrentStatus = event.Status //run a preliminary save on new incidents to get an id if newIncident { if a.Log || silencedOrIgnored(a, event, si) { //a log or silenced/ignored alert will not need to be saved } else { incident.Id, err = s.DataAccess.State().UpdateIncidentState(incident) if err != nil { return } } } //render templates and open alert key if abnormal if event.Status > models.StNormal { s.executeTemplates(incident, event, a, r) incident.Open = true if a.Log { incident.Open = false } } // On state increase, clear old notifications and notify current. // Do nothing if state did not change. notify := func(ns *conf.Notifications) { if a.Log { lastLogTime := s.lastLogTimes[ak] now := utcNow() if now.Before(lastLogTime.Add(a.MaxLogFrequency)) { return } s.lastLogTimes[ak] = now } nots := ns.Get(s.Conf, incident.AlertKey.Group()) for _, n := range nots { s.Notify(incident, n) checkNotify = true } } notifyCurrent := func() { //Auto close ignoreUnknowns for new incident. 
if silencedOrIgnored(a, event, si) { incident.Open = false return } // VICTOROPS INTEGRATION incident.NeedAck = false switch event.Status { case models.StCritical, models.StUnknown: notify(a.CritNotification) case models.StWarning: notify(a.WarnNotification) case models.StNormal: // VICTOROPS INTEGRATION incident.NeedAck = false notify(a.NormNotification) } } // lock while we change notifications. s.Lock("RunHistory") if shouldNotify { incident.NeedAck = false if err = s.DataAccess.Notifications().ClearNotifications(ak); err != nil { return } notifyCurrent() } // finally close an open alert with silence once it goes back to normal. if si := silenced(ak); si != nil && event.Status == models.StNormal { go func(ak models.AlertKey) { slog.Infof("auto close %s because was silenced", ak) err := s.Action("bosun", "Auto close because was silenced.", models.ActionClose, ak) if err != nil { slog.Errorln(err) } }(ak) } s.Unlock() return checkNotify, nil }
// notsByAlertKeyKey returns the storage key for notifications of an alert:
// the literal prefix "notsByAlert:" followed by the name of ak.
func notsByAlertKeyKey(ak models.AlertKey) string {
	alertName := ak.Name()
	key := fmt.Sprintf("notsByAlert:%s", alertName)
	return key
}
// runHistory processes one event for a single alert key against the
// in-memory State kept by the schedule. It returns true if at least one
// notification was dispatched.
//
// The state for ak is fetched (or created with NewStatus) and is written back
// with SetStatus when the function returns. The event is stamped with
// r.Start, attached to an existing unclosed incident or to a newly created
// one, and appended to the state. Notifications are sent only when the event
// status is greater than the worst status seen in the current incident.
func (s *Schedule) runHistory(r *RunHistory, ak models.AlertKey, event *Event, silenced map[models.AlertKey]models.Silence) bool {
	checkNotify := false
	// get existing state object for alert key. add to schedule status if doesn't already exist
	state := s.GetStatus(ak)
	if state == nil {
		state = NewStatus(ak)
		s.SetStatus(ak, state)
	}
	// Store the state again on every return path.
	defer s.SetStatus(ak, state)
	// make sure we always touch the state.
	state.Touched = r.Start
	// set state.Result according to event result; Crit takes precedence over Warn
	if event.Crit != nil {
		state.Result = event.Crit
	} else if event.Warn != nil {
		state.Result = event.Warn
	}
	// if event is unevaluated, we are done.
	state.Unevaluated = event.Unevaluated
	if event.Unevaluated {
		return checkNotify
	}
	// assign incident id to new event if applicable
	prev := state.Last()
	// worst is the threshold the event status must exceed to trigger notifications.
	worst := StNormal
	event.Time = r.Start
	if prev.IncidentId != 0 {
		// If last event has incident id and is not closed, we continue it.
		incident, err := s.DataAccess.Incidents().GetIncident(prev.IncidentId)
		if err != nil {
			// A lookup failure is only logged; processing continues without continuing the incident.
			slog.Error(err)
		} else if incident.End == nil {
			event.IncidentId = prev.IncidentId
			worst = state.WorstThisIncident()
		}
	}
	// No incident carried over and the event is not normal: open a new incident.
	if event.IncidentId == 0 && event.Status != StNormal {
		incident, err := s.createIncident(ak, event.Time)
		if err != nil {
			// Creation failure is only logged; the event keeps IncidentId 0.
			slog.Error("Error creating incident", err)
		} else {
			event.IncidentId = incident.Id
		}
	}
	state.Append(event)
	a := s.Conf.Alerts[ak.Name()]
	// render templates and open alert key if abnormal
	if event.Status > StNormal {
		s.executeTemplates(state, event, a, r)
		state.Open = true
		if a.Log {
			// Log alerts are never left open, and resetting worst means any
			// abnormal status passes the "event.Status > worst" check below.
			worst = StNormal
			state.Open = false
		}
	}
	// On state increase, clear old notifications and notify current.
	// Do nothing if state did not change.

	// notify sends every notification that ns yields for the state's group.
	// For log alerts it returns without sending when less than
	// a.MaxLogFrequency has elapsed since state.LastLogTime.
	notify := func(ns *conf.Notifications) {
		if a.Log {
			lastLogTime := state.LastLogTime
			now := time.Now()
			if now.Before(lastLogTime.Add(a.MaxLogFrequency)) {
				return
			}
			state.LastLogTime = now
		}
		nots := ns.Get(s.Conf, state.Group)
		for _, n := range nots {
			s.Notify(state, n)
			checkNotify = true
		}
	}
	// notifyCurrent either auto-closes the state (unknown events on alerts with
	// IgnoreUnknown, or on silences marked Forget) or marks it as needing an
	// ack and notifies according to the event status.
	notifyCurrent := func() {
		// Auto close ignoreUnknowns.
		if a.IgnoreUnknown && event.Status == StUnknown {
			state.Open = false
			state.Forgotten = true
			state.NeedAck = false
			state.Action("bosun", "Auto close because alert has ignoreUnknown.", ActionClose, event.Time)
			slog.Infof("auto close %s because alert has ignoreUnknown", ak)
			return
		} else if silenced[ak].Forget && event.Status == StUnknown {
			// A missing map entry yields the zero Silence value here.
			state.Open = false
			state.Forgotten = true
			state.NeedAck = false
			state.Action("bosun", "Auto close because alert is silenced and marked auto forget.", ActionClose, event.Time)
			slog.Infof("auto close %s because alert is silenced and marked auto forget", ak)
			return
		}
		state.NeedAck = true
		// Unknown uses the critical notifications; other statuses send nothing.
		switch event.Status {
		case StCritical, StUnknown:
			notify(a.CritNotification)
		case StWarning:
			notify(a.WarnNotification)
		}
	}
	// clearOld drops the pending notifications for ak and resets NeedAck.
	clearOld := func() {
		state.NeedAck = false
		delete(s.Notifications, ak)
	}
	// lock while we change notifications.
	s.Lock("RunHistory")
	if event.Status > worst {
		clearOld()
		notifyCurrent()
	} else if _, ok := silenced[ak]; ok && event.Status == StNormal {
		// Silenced alert back to normal: close it from a separate goroutine.
		// NOTE(review): presumably asynchronous because s.Action acquires the
		// schedule lock held here — confirm.
		go func(ak models.AlertKey) {
			slog.Infof("auto close %s because was silenced", ak)
			err := s.Action("bosun", "Auto close because was silenced.", ActionClose, ak)
			if err != nil {
				slog.Errorln(err)
			}
		}(ak)
	}
	s.Unlock()
	return checkNotify
}
// runHistory processes one event for a single alert key against the incident
// state held in the data access layer.
//
// checkNotify is true if at least one notification was dispatched. err is
// set by the early returns on TouchAlertKey/GetOpenIncident failure;
// otherwise it carries the result of the deferred save (UpdateIncidentState
// or SetUnevaluated), which runs on every later return path.
func (s *Schedule) runHistory(r *RunHistory, ak models.AlertKey, event *models.Event, silenced SilenceTester) (checkNotify bool, err error) {
	event.Time = r.Start
	data := s.DataAccess.State()
	err = data.TouchAlertKey(ak, time.Now())
	if err != nil {
		return
	}
	// get existing open incident if exists
	incident, err := data.GetOpenIncident(ak)
	if err != nil {
		return
	}
	// The deferred function assigns the named result err, so the save outcome
	// is what the caller receives even after "return checkNotify, nil".
	defer func() {
		// save unless incident is new and closed (log alert)
		if incident != nil && (incident.Id != 0 || incident.Open) {
			err = data.UpdateIncidentState(incident)
		} else {
			err = data.SetUnevaluated(ak, event.Unevaluated) // if nothing to save, at least store the unevaluated state
		}
	}()
	// If nothing is out of the ordinary we are done
	if event.Status <= models.StNormal && incident == nil {
		return
	}
	// if event is unevaluated, we are done also.
	if incident != nil {
		incident.Unevaluated = event.Unevaluated
	}
	if event.Unevaluated {
		return
	}
	// A brand-new incident always triggers a notification attempt.
	shouldNotify := false
	if incident == nil {
		incident = NewIncident(ak)
		shouldNotify = true
	}
	// set state.Result according to event result
	if event.Status == models.StCritical {
		incident.Result = event.Crit
	} else if event.Status == models.StWarning {
		incident.Result = event.Warn
	}
	// Remember the most recent abnormal status and when it occurred (UTC Unix seconds).
	if event.Status > models.StNormal {
		incident.LastAbnormalStatus = event.Status
		incident.LastAbnormalTime = event.Time.UTC().Unix()
	}
	// An escalation beyond the worst status seen so far also triggers notification.
	if event.Status > incident.WorstStatus {
		incident.WorstStatus = event.Status
		shouldNotify = true
	}
	// Only status changes are recorded as events.
	if event.Status != incident.CurrentStatus {
		incident.Events = append(incident.Events, *event)
	}
	incident.CurrentStatus = event.Status
	a := s.Conf.Alerts[ak.Name()]
	// render templates and open alert key if abnormal
	if event.Status > models.StNormal {
		s.executeTemplates(incident, event, a, r)
		incident.Open = true
		if a.Log {
			// Log alerts are never left open.
			incident.Open = false
		}
	}
	// On state increase, clear old notifications and notify current.
	// Do nothing if state did not change.

	// notify sends every notification that ns yields for the incident's group.
	// For log alerts it returns without sending when less than
	// a.MaxLogFrequency has elapsed since the last log time recorded for ak.
	notify := func(ns *conf.Notifications) {
		if a.Log {
			lastLogTime := s.lastLogTimes[ak]
			now := time.Now()
			if now.Before(lastLogTime.Add(a.MaxLogFrequency)) {
				return
			}
			s.lastLogTimes[ak] = now
		}
		nots := ns.Get(s.Conf, incident.AlertKey.Group())
		for _, n := range nots {
			s.Notify(incident, n)
			checkNotify = true
		}
	}
	// notifyCurrent either closes the incident without notifying (unknown
	// events on alerts with IgnoreUnknown, or on silences marked Forget) or
	// marks it as needing an ack and notifies according to the event status.
	notifyCurrent := func() {
		si := silenced(ak)
		// Auto close ignoreUnknowns for new incident.
		if a.IgnoreUnknown && event.Status == models.StUnknown {
			incident.Open = false
			return
		} else if si != nil && si.Forget && event.Status == models.StUnknown {
			incident.Open = false
			return
		}
		incident.NeedAck = true
		// Unknown uses the critical notifications; other statuses send nothing.
		switch event.Status {
		case models.StCritical, models.StUnknown:
			notify(a.CritNotification)
		case models.StWarning:
			notify(a.WarnNotification)
		}
	}
	// clearOld drops the pending notifications for ak and resets NeedAck.
	clearOld := func() {
		incident.NeedAck = false
		delete(s.Notifications, ak)
	}
	// lock while we change notifications.
	s.Lock("RunHistory")
	if shouldNotify {
		clearOld()
		notifyCurrent()
	}
	// finally close an open alert with silence once it goes back to normal.
	// The close runs in its own goroutine; a failure is only logged.
	if si := silenced(ak); si != nil && event.Status == models.StNormal {
		go func(ak models.AlertKey) {
			slog.Infof("auto close %s because was silenced", ak)
			err := s.Action("bosun", "Auto close because was silenced.", models.ActionClose, ak)
			if err != nil {
				slog.Errorln(err)
			}
		}(ak)
	}
	s.Unlock()
	return checkNotify, nil
}
func (s *Schedule) Action(user, message string, t ActionType, ak models.AlertKey) error { s.Lock("Action") defer s.Unlock() st := s.status[ak] if st == nil { return fmt.Errorf("no such alert key: %v", ak) } ack := func() { delete(s.Notifications, ak) st.NeedAck = false } isUnknown := st.AbnormalStatus() == StUnknown timestamp := time.Now().UTC() switch t { case ActionAcknowledge: if !st.NeedAck { return fmt.Errorf("alert already acknowledged") } if !st.Open { return fmt.Errorf("cannot acknowledge closed alert") } ack() case ActionClose: if st.NeedAck { ack() } if st.IsActive() { return fmt.Errorf("cannot close active alert") } st.Open = false last := st.Last() if last.IncidentId != 0 { incident, err := s.DataAccess.Incidents().GetIncident(last.IncidentId) if err != nil { return err } incident.End = ×tamp if err = s.DataAccess.Incidents().UpdateIncident(last.IncidentId, incident); err != nil { return err } } case ActionForget: if !isUnknown { return fmt.Errorf("can only forget unknowns") } if st.NeedAck { ack() } st.Open = false st.Forgotten = true delete(s.status, ak) default: return fmt.Errorf("unknown action type: %v", t) } st.Action(user, message, t, timestamp) // Would like to also track the alert group, but I believe this is impossible because any character // that could be used as a delimiter could also be a valid tag key or tag value character if err := collect.Add("actions", opentsdb.TagSet{"user": user, "alert": ak.Name(), "type": t.String()}, 1); err != nil { slog.Errorln(err) } return nil }