func (s *Schedule) Action(user, message string, t ActionType, ak expr.AlertKey) error { s.Lock("Action") defer s.Unlock() st := s.status[ak] if st == nil { return fmt.Errorf("no such alert key: %v", ak) } ack := func() { delete(s.Notifications, ak) st.NeedAck = false } isUnknown := st.AbnormalStatus() == StUnknown isError := st.AbnormalStatus() == StError timestamp := time.Now().UTC() switch t { case ActionAcknowledge: if !st.NeedAck { return fmt.Errorf("alert already acknowledged") } if !st.Open { return fmt.Errorf("cannot acknowledge closed alert") } ack() case ActionClose: if st.NeedAck { ack() } if st.IsActive() && !isError { return fmt.Errorf("cannot close active alert") } st.Open = false last := st.Last() if last.IncidentId != 0 { s.incidentLock.Lock() if incident, ok := s.Incidents[last.IncidentId]; ok { incident.End = ×tamp } s.incidentLock.Unlock() } case ActionForget: if !isUnknown { return fmt.Errorf("can only forget unknowns") } if st.NeedAck { ack() } st.Open = false st.Forgotten = true delete(s.status, ak) default: return fmt.Errorf("unknown action type: %v", t) } st.Action(user, message, t, timestamp) // Would like to also track the alert group, but I believe this is impossible because any character // that could be used as a delimiter could also be a valid tag key or tag value character if err := collect.Add("actions", opentsdb.TagSet{"user": user, "alert": ak.Name(), "type": t.String()}, 1); err != nil { log.Println(err) } return nil }
func NewStatus(ak expr.AlertKey) *State { g := ak.Group() return &State{ Alert: ak.Name(), Tags: g.Tags(), Group: g, } }
// RunHistory for a single alert key. Returns true if notifications were altered. func (s *Schedule) runHistory(r *RunHistory, ak expr.AlertKey, event *Event, silenced map[expr.AlertKey]Silence) bool { checkNotify := false // get existing state object for alert key. add to schedule status if doesn't already exist state := s.GetStatus(ak) if state == nil { state = NewStatus(ak) s.SetStatus(ak, state) } defer s.SetStatus(ak, state) // make sure we always touch the state. state.Touched = r.Start // set state.Result according to event result if event.Error != nil { state.Result = event.Error } else if event.Crit != nil { state.Result = event.Crit } else if event.Warn != nil { state.Result = event.Warn } // if event is unevaluated, we are done. state.Unevaluated = event.Unevaluated if event.Unevaluated { return checkNotify } // assign incident id to new event if applicable prev := state.Last() event.Time = r.Start if prev.IncidentId != 0 { // If last event has incident id and is not closed, we continue it. s.incidentLock.Lock() if incident, ok := s.Incidents[prev.IncidentId]; ok && incident.End == nil { event.IncidentId = prev.IncidentId } s.incidentLock.Unlock() } if event.IncidentId == 0 && event.Status != StNormal { // Otherwise, create new incident on first non-normal event. event.IncidentId = s.createIncident(ak, event.Time).Id } // add new event to state last := state.AbnormalStatus() state.Append(event) a := s.Conf.Alerts[ak.Name()] wasOpen := state.Open // render templates and open alert key if abnormal if event.Status > StNormal { s.executeTemplates(state, event, a, r) state.Open = true if a.Log { state.Open = false } } // On state increase, clear old notifications and notify current. // On state decrease, and if the old alert was already acknowledged, notify current. // If the old alert was not acknowledged, do nothing. // Do nothing if state did not change. notify := func(ns *conf.Notifications) { if a.Log { lastLogTime := state.LastLogTime now := time.Now() if now.Before(lastLogTime.Add(a.MaxLogFrequency)) { return } state.LastLogTime = now } nots := ns.Get(s.Conf, state.Group) for _, n := range nots { s.Notify(state, n) checkNotify = true } } notifyCurrent := func() { // Auto close ignoreUnknowns. if a.IgnoreUnknown && event.Status == StUnknown { state.Open = false state.Forgotten = true state.NeedAck = false state.Action("bosun", "Auto close because alert has ignoreUnknown.", ActionClose, event.Time) slog.Infof("auto close %s because alert has ignoreUnknown", ak) return } else if silenced[ak].Forget && event.Status == StUnknown { state.Open = false state.Forgotten = true state.NeedAck = false state.Action("bosun", "Auto close because alert is silenced and marked auto forget.", ActionClose, event.Time) slog.Infof("auto close %s because alert is silenced and marked auto forget", ak) return } state.NeedAck = true switch event.Status { case StCritical, StUnknown: notify(a.CritNotification) case StWarning: notify(a.WarnNotification) } } clearOld := func() { state.NeedAck = false delete(s.Notifications, ak) } // lock while we change notifications. s.Lock("RunHistory") // last could be StNone if it is new. Set it to normal if so because StNormal > // StNone. If the state is not open (closed), then the last state we care about // isn't the last abnormal state, it's just normal. if last < StNormal || !wasOpen { last = StNormal } if event.Status > last { clearOld() notifyCurrent() } else if event.Status < last { if _, hasOld := s.Notifications[ak]; hasOld { notifyCurrent() } // Auto close silenced alerts. if _, ok := silenced[ak]; ok && event.Status == StNormal { go func(ak expr.AlertKey) { slog.Infof("auto close %s because was silenced", ak) err := s.Action("bosun", "Auto close because was silenced.", ActionClose, ak) if err != nil { slog.Errorln(err) } }(ak) } } s.Unlock() return checkNotify }