Пример #1
0
func (s *Schedule) Action(user, message string, t ActionType, ak expr.AlertKey) error {
	s.Lock("Action")
	defer s.Unlock()
	st := s.status[ak]
	if st == nil {
		return fmt.Errorf("no such alert key: %v", ak)
	}
	ack := func() {
		delete(s.Notifications, ak)
		st.NeedAck = false
	}
	isUnknown := st.AbnormalStatus() == StUnknown
	isError := st.AbnormalStatus() == StError
	timestamp := time.Now().UTC()
	switch t {
	case ActionAcknowledge:
		if !st.NeedAck {
			return fmt.Errorf("alert already acknowledged")
		}
		if !st.Open {
			return fmt.Errorf("cannot acknowledge closed alert")
		}
		ack()
	case ActionClose:
		if st.NeedAck {
			ack()
		}
		if st.IsActive() && !isError {
			return fmt.Errorf("cannot close active alert")
		}
		st.Open = false
		last := st.Last()
		if last.IncidentId != 0 {
			s.incidentLock.Lock()
			if incident, ok := s.Incidents[last.IncidentId]; ok {
				incident.End = &timestamp
			}
			s.incidentLock.Unlock()
		}
	case ActionForget:
		if !isUnknown {
			return fmt.Errorf("can only forget unknowns")
		}
		if st.NeedAck {
			ack()
		}
		st.Open = false
		st.Forgotten = true
		delete(s.status, ak)
	default:
		return fmt.Errorf("unknown action type: %v", t)
	}
	st.Action(user, message, t, timestamp)
	// Would like to also track the alert group, but I believe this is impossible because any character
	// that could be used as a delimiter could also be a valid tag key or tag value character
	if err := collect.Add("actions", opentsdb.TagSet{"user": user, "alert": ak.Name(), "type": t.String()}, 1); err != nil {
		log.Println(err)
	}
	return nil
}
Пример #2
0
func NewStatus(ak expr.AlertKey) *State {
	g := ak.Group()
	return &State{
		Alert: ak.Name(),
		Tags:  g.Tags(),
		Group: g,
	}
}
Пример #3
0
// RunHistory for a single alert key. Returns true if notifications were altered.
func (s *Schedule) runHistory(r *RunHistory, ak expr.AlertKey, event *Event, silenced map[expr.AlertKey]Silence) bool {
	checkNotify := false
	// get existing state object for alert key. add to schedule status if doesn't already exist
	state := s.GetStatus(ak)
	if state == nil {
		state = NewStatus(ak)
		s.SetStatus(ak, state)
	}
	defer s.SetStatus(ak, state)
	// make sure we always touch the state.
	state.Touched = r.Start
	// set state.Result according to event result
	if event.Error != nil {
		state.Result = event.Error
	} else if event.Crit != nil {
		state.Result = event.Crit
	} else if event.Warn != nil {
		state.Result = event.Warn
	}
	// if event is unevaluated, we are done.
	state.Unevaluated = event.Unevaluated
	if event.Unevaluated {
		return checkNotify
	}
	// assign incident id to new event if applicable
	prev := state.Last()
	event.Time = r.Start
	if prev.IncidentId != 0 {
		// If last event has incident id and is not closed, we continue it.
		s.incidentLock.Lock()
		if incident, ok := s.Incidents[prev.IncidentId]; ok && incident.End == nil {
			event.IncidentId = prev.IncidentId
		}
		s.incidentLock.Unlock()
	}
	if event.IncidentId == 0 && event.Status != StNormal {
		// Otherwise, create new incident on first non-normal event.
		event.IncidentId = s.createIncident(ak, event.Time).Id
	}
	// add new event to state
	last := state.AbnormalStatus()
	state.Append(event)
	a := s.Conf.Alerts[ak.Name()]
	wasOpen := state.Open
	// render templates and open alert key if abnormal
	if event.Status > StNormal {
		s.executeTemplates(state, event, a, r)
		state.Open = true
		if a.Log {
			state.Open = false
		}
	}
	// On state increase, clear old notifications and notify current.
	// On state decrease, and if the old alert was already acknowledged, notify current.
	// If the old alert was not acknowledged, do nothing.
	// Do nothing if state did not change.
	notify := func(ns *conf.Notifications) {
		if a.Log {
			lastLogTime := state.LastLogTime
			now := time.Now()
			if now.Before(lastLogTime.Add(a.MaxLogFrequency)) {
				return
			}
			state.LastLogTime = now
		}
		nots := ns.Get(s.Conf, state.Group)
		for _, n := range nots {
			s.Notify(state, n)
			checkNotify = true
		}
	}
	notifyCurrent := func() {
		// Auto close ignoreUnknowns.
		if a.IgnoreUnknown && event.Status == StUnknown {
			state.Open = false
			state.Forgotten = true
			state.NeedAck = false
			state.Action("bosun", "Auto close because alert has ignoreUnknown.", ActionClose, event.Time)
			slog.Infof("auto close %s because alert has ignoreUnknown", ak)
			return
		} else if silenced[ak].Forget && event.Status == StUnknown {
			state.Open = false
			state.Forgotten = true
			state.NeedAck = false
			state.Action("bosun", "Auto close because alert is silenced and marked auto forget.", ActionClose, event.Time)
			slog.Infof("auto close %s because alert is silenced and marked auto forget", ak)
			return
		}
		state.NeedAck = true
		switch event.Status {
		case StCritical, StUnknown:
			notify(a.CritNotification)
		case StWarning:
			notify(a.WarnNotification)
		}
	}
	clearOld := func() {
		state.NeedAck = false
		delete(s.Notifications, ak)
	}
	// lock while we change notifications.
	s.Lock("RunHistory")
	// last could be StNone if it is new. Set it to normal if so because StNormal >
	// StNone. If the state is not open (closed), then the last state we care about
	// isn't the last abnormal state, it's just normal.
	if last < StNormal || !wasOpen {
		last = StNormal
	}
	if event.Status > last {
		clearOld()
		notifyCurrent()
	} else if event.Status < last {
		if _, hasOld := s.Notifications[ak]; hasOld {
			notifyCurrent()
		}
		// Auto close silenced alerts.
		if _, ok := silenced[ak]; ok && event.Status == StNormal {
			go func(ak expr.AlertKey) {
				slog.Infof("auto close %s because was silenced", ak)
				err := s.Action("bosun", "Auto close because was silenced.", ActionClose, ak)
				if err != nil {
					slog.Errorln(err)
				}
			}(ak)
		}
	}
	s.Unlock()
	return checkNotify
}