Exemple #1
0
func (d *dataAccess) TouchAlertKey(ak models.AlertKey, t time.Time) error {
	conn := d.Get()
	defer conn.Close()

	_, err := conn.Do("ZADD", statesLastTouchedKey(ak.Name()), t.UTC().Unix(), string(ak))
	return slog.Wrap(err)
}
Exemple #2
0
func NewStatus(ak models.AlertKey) *State {
	g := ak.Group()
	return &State{
		Alert: ak.Name(),
		Tags:  g.Tags(),
		Group: g,
	}
}
Exemple #3
0
func (d *dataAccess) TouchAlertKey(ak models.AlertKey, t time.Time) error {
	defer collect.StartTimer("redis", opentsdb.TagSet{"op": "TouchAlertKey"})()
	conn := d.GetConnection()
	defer conn.Close()

	_, err := conn.Do("ZADD", statesLastTouchedKey(ak.Name()), t.UTC().Unix(), string(ak))
	return slog.Wrap(err)
}
Exemple #4
0
func NewIncident(ak models.AlertKey) *models.IncidentState {
	s := &models.IncidentState{}
	s.Start = utcNow()
	s.AlertKey = ak
	s.Alert = ak.Name()
	s.Tags = ak.Group().Tags()
	s.Result = &models.Result{}
	return s
}
Exemple #5
0
func (s *Schedule) Action(user, message string, t models.ActionType, ak models.AlertKey) error {
	if err := collect.Add("actions", opentsdb.TagSet{"user": user, "alert": ak.Name(), "type": t.String()}, 1); err != nil {
		slog.Errorln(err)
	}
	st, err := s.DataAccess.State().GetLatestIncident(ak)
	if err != nil {
		return err
	}
	if st == nil {
		return fmt.Errorf("no such alert key: %v", ak)
	}
	isUnknown := st.LastAbnormalStatus == models.StUnknown
	timestamp := utcNow()
	switch t {
	case models.ActionAcknowledge:
		if !st.NeedAck {
			return fmt.Errorf("alert already acknowledged")
		}
		if !st.Open {
			return fmt.Errorf("cannot acknowledge closed alert")
		}
		st.NeedAck = false
		if err := s.DataAccess.Notifications().ClearNotifications(ak); err != nil {
			return err
		}
	case models.ActionClose:
		if st.IsActive() {
			return fmt.Errorf("cannot close active alert")
		}
		fallthrough
	case models.ActionForceClose:
		st.Open = false
		st.End = &timestamp
	case models.ActionForget:
		if !isUnknown {
			return fmt.Errorf("can only forget unknowns")
		}
		fallthrough
	case models.ActionPurge:
		return s.DataAccess.State().Forget(ak)
	default:
		return fmt.Errorf("unknown action type: %v", t)
	}
	// Would like to also track the alert group, but I believe this is impossible because any character
	// that could be used as a delimiter could also be a valid tag key or tag value character
	if err := collect.Add("actions", opentsdb.TagSet{"user": user, "alert": ak.Name(), "type": t.String()}, 1); err != nil {
		slog.Errorln(err)
	}
	st.Actions = append(st.Actions, models.Action{
		Message: message,
		Time:    timestamp,
		Type:    t,
		User:    user,
	})
	_, err = s.DataAccess.State().UpdateIncidentState(st)
	return err
}
Exemple #6
0
func (d *dataAccess) SetUnevaluated(ak models.AlertKey, uneval bool) error {
	conn := d.Get()
	defer conn.Close()

	op := "SREM"
	if uneval {
		op = "SADD"
	}
	_, err := conn.Do(op, statesUnevalKey(ak.Name()), ak)
	return slog.Wrap(err)
}
Exemple #7
0
func (d *dataAccess) SetUnevaluated(ak models.AlertKey, uneval bool) error {
	defer collect.StartTimer("redis", opentsdb.TagSet{"op": "SetUnevaluated"})()
	conn := d.GetConnection()
	defer conn.Close()

	op := "SREM"
	if uneval {
		op = "SADD"
	}
	_, err := conn.Do(op, statesUnevalKey(ak.Name()), ak)
	return slog.Wrap(err)
}
Exemple #8
0
// The nucular option. Delete all we know about this alert key
func (d *dataAccess) Forget(ak models.AlertKey) error {
	defer collect.StartTimer("redis", opentsdb.TagSet{"op": "Forget"})()
	conn := d.GetConnection()
	defer conn.Close()

	alert := ak.Name()
	return d.transact(conn, func() error {
		// last touched.
		if _, err := conn.Do("HDEL", statesLastTouchedKey(alert), ak); err != nil {
			return slog.Wrap(err)
		}
		// unknown/uneval sets
		if _, err := conn.Do("SREM", statesUnknownKey(alert), ak); err != nil {
			return slog.Wrap(err)
		}
		if _, err := conn.Do("SREM", statesUnevalKey(alert), ak); err != nil {
			return slog.Wrap(err)
		}
		//open set
		if _, err := conn.Do("HDEL", statesOpenIncidentsKey, ak); err != nil {
			return slog.Wrap(err)
		}
		//all incidents
		ids, err := int64s(conn.Do("LRANGE", incidentsForAlertKeyKey(ak), 0, -1))
		if err != nil {
			return slog.Wrap(err)
		}
		if _, err = conn.Do("HDEL", statesOpenIncidentsKey, ak); err != nil {
			return slog.Wrap(err)
		}
		for _, id := range ids {

			if _, err = conn.Do("DEL", incidentStateKey(id)); err != nil {
				return slog.Wrap(err)
			}
		}
		if _, err := conn.Do(d.LCLEAR(), incidentsForAlertKeyKey(ak)); err != nil {
			return slog.Wrap(err)
		}
		return nil
	})
}
Exemple #9
0
// RunHistory for a single alert key. Returns true if notifications were altered.
func (s *Schedule) runHistory(r *RunHistory, ak models.AlertKey, event *models.Event, silenced SilenceTester) (checkNotify bool, err error) {
	event.Time = r.Start
	a := s.Conf.Alerts[ak.Name()]
	if a.UnknownsNormal && event.Status == models.StUnknown {
		event.Status = models.StNormal
	}

	data := s.DataAccess.State()
	err = data.TouchAlertKey(ak, utcNow())
	if err != nil {
		return
	}

	si := silenced(ak)

	// get existing open incident if exists
	var incident *models.IncidentState
	incident, err = data.GetOpenIncident(ak)
	if err != nil {
		return
	}
	defer func() {
		// save unless incident is new and closed (log alert)
		if incident != nil && (incident.Id != 0 || incident.Open) {
			_, err = data.UpdateIncidentState(incident)
		} else {
			err = data.SetUnevaluated(ak, event.Unevaluated) // if nothing to save, at least store the unevaluated state
		}
	}()
	// If nothing is out of the ordinary we are done
	if event.Status <= models.StNormal && incident == nil {
		return
	}

	// if event is unevaluated, we are done also.
	if incident != nil {
		incident.Unevaluated = event.Unevaluated
	}
	if event.Unevaluated {
		return
	}

	shouldNotify := false
	newIncident := false
	if incident == nil {
		incident = NewIncident(ak)
		newIncident = true
		shouldNotify = true
	}

	// VICTOROPS INTEGRATION: Enables notification of incidents which have returned to normal (Sends normNotification defined in config)
	if event.Status <= models.StNormal && (incident.CurrentStatus == models.StWarning || incident.CurrentStatus == models.StCritical) {
		slog.Infof("TRIGGER_RESOLVED: from %s to %s", incident.CurrentStatus, event.Status)
		shouldNotify = true
	}

	// VICTOROPS INTEGRATION:  Enables notification of Incidents which have returned to normal but are now back to warning or critical. i.e. enable Flapping
	if incident.CurrentStatus == models.StNormal && (event.Status == models.StCritical || event.Status == models.StWarning) {
		slog.Infof("TRIGGER_REALERT: from %s to %s", incident.CurrentStatus, event.Status)
		shouldNotify = true
	}

	// set state.Result according to event result
	if event.Status == models.StCritical {
		incident.Result = event.Crit
	} else if event.Status == models.StWarning {
		incident.Result = event.Warn
	}

	if event.Status > models.StNormal {
		incident.LastAbnormalStatus = event.Status
		incident.LastAbnormalTime = event.Time.UTC().Unix()
	}
	if event.Status > incident.WorstStatus {
		incident.WorstStatus = event.Status
		shouldNotify = true
	}
	if event.Status != incident.CurrentStatus {
		incident.Events = append(incident.Events, *event)
	}
	incident.CurrentStatus = event.Status

	//run a preliminary save on new incidents to get an id
	if newIncident {
		if a.Log || silencedOrIgnored(a, event, si) {
			//a log or silenced/ignored alert will not need to be saved
		} else {
			incident.Id, err = s.DataAccess.State().UpdateIncidentState(incident)
			if err != nil {
				return
			}
		}
	}

	//render templates and open alert key if abnormal
	if event.Status > models.StNormal {
		s.executeTemplates(incident, event, a, r)
		incident.Open = true
		if a.Log {
			incident.Open = false
		}
	}

	// On state increase, clear old notifications and notify current.
	// Do nothing if state did not change.
	notify := func(ns *conf.Notifications) {
		if a.Log {
			lastLogTime := s.lastLogTimes[ak]
			now := utcNow()
			if now.Before(lastLogTime.Add(a.MaxLogFrequency)) {
				return
			}
			s.lastLogTimes[ak] = now
		}
		nots := ns.Get(s.Conf, incident.AlertKey.Group())
		for _, n := range nots {
			s.Notify(incident, n)
			checkNotify = true
		}
	}

	notifyCurrent := func() {
		//Auto close ignoreUnknowns for new incident.
		if silencedOrIgnored(a, event, si) {
			incident.Open = false
			return
		}
		// VICTOROPS INTEGRATION
		incident.NeedAck = false
		switch event.Status {
		case models.StCritical, models.StUnknown:
			notify(a.CritNotification)
		case models.StWarning:
			notify(a.WarnNotification)
		case models.StNormal:
			// VICTOROPS INTEGRATION
			incident.NeedAck = false
			notify(a.NormNotification)
		}
	}

	// lock while we change notifications.
	s.Lock("RunHistory")
	if shouldNotify {
		incident.NeedAck = false
		if err = s.DataAccess.Notifications().ClearNotifications(ak); err != nil {
			return
		}
		notifyCurrent()
	}

	// finally close an open alert with silence once it goes back to normal.
	if si := silenced(ak); si != nil && event.Status == models.StNormal {
		go func(ak models.AlertKey) {
			slog.Infof("auto close %s because was silenced", ak)
			err := s.Action("bosun", "Auto close because was silenced.", models.ActionClose, ak)
			if err != nil {
				slog.Errorln(err)
			}
		}(ak)
	}
	s.Unlock()
	return checkNotify, nil
}
Exemple #10
0
func notsByAlertKeyKey(ak models.AlertKey) string {
	return fmt.Sprintf("notsByAlert:%s", ak.Name())
}
Exemple #11
0
// RunHistory for a single alert key. Returns true if notifications were altered.
func (s *Schedule) runHistory(r *RunHistory, ak models.AlertKey, event *Event, silenced map[models.AlertKey]models.Silence) bool {
	checkNotify := false
	// get existing state object for alert key. add to schedule status if doesn't already exist
	state := s.GetStatus(ak)
	if state == nil {
		state = NewStatus(ak)
		s.SetStatus(ak, state)
	}
	defer s.SetStatus(ak, state)
	// make sure we always touch the state.
	state.Touched = r.Start
	// set state.Result according to event result
	if event.Crit != nil {
		state.Result = event.Crit
	} else if event.Warn != nil {
		state.Result = event.Warn
	}
	// if event is unevaluated, we are done.
	state.Unevaluated = event.Unevaluated
	if event.Unevaluated {
		return checkNotify
	}
	// assign incident id to new event if applicable
	prev := state.Last()
	worst := StNormal
	event.Time = r.Start
	if prev.IncidentId != 0 {
		// If last event has incident id and is not closed, we continue it.
		incident, err := s.DataAccess.Incidents().GetIncident(prev.IncidentId)
		if err != nil {
			slog.Error(err)
		} else if incident.End == nil {
			event.IncidentId = prev.IncidentId
			worst = state.WorstThisIncident()
		}
	}
	if event.IncidentId == 0 && event.Status != StNormal {
		incident, err := s.createIncident(ak, event.Time)
		if err != nil {
			slog.Error("Error creating incident", err)
		} else {
			event.IncidentId = incident.Id
		}
	}

	state.Append(event)
	a := s.Conf.Alerts[ak.Name()]
	// render templates and open alert key if abnormal
	if event.Status > StNormal {
		s.executeTemplates(state, event, a, r)
		state.Open = true
		if a.Log {
			worst = StNormal
			state.Open = false
		}
	}
	// On state increase, clear old notifications and notify current.
	// If the old alert was not acknowledged, do nothing.
	// Do nothing if state did not change.
	notify := func(ns *conf.Notifications) {
		if a.Log {
			lastLogTime := state.LastLogTime
			now := time.Now()
			if now.Before(lastLogTime.Add(a.MaxLogFrequency)) {
				return
			}
			state.LastLogTime = now
		}
		nots := ns.Get(s.Conf, state.Group)
		for _, n := range nots {
			s.Notify(state, n)
			checkNotify = true
		}
	}
	notifyCurrent := func() {
		// Auto close ignoreUnknowns.
		if a.IgnoreUnknown && event.Status == StUnknown {
			state.Open = false
			state.Forgotten = true
			state.NeedAck = false
			state.Action("bosun", "Auto close because alert has ignoreUnknown.", ActionClose, event.Time)
			slog.Infof("auto close %s because alert has ignoreUnknown", ak)
			return
		} else if silenced[ak].Forget && event.Status == StUnknown {
			state.Open = false
			state.Forgotten = true
			state.NeedAck = false
			state.Action("bosun", "Auto close because alert is silenced and marked auto forget.", ActionClose, event.Time)
			slog.Infof("auto close %s because alert is silenced and marked auto forget", ak)
			return
		}
		state.NeedAck = true
		switch event.Status {
		case StCritical, StUnknown:
			notify(a.CritNotification)
		case StWarning:
			notify(a.WarnNotification)
		}
	}
	clearOld := func() {
		state.NeedAck = false
		delete(s.Notifications, ak)
	}

	// lock while we change notifications.
	s.Lock("RunHistory")
	if event.Status > worst {
		clearOld()
		notifyCurrent()
	} else if _, ok := silenced[ak]; ok && event.Status == StNormal {
		go func(ak models.AlertKey) {
			slog.Infof("auto close %s because was silenced", ak)
			err := s.Action("bosun", "Auto close because was silenced.", ActionClose, ak)
			if err != nil {
				slog.Errorln(err)
			}
		}(ak)
	}

	s.Unlock()
	return checkNotify
}
Exemple #12
0
// RunHistory for a single alert key. Returns true if notifications were altered.
func (s *Schedule) runHistory(r *RunHistory, ak models.AlertKey, event *models.Event, silenced SilenceTester) (checkNotify bool, err error) {
	event.Time = r.Start
	data := s.DataAccess.State()
	err = data.TouchAlertKey(ak, time.Now())
	if err != nil {
		return
	}
	// get existing open incident if exists
	incident, err := data.GetOpenIncident(ak)
	if err != nil {
		return
	}
	defer func() {
		// save unless incident is new and closed (log alert)
		if incident != nil && (incident.Id != 0 || incident.Open) {
			err = data.UpdateIncidentState(incident)
		} else {
			err = data.SetUnevaluated(ak, event.Unevaluated) // if nothing to save, at least store the unevaluated state
		}
	}()
	// If nothing is out of the ordinary we are done
	if event.Status <= models.StNormal && incident == nil {
		return
	}

	// if event is unevaluated, we are done also.
	if incident != nil {
		incident.Unevaluated = event.Unevaluated
	}
	if event.Unevaluated {
		return
	}

	shouldNotify := false
	if incident == nil {
		incident = NewIncident(ak)
		shouldNotify = true
	}
	// set state.Result according to event result
	if event.Status == models.StCritical {
		incident.Result = event.Crit
	} else if event.Status == models.StWarning {
		incident.Result = event.Warn
	}

	if event.Status > models.StNormal {
		incident.LastAbnormalStatus = event.Status
		incident.LastAbnormalTime = event.Time.UTC().Unix()
	}
	if event.Status > incident.WorstStatus {
		incident.WorstStatus = event.Status
		shouldNotify = true
	}
	if event.Status != incident.CurrentStatus {
		incident.Events = append(incident.Events, *event)
	}
	incident.CurrentStatus = event.Status

	a := s.Conf.Alerts[ak.Name()]
	//render templates and open alert key if abnormal
	if event.Status > models.StNormal {
		s.executeTemplates(incident, event, a, r)
		incident.Open = true
		if a.Log {
			incident.Open = false
		}
	}

	// On state increase, clear old notifications and notify current.
	// Do nothing if state did not change.
	notify := func(ns *conf.Notifications) {
		if a.Log {
			lastLogTime := s.lastLogTimes[ak]
			now := time.Now()
			if now.Before(lastLogTime.Add(a.MaxLogFrequency)) {
				return
			}
			s.lastLogTimes[ak] = now
		}
		nots := ns.Get(s.Conf, incident.AlertKey.Group())
		for _, n := range nots {
			s.Notify(incident, n)
			checkNotify = true
		}
	}

	notifyCurrent := func() {
		si := silenced(ak)
		//Auto close ignoreUnknowns for new incident.
		if a.IgnoreUnknown && event.Status == models.StUnknown {
			incident.Open = false
			return
		} else if si != nil && si.Forget && event.Status == models.StUnknown {
			incident.Open = false
			return
		}
		incident.NeedAck = true
		switch event.Status {
		case models.StCritical, models.StUnknown:
			notify(a.CritNotification)
		case models.StWarning:
			notify(a.WarnNotification)
		}
	}
	clearOld := func() {
		incident.NeedAck = false
		delete(s.Notifications, ak)
	}

	// lock while we change notifications.
	s.Lock("RunHistory")
	if shouldNotify {
		clearOld()
		notifyCurrent()
	}

	// finally close an open alert with silence once it goes back to normal.
	if si := silenced(ak); si != nil && event.Status == models.StNormal {
		go func(ak models.AlertKey) {
			slog.Infof("auto close %s because was silenced", ak)
			err := s.Action("bosun", "Auto close because was silenced.", models.ActionClose, ak)
			if err != nil {
				slog.Errorln(err)
			}
		}(ak)
	}
	s.Unlock()
	return checkNotify, nil
}
Exemple #13
0
func (s *Schedule) Action(user, message string, t ActionType, ak models.AlertKey) error {
	s.Lock("Action")
	defer s.Unlock()
	st := s.status[ak]
	if st == nil {
		return fmt.Errorf("no such alert key: %v", ak)
	}
	ack := func() {
		delete(s.Notifications, ak)
		st.NeedAck = false
	}
	isUnknown := st.AbnormalStatus() == StUnknown
	timestamp := time.Now().UTC()
	switch t {
	case ActionAcknowledge:
		if !st.NeedAck {
			return fmt.Errorf("alert already acknowledged")
		}
		if !st.Open {
			return fmt.Errorf("cannot acknowledge closed alert")
		}
		ack()
	case ActionClose:
		if st.NeedAck {
			ack()
		}
		if st.IsActive() {
			return fmt.Errorf("cannot close active alert")
		}
		st.Open = false
		last := st.Last()
		if last.IncidentId != 0 {
			incident, err := s.DataAccess.Incidents().GetIncident(last.IncidentId)
			if err != nil {
				return err
			}
			incident.End = &timestamp
			if err = s.DataAccess.Incidents().UpdateIncident(last.IncidentId, incident); err != nil {
				return err
			}

		}
	case ActionForget:
		if !isUnknown {
			return fmt.Errorf("can only forget unknowns")
		}
		if st.NeedAck {
			ack()
		}
		st.Open = false
		st.Forgotten = true
		delete(s.status, ak)
	default:
		return fmt.Errorf("unknown action type: %v", t)
	}
	st.Action(user, message, t, timestamp)
	// Would like to also track the alert group, but I believe this is impossible because any character
	// that could be used as a delimiter could also be a valid tag key or tag value character
	if err := collect.Add("actions", opentsdb.TagSet{"user": user, "alert": ak.Name(), "type": t.String()}, 1); err != nil {
		slog.Errorln(err)
	}
	return nil
}