Exemple #1
0
func (s *Schedule) notify(st *models.IncidentState, n *conf.Notification) {
	if len(st.EmailSubject) == 0 {
		st.EmailSubject = []byte(st.Subject)
	}
	if len(st.EmailBody) == 0 {
		st.EmailBody = []byte(st.Body)
	}
	n.Notify(st.Subject, st.Body, st.EmailSubject, st.EmailBody, s.SystemConf, string(st.AlertKey), st.Attachments...)
}
Exemple #2
0
func Status(t miniprofiler.Timer, w http.ResponseWriter, r *http.Request) (interface{}, error) {
	r.ParseForm()
	type ExtStatus struct {
		AlertName string
		*models.IncidentState
	}
	m := make(map[string]ExtStatus)
	for _, k := range r.Form["ak"] {
		ak, err := models.ParseAlertKey(k)
		if err != nil {
			return nil, err
		}
		var state *models.IncidentState
		if r.FormValue("all") != "" {
			allInc, err := schedule.DataAccess.State().GetAllIncidents(ak)
			if err != nil {
				return nil, err
			}
			if len(allInc) == 0 {
				return nil, fmt.Errorf("No incidents for alert key")
			}
			state = allInc[0]
			allEvents := models.EventsByTime{}
			for _, inc := range allInc {
				for _, e := range inc.Events {
					allEvents = append(allEvents, e)
				}
			}
			sort.Sort(allEvents)
			state.Events = allEvents
		} else {
			state, err = schedule.DataAccess.State().GetLatestIncident(ak)
			if err != nil {
				return nil, err
			}
		}
		st := ExtStatus{IncidentState: state}
		if st.IncidentState == nil {
			return nil, fmt.Errorf("unknown alert key: %v", k)
		}
		st.AlertName = ak.Name()
		m[k] = st
	}
	return m, nil
}
Exemple #3
0
func (s *Schedule) action(user, message string, t models.ActionType, st *models.IncidentState) (ak models.AlertKey, e error) {
	if err := collect.Add("actions", opentsdb.TagSet{"user": user, "alert": st.AlertKey.Name(), "type": t.String()}, 1); err != nil {
		slog.Errorln(err)
	}
	defer func() {
		if e == nil {
			if err := collect.Add("actions", opentsdb.TagSet{"user": user, "alert": st.AlertKey.Name(), "type": t.String()}, 1); err != nil {
				slog.Errorln(err)
			}
			if err := s.DataAccess.Notifications().ClearNotifications(st.AlertKey); err != nil {
				e = err
			}
		}
	}()
	isUnknown := st.LastAbnormalStatus == models.StUnknown
	timestamp := utcNow()
	switch t {
	case models.ActionAcknowledge:
		if !st.NeedAck {
			return "", fmt.Errorf("alert already acknowledged")
		}
		if !st.Open {
			return "", fmt.Errorf("cannot acknowledge closed alert")
		}
		st.NeedAck = false
	case models.ActionClose:
		if st.IsActive() {
			return "", fmt.Errorf("cannot close active alert")
		}
		fallthrough
	case models.ActionForceClose:
		st.Open = false
		st.End = &timestamp
	case models.ActionForget:
		if !isUnknown {
			return "", fmt.Errorf("can only forget unknowns")
		}
		fallthrough
	case models.ActionPurge:
		return st.AlertKey, s.DataAccess.State().Forget(st.AlertKey)
	case models.ActionNote:
		// pass
	default:
		return "", fmt.Errorf("unknown action type: %v", t)
	}
	st.Actions = append(st.Actions, models.Action{
		Message: message,
		Time:    timestamp,
		Type:    t,
		User:    user,
	})
	_, err := s.DataAccess.State().UpdateIncidentState(st)
	return st.AlertKey, err
}
Exemple #4
0
func (s *Schedule) executeTemplates(state *models.IncidentState, event *models.Event, a *conf.Alert, r *RunHistory) {
	if event.Status != models.StUnknown {
		var errs []error
		metric := "template.render"
		//Render subject
		endTiming := collect.StartTimer(metric, opentsdb.TagSet{"alert": a.Name, "type": "subject"})
		subject, err := s.ExecuteSubject(r, a, state, false)
		if err != nil {
			slog.Infof("%s: %v", state.AlertKey, err)
			errs = append(errs, err)
		} else if subject == nil {
			err = fmt.Errorf("Empty subject on %s", state.AlertKey)
			slog.Error(err)
			errs = append(errs, err)
		}
		endTiming()

		//Render body
		endTiming = collect.StartTimer(metric, opentsdb.TagSet{"alert": a.Name, "type": "body"})
		body, _, err := s.ExecuteBody(r, a, state, false)
		if err != nil {
			slog.Infof("%s: %v", state.AlertKey, err)
			errs = append(errs, err)
		} else if subject == nil {
			err = fmt.Errorf("Empty body on %s", state.AlertKey)
			slog.Error(err)
			errs = append(errs, err)
		}
		endTiming()

		//Render email body
		endTiming = collect.StartTimer(metric, opentsdb.TagSet{"alert": a.Name, "type": "emailbody"})
		emailbody, attachments, err := s.ExecuteBody(r, a, state, true)
		if err != nil {
			slog.Infof("%s: %v", state.AlertKey, err)
			errs = append(errs, err)
		} else if subject == nil {
			err = fmt.Errorf("Empty email body on %s", state.AlertKey)
			slog.Error(err)
			errs = append(errs, err)
		}
		endTiming()

		//Render email subject
		endTiming = collect.StartTimer(metric, opentsdb.TagSet{"alert": a.Name, "type": "emailsubject"})
		emailsubject, err := s.ExecuteSubject(r, a, state, true)
		if err != nil {
			slog.Infof("%s: %v", state.AlertKey, err)
			errs = append(errs, err)
		} else if subject == nil {
			err = fmt.Errorf("Empty email subject on %s", state.AlertKey)
			slog.Error(err)
			errs = append(errs, err)
		}
		endTiming()

		if errs != nil {
			endTiming = collect.StartTimer(metric, opentsdb.TagSet{"alert": a.Name, "type": "bad"})
			subject, body, err = s.ExecuteBadTemplate(errs, r, a, state)
			endTiming()

			if err != nil {
				subject = []byte(fmt.Sprintf("unable to create template error notification: %v", err))
			}
			emailbody = body
			attachments = nil
		}
		state.Subject = string(subject)
		state.Body = string(body)
		//don't save email seperately if they are identical
		if string(state.EmailBody) != state.Body {
			state.EmailBody = emailbody
		}
		if string(state.EmailSubject) != state.Subject {
			state.EmailSubject = emailsubject
		}
		state.Attachments = attachments
	}
}
Exemple #5
0
// RunHistory for a single alert key. Returns true if notifications were altered.
func (s *Schedule) runHistory(r *RunHistory, ak models.AlertKey, event *models.Event, silenced SilenceTester) (checkNotify bool, err error) {
	event.Time = r.Start
	a := s.Conf.Alerts[ak.Name()]
	if a.UnknownsNormal && event.Status == models.StUnknown {
		event.Status = models.StNormal
	}

	data := s.DataAccess.State()
	err = data.TouchAlertKey(ak, utcNow())
	if err != nil {
		return
	}

	si := silenced(ak)

	// get existing open incident if exists
	var incident *models.IncidentState
	incident, err = data.GetOpenIncident(ak)
	if err != nil {
		return
	}
	defer func() {
		// save unless incident is new and closed (log alert)
		if incident != nil && (incident.Id != 0 || incident.Open) {
			_, err = data.UpdateIncidentState(incident)
		} else {
			err = data.SetUnevaluated(ak, event.Unevaluated) // if nothing to save, at least store the unevaluated state
		}
	}()
	// If nothing is out of the ordinary we are done
	if event.Status <= models.StNormal && incident == nil {
		return
	}

	// if event is unevaluated, we are done also.
	if incident != nil {
		incident.Unevaluated = event.Unevaluated
	}
	if event.Unevaluated {
		return
	}

	shouldNotify := false
	newIncident := false
	if incident == nil {
		incident = NewIncident(ak)
		newIncident = true
		shouldNotify = true
	}

	// VICTOROPS INTEGRATION: Enables notification of incidents which have returned to normal (Sends normNotification defined in config)
	if event.Status <= models.StNormal && (incident.CurrentStatus == models.StWarning || incident.CurrentStatus == models.StCritical) {
		slog.Infof("TRIGGER_RESOLVED: from %s to %s", incident.CurrentStatus, event.Status)
		shouldNotify = true
	}

	// VICTOROPS INTEGRATION:  Enables notification of Incidents which have returned to normal but are now back to warning or critical. i.e. enable Flapping
	if incident.CurrentStatus == models.StNormal && (event.Status == models.StCritical || event.Status == models.StWarning) {
		slog.Infof("TRIGGER_REALERT: from %s to %s", incident.CurrentStatus, event.Status)
		shouldNotify = true
	}

	// set state.Result according to event result
	if event.Status == models.StCritical {
		incident.Result = event.Crit
	} else if event.Status == models.StWarning {
		incident.Result = event.Warn
	}

	if event.Status > models.StNormal {
		incident.LastAbnormalStatus = event.Status
		incident.LastAbnormalTime = event.Time.UTC().Unix()
	}
	if event.Status > incident.WorstStatus {
		incident.WorstStatus = event.Status
		shouldNotify = true
	}
	if event.Status != incident.CurrentStatus {
		incident.Events = append(incident.Events, *event)
	}
	incident.CurrentStatus = event.Status

	//run a preliminary save on new incidents to get an id
	if newIncident {
		if a.Log || silencedOrIgnored(a, event, si) {
			//a log or silenced/ignored alert will not need to be saved
		} else {
			incident.Id, err = s.DataAccess.State().UpdateIncidentState(incident)
			if err != nil {
				return
			}
		}
	}

	//render templates and open alert key if abnormal
	if event.Status > models.StNormal {
		s.executeTemplates(incident, event, a, r)
		incident.Open = true
		if a.Log {
			incident.Open = false
		}
	}

	// On state increase, clear old notifications and notify current.
	// Do nothing if state did not change.
	notify := func(ns *conf.Notifications) {
		if a.Log {
			lastLogTime := s.lastLogTimes[ak]
			now := utcNow()
			if now.Before(lastLogTime.Add(a.MaxLogFrequency)) {
				return
			}
			s.lastLogTimes[ak] = now
		}
		nots := ns.Get(s.Conf, incident.AlertKey.Group())
		for _, n := range nots {
			s.Notify(incident, n)
			checkNotify = true
		}
	}

	notifyCurrent := func() {
		//Auto close ignoreUnknowns for new incident.
		if silencedOrIgnored(a, event, si) {
			incident.Open = false
			return
		}
		// VICTOROPS INTEGRATION
		incident.NeedAck = false
		switch event.Status {
		case models.StCritical, models.StUnknown:
			notify(a.CritNotification)
		case models.StWarning:
			notify(a.WarnNotification)
		case models.StNormal:
			// VICTOROPS INTEGRATION
			incident.NeedAck = false
			notify(a.NormNotification)
		}
	}

	// lock while we change notifications.
	s.Lock("RunHistory")
	if shouldNotify {
		incident.NeedAck = false
		if err = s.DataAccess.Notifications().ClearNotifications(ak); err != nil {
			return
		}
		notifyCurrent()
	}

	// finally close an open alert with silence once it goes back to normal.
	if si := silenced(ak); si != nil && event.Status == models.StNormal {
		go func(ak models.AlertKey) {
			slog.Infof("auto close %s because was silenced", ak)
			err := s.Action("bosun", "Auto close because was silenced.", models.ActionClose, ak)
			if err != nil {
				slog.Errorln(err)
			}
		}(ak)
	}
	s.Unlock()
	return checkNotify, nil
}
Exemple #6
0
func procRule(t miniprofiler.Timer, c *conf.Conf, a *conf.Alert, now time.Time, summary bool, email string, template_group string) (*ruleResult, error) {
	s := &sched.Schedule{}
	s.DataAccess = schedule.DataAccess
	s.Search = schedule.Search
	if err := s.Init(c); err != nil {
		return nil, err
	}
	rh := s.NewRunHistory(now, cacheObj)
	if _, err := s.CheckExpr(t, rh, a, a.Warn, models.StWarning, nil); err != nil {
		return nil, err
	}
	if _, err := s.CheckExpr(t, rh, a, a.Crit, models.StCritical, nil); err != nil {
		return nil, err
	}
	keys := make(models.AlertKeys, len(rh.Events))
	criticals, warnings, normals := make([]models.AlertKey, 0), make([]models.AlertKey, 0), make([]models.AlertKey, 0)
	i := 0
	for k, v := range rh.Events {
		v.Time = now
		keys[i] = k
		i++
		switch v.Status {
		case models.StNormal:
			normals = append(normals, k)
		case models.StWarning:
			warnings = append(warnings, k)
		case models.StCritical:
			criticals = append(criticals, k)
		default:
			return nil, fmt.Errorf("unknown state type %v", v.Status)
		}
	}
	sort.Sort(keys)
	var subject, body []byte
	var data interface{}
	warning := make([]string, 0)

	if !summary && len(keys) > 0 {
		var primaryIncident *models.IncidentState
		if template_group != "" {
			ts, err := opentsdb.ParseTags(template_group)
			if err != nil {
				return nil, err
			}
			for _, ak := range keys {
				if ak.Group().Subset(ts) {
					primaryIncident = sched.NewIncident(ak)
					primaryIncident.Events = []models.Event{*rh.Events[ak]}
					break
				}
			}
		}
		if primaryIncident == nil {
			primaryIncident = sched.NewIncident(keys[0])
			primaryIncident.Events = []models.Event{*rh.Events[keys[0]]}
			if template_group != "" {
				warning = append(warning, fmt.Sprintf("template group %s was not a subset of any result", template_group))
			}
		}
		if e := primaryIncident.Events[0]; e.Crit != nil {
			primaryIncident.Result = e.Crit
		} else if e.Warn != nil {
			primaryIncident.Result = e.Warn
		}
		var b_err, s_err error
		func() {
			defer func() {
				if err := recover(); err != nil {
					s := fmt.Sprint(err)
					warning = append(warning, s)
					b_err = fmt.Errorf(s)
				}
			}()
			if body, _, b_err = s.ExecuteBody(rh, a, primaryIncident, false); b_err != nil {
				warning = append(warning, b_err.Error())
			}
		}()
		func() {
			defer func() {
				if err := recover(); err != nil {
					s := fmt.Sprint(err)
					warning = append(warning, s)
					s_err = fmt.Errorf(s)
				}
			}()
			subject, s_err = s.ExecuteSubject(rh, a, primaryIncident, false)
			if s_err != nil {
				warning = append(warning, s_err.Error())
			}
		}()
		if s_err != nil || b_err != nil {
			var err error
			subject, body, err = s.ExecuteBadTemplate([]error{s_err, b_err}, rh, a, primaryIncident)
			if err != nil {
				subject = []byte(fmt.Sprintf("unable to create tempalate error notification: %v", err))
			}
		} else if email != "" {
			m, err := mail.ParseAddress(email)
			if err != nil {
				return nil, err
			}
			n := conf.Notification{
				Email: []*mail.Address{m},
			}
			email, attachments, b_err := s.ExecuteBody(rh, a, primaryIncident, true)
			email_subject, s_err := s.ExecuteSubject(rh, a, primaryIncident, true)
			if b_err != nil {
				warning = append(warning, b_err.Error())
			} else if s_err != nil {
				warning = append(warning, s_err.Error())
			} else {
				n.DoEmail(email_subject, email, schedule.Conf, string(primaryIncident.AlertKey), attachments...)
			}
		}
		data = s.Data(rh, primaryIncident, a, false)
	}
	return &ruleResult{
		criticals,
		warnings,
		normals,
		now,
		string(body),
		string(subject),
		data,
		rh.Events,
		warning,
	}, nil
}
Exemple #7
0
func (d *dataAccess) save(s *models.IncidentState, isImport bool) error {
	defer collect.StartTimer("redis", opentsdb.TagSet{"op": "UpdateIncident"})()
	conn := d.GetConnection()
	defer conn.Close()

	isNew := false
	//if id is still zero, assign new id.
	if s.Id == 0 {
		id, err := redis.Int64(conn.Do("INCR", "maxIncidentId"))
		if err != nil {
			return slog.Wrap(err)
		}
		s.Id = id
		isNew = true
	} else if isImport {
		max, err := redis.Int64(conn.Do("GET", "maxIncidentId"))
		if err != nil {
			max = 0
		}
		if max < s.Id {
			if _, err = conn.Do("SET", "maxIncidentId", s.Id); err != nil {
				return slog.Wrap(err)
			}
		}
		isNew = true
	}
	return d.transact(conn, func() error {
		if isNew {
			// add to list for alert key
			if _, err := conn.Do("LPUSH", incidentsForAlertKeyKey(s.AlertKey), s.Id); err != nil {
				return slog.Wrap(err)
			}
			dat := fmt.Sprintf("%d:%d:%s", s.Id, s.Start.UTC().Unix(), s.AlertKey)
			if _, err := conn.Do("LPUSH", "allIncidents", dat); err != nil {
				return slog.Wrap(err)
			}
		}

		// store the incident json
		data, err := json.Marshal(s)
		if err != nil {
			return slog.Wrap(err)
		}
		_, err = conn.Do("SET", incidentStateKey(s.Id), data)

		addRem := func(b bool) string {
			if b {
				return "SADD"
			}
			return "SREM"
		}
		// appropriately add or remove it from the "open" set
		if s.Open {
			if _, err = conn.Do("HSET", statesOpenIncidentsKey, s.AlertKey, s.Id); err != nil {
				return slog.Wrap(err)
			}
		} else {
			if _, err = conn.Do("HDEL", statesOpenIncidentsKey, s.AlertKey); err != nil {
				return slog.Wrap(err)
			}
		}

		//appropriately add or remove from unknown and uneval sets
		if _, err = conn.Do(addRem(s.CurrentStatus == models.StUnknown), statesUnknownKey(s.Alert), s.AlertKey); err != nil {
			return slog.Wrap(err)
		}
		if _, err = conn.Do(addRem(s.Unevaluated), statesUnevalKey(s.Alert), s.AlertKey); err != nil {
			return slog.Wrap(err)
		}
		return nil
	})
}