示例#1
0
文件: web.go 项目: nicollet/bosun
func Status(t miniprofiler.Timer, w http.ResponseWriter, r *http.Request) (interface{}, error) {
	r.ParseForm()
	type ExtStatus struct {
		AlertName string
		*models.IncidentState
	}
	m := make(map[string]ExtStatus)
	for _, k := range r.Form["ak"] {
		ak, err := models.ParseAlertKey(k)
		if err != nil {
			return nil, err
		}
		var state *models.IncidentState
		if r.FormValue("all") != "" {
			allInc, err := schedule.DataAccess.State().GetAllIncidents(ak)
			if err != nil {
				return nil, err
			}
			if len(allInc) == 0 {
				return nil, fmt.Errorf("No incidents for alert key")
			}
			state = allInc[0]
			allEvents := models.EventsByTime{}
			for _, inc := range allInc {
				for _, e := range inc.Events {
					allEvents = append(allEvents, e)
				}
			}
			sort.Sort(allEvents)
			state.Events = allEvents
		} else {
			state, err = schedule.DataAccess.State().GetLatestIncident(ak)
			if err != nil {
				return nil, err
			}
		}
		st := ExtStatus{IncidentState: state}
		if st.IncidentState == nil {
			return nil, fmt.Errorf("unknown alert key: %v", k)
		}
		st.AlertName = ak.Name()
		m[k] = st
	}
	return m, nil
}
示例#2
0
文件: check.go 项目: Skyscanner/bosun
// RunHistory for a single alert key. Returns true if notifications were altered.
func (s *Schedule) runHistory(r *RunHistory, ak models.AlertKey, event *models.Event, silenced SilenceTester) (checkNotify bool, err error) {
	event.Time = r.Start
	a := s.Conf.Alerts[ak.Name()]
	if a.UnknownsNormal && event.Status == models.StUnknown {
		event.Status = models.StNormal
	}

	data := s.DataAccess.State()
	err = data.TouchAlertKey(ak, utcNow())
	if err != nil {
		return
	}

	si := silenced(ak)

	// get existing open incident if exists
	var incident *models.IncidentState
	incident, err = data.GetOpenIncident(ak)
	if err != nil {
		return
	}
	defer func() {
		// save unless incident is new and closed (log alert)
		if incident != nil && (incident.Id != 0 || incident.Open) {
			_, err = data.UpdateIncidentState(incident)
		} else {
			err = data.SetUnevaluated(ak, event.Unevaluated) // if nothing to save, at least store the unevaluated state
		}
	}()
	// If nothing is out of the ordinary we are done
	if event.Status <= models.StNormal && incident == nil {
		return
	}

	// if event is unevaluated, we are done also.
	if incident != nil {
		incident.Unevaluated = event.Unevaluated
	}
	if event.Unevaluated {
		return
	}

	shouldNotify := false
	newIncident := false
	if incident == nil {
		incident = NewIncident(ak)
		newIncident = true
		shouldNotify = true
	}

	// VICTOROPS INTEGRATION: Enables notification of incidents which have returned to normal (Sends normNotification defined in config)
	if event.Status <= models.StNormal && (incident.CurrentStatus == models.StWarning || incident.CurrentStatus == models.StCritical) {
		slog.Infof("TRIGGER_RESOLVED: from %s to %s", incident.CurrentStatus, event.Status)
		shouldNotify = true
	}

	// VICTOROPS INTEGRATION:  Enables notification of Incidents which have returned to normal but are now back to warning or critical. i.e. enable Flapping
	if incident.CurrentStatus == models.StNormal && (event.Status == models.StCritical || event.Status == models.StWarning) {
		slog.Infof("TRIGGER_REALERT: from %s to %s", incident.CurrentStatus, event.Status)
		shouldNotify = true
	}

	// set state.Result according to event result
	if event.Status == models.StCritical {
		incident.Result = event.Crit
	} else if event.Status == models.StWarning {
		incident.Result = event.Warn
	}

	if event.Status > models.StNormal {
		incident.LastAbnormalStatus = event.Status
		incident.LastAbnormalTime = event.Time.UTC().Unix()
	}
	if event.Status > incident.WorstStatus {
		incident.WorstStatus = event.Status
		shouldNotify = true
	}
	if event.Status != incident.CurrentStatus {
		incident.Events = append(incident.Events, *event)
	}
	incident.CurrentStatus = event.Status

	//run a preliminary save on new incidents to get an id
	if newIncident {
		if a.Log || silencedOrIgnored(a, event, si) {
			//a log or silenced/ignored alert will not need to be saved
		} else {
			incident.Id, err = s.DataAccess.State().UpdateIncidentState(incident)
			if err != nil {
				return
			}
		}
	}

	//render templates and open alert key if abnormal
	if event.Status > models.StNormal {
		s.executeTemplates(incident, event, a, r)
		incident.Open = true
		if a.Log {
			incident.Open = false
		}
	}

	// On state increase, clear old notifications and notify current.
	// Do nothing if state did not change.
	notify := func(ns *conf.Notifications) {
		if a.Log {
			lastLogTime := s.lastLogTimes[ak]
			now := utcNow()
			if now.Before(lastLogTime.Add(a.MaxLogFrequency)) {
				return
			}
			s.lastLogTimes[ak] = now
		}
		nots := ns.Get(s.Conf, incident.AlertKey.Group())
		for _, n := range nots {
			s.Notify(incident, n)
			checkNotify = true
		}
	}

	notifyCurrent := func() {
		//Auto close ignoreUnknowns for new incident.
		if silencedOrIgnored(a, event, si) {
			incident.Open = false
			return
		}
		// VICTOROPS INTEGRATION
		incident.NeedAck = false
		switch event.Status {
		case models.StCritical, models.StUnknown:
			notify(a.CritNotification)
		case models.StWarning:
			notify(a.WarnNotification)
		case models.StNormal:
			// VICTOROPS INTEGRATION
			incident.NeedAck = false
			notify(a.NormNotification)
		}
	}

	// lock while we change notifications.
	s.Lock("RunHistory")
	if shouldNotify {
		incident.NeedAck = false
		if err = s.DataAccess.Notifications().ClearNotifications(ak); err != nil {
			return
		}
		notifyCurrent()
	}

	// finally close an open alert with silence once it goes back to normal.
	if si := silenced(ak); si != nil && event.Status == models.StNormal {
		go func(ak models.AlertKey) {
			slog.Infof("auto close %s because was silenced", ak)
			err := s.Action("bosun", "Auto close because was silenced.", models.ActionClose, ak)
			if err != nil {
				slog.Errorln(err)
			}
		}(ak)
	}
	s.Unlock()
	return checkNotify, nil
}
示例#3
0
文件: expr.go 项目: jareksm/bosun
func procRule(t miniprofiler.Timer, c *conf.Conf, a *conf.Alert, now time.Time, summary bool, email string, template_group string) (*ruleResult, error) {
	s := &sched.Schedule{}
	s.DataAccess = schedule.DataAccess
	s.Search = schedule.Search
	if err := s.Init(c); err != nil {
		return nil, err
	}
	rh := s.NewRunHistory(now, cacheObj)
	if _, err := s.CheckExpr(t, rh, a, a.Warn, models.StWarning, nil); err != nil {
		return nil, err
	}
	if _, err := s.CheckExpr(t, rh, a, a.Crit, models.StCritical, nil); err != nil {
		return nil, err
	}
	keys := make(models.AlertKeys, len(rh.Events))
	criticals, warnings, normals := make([]models.AlertKey, 0), make([]models.AlertKey, 0), make([]models.AlertKey, 0)
	i := 0
	for k, v := range rh.Events {
		v.Time = now
		keys[i] = k
		i++
		switch v.Status {
		case models.StNormal:
			normals = append(normals, k)
		case models.StWarning:
			warnings = append(warnings, k)
		case models.StCritical:
			criticals = append(criticals, k)
		default:
			return nil, fmt.Errorf("unknown state type %v", v.Status)
		}
	}
	sort.Sort(keys)
	var subject, body []byte
	var data interface{}
	warning := make([]string, 0)

	if !summary && len(keys) > 0 {
		var primaryIncident *models.IncidentState
		if template_group != "" {
			ts, err := opentsdb.ParseTags(template_group)
			if err != nil {
				return nil, err
			}
			for _, ak := range keys {
				if ak.Group().Subset(ts) {
					primaryIncident = sched.NewIncident(ak)
					primaryIncident.Events = []models.Event{*rh.Events[ak]}
					break
				}
			}
		}
		if primaryIncident == nil {
			primaryIncident = sched.NewIncident(keys[0])
			primaryIncident.Events = []models.Event{*rh.Events[keys[0]]}
			if template_group != "" {
				warning = append(warning, fmt.Sprintf("template group %s was not a subset of any result", template_group))
			}
		}
		if e := primaryIncident.Events[0]; e.Crit != nil {
			primaryIncident.Result = e.Crit
		} else if e.Warn != nil {
			primaryIncident.Result = e.Warn
		}
		var b_err, s_err error
		func() {
			defer func() {
				if err := recover(); err != nil {
					s := fmt.Sprint(err)
					warning = append(warning, s)
					b_err = fmt.Errorf(s)
				}
			}()
			if body, _, b_err = s.ExecuteBody(rh, a, primaryIncident, false); b_err != nil {
				warning = append(warning, b_err.Error())
			}
		}()
		func() {
			defer func() {
				if err := recover(); err != nil {
					s := fmt.Sprint(err)
					warning = append(warning, s)
					s_err = fmt.Errorf(s)
				}
			}()
			subject, s_err = s.ExecuteSubject(rh, a, primaryIncident, false)
			if s_err != nil {
				warning = append(warning, s_err.Error())
			}
		}()
		if s_err != nil || b_err != nil {
			var err error
			subject, body, err = s.ExecuteBadTemplate([]error{s_err, b_err}, rh, a, primaryIncident)
			if err != nil {
				subject = []byte(fmt.Sprintf("unable to create tempalate error notification: %v", err))
			}
		} else if email != "" {
			m, err := mail.ParseAddress(email)
			if err != nil {
				return nil, err
			}
			n := conf.Notification{
				Email: []*mail.Address{m},
			}
			email, attachments, b_err := s.ExecuteBody(rh, a, primaryIncident, true)
			email_subject, s_err := s.ExecuteSubject(rh, a, primaryIncident, true)
			if b_err != nil {
				warning = append(warning, b_err.Error())
			} else if s_err != nil {
				warning = append(warning, s_err.Error())
			} else {
				n.DoEmail(email_subject, email, schedule.Conf, string(primaryIncident.AlertKey), attachments...)
			}
		}
		data = s.Data(rh, primaryIncident, a, false)
	}
	return &ruleResult{
		criticals,
		warnings,
		normals,
		now,
		string(body),
		string(subject),
		data,
		rh.Events,
		warning,
	}, nil
}