func (s *Schedule) Action(user, message string, t models.ActionType, ak models.AlertKey) error { if err := collect.Add("actions", opentsdb.TagSet{"user": user, "alert": ak.Name(), "type": t.String()}, 1); err != nil { slog.Errorln(err) } st, err := s.DataAccess.State().GetLatestIncident(ak) if err != nil { return err } if st == nil { return fmt.Errorf("no such alert key: %v", ak) } isUnknown := st.LastAbnormalStatus == models.StUnknown timestamp := utcNow() switch t { case models.ActionAcknowledge: if !st.NeedAck { return fmt.Errorf("alert already acknowledged") } if !st.Open { return fmt.Errorf("cannot acknowledge closed alert") } st.NeedAck = false if err := s.DataAccess.Notifications().ClearNotifications(ak); err != nil { return err } case models.ActionClose: if st.IsActive() { return fmt.Errorf("cannot close active alert") } fallthrough case models.ActionForceClose: st.Open = false st.End = ×tamp case models.ActionForget: if !isUnknown { return fmt.Errorf("can only forget unknowns") } fallthrough case models.ActionPurge: return s.DataAccess.State().Forget(ak) default: return fmt.Errorf("unknown action type: %v", t) } // Would like to also track the alert group, but I believe this is impossible because any character // that could be used as a delimiter could also be a valid tag key or tag value character if err := collect.Add("actions", opentsdb.TagSet{"user": user, "alert": ak.Name(), "type": t.String()}, 1); err != nil { slog.Errorln(err) } st.Actions = append(st.Actions, models.Action{ Message: message, Time: timestamp, Type: t, User: user, }) _, err = s.DataAccess.State().UpdateIncidentState(st) return err }
func (s *Schedule) Unlock() { holder := s.mutexHolder start := s.mutexAquired waitTime := s.mutexWaitTime s.mutex.Unlock() collect.Add("schedule.lock_time", opentsdb.TagSet{"caller": holder, "op": "wait"}, waitTime) collect.Add("schedule.lock_time", opentsdb.TagSet{"caller": holder, "op": "hold"}, int64(time.Since(start)/time.Millisecond)) collect.Add("schedule.lock_count", opentsdb.TagSet{"caller": holder}, 1) }
func (s *Schedule) action(user, message string, t models.ActionType, st *models.IncidentState) (ak models.AlertKey, e error) { if err := collect.Add("actions", opentsdb.TagSet{"user": user, "alert": st.AlertKey.Name(), "type": t.String()}, 1); err != nil { slog.Errorln(err) } defer func() { if e == nil { if err := collect.Add("actions", opentsdb.TagSet{"user": user, "alert": st.AlertKey.Name(), "type": t.String()}, 1); err != nil { slog.Errorln(err) } if err := s.DataAccess.Notifications().ClearNotifications(st.AlertKey); err != nil { e = err } } }() isUnknown := st.LastAbnormalStatus == models.StUnknown timestamp := utcNow() switch t { case models.ActionAcknowledge: if !st.NeedAck { return "", fmt.Errorf("alert already acknowledged") } if !st.Open { return "", fmt.Errorf("cannot acknowledge closed alert") } st.NeedAck = false case models.ActionClose: if st.IsActive() { return "", fmt.Errorf("cannot close active alert") } fallthrough case models.ActionForceClose: st.Open = false st.End = ×tamp case models.ActionForget: if !isUnknown { return "", fmt.Errorf("can only forget unknowns") } fallthrough case models.ActionPurge: return st.AlertKey, s.DataAccess.State().Forget(st.AlertKey) case models.ActionNote: // pass default: return "", fmt.Errorf("unknown action type: %v", t) } st.Actions = append(st.Actions, models.Action{ Message: message, Time: timestamp, Type: t, User: user, }) _, err := s.DataAccess.State().UpdateIncidentState(st) return st.AlertKey, err }
func (rp *relayProxy) ServeHTTP(responseWriter http.ResponseWriter, r *http.Request) { clean := func(s string) string { return opentsdb.MustReplace(s, "_") } reader := &passthru{ReadCloser: r.Body} r.Body = reader w := &relayWriter{ResponseWriter: responseWriter} rp.ReverseProxy.ServeHTTP(w, r) indexTSDB(r, reader.buf.Bytes()) tags := opentsdb.TagSet{"path": clean(r.URL.Path), "remote": clean(strings.Split(r.RemoteAddr, ":")[0])} collect.Add("relay.bytes", tags, int64(reader.buf.Len())) tags["status"] = strconv.Itoa(w.code) collect.Add("relay.response", tags, 1) }
func (s *Schedule) Action(user, message string, t ActionType, ak expr.AlertKey) error { s.Lock("Action") defer s.Unlock() st := s.status[ak] if st == nil { return fmt.Errorf("no such alert key: %v", ak) } ack := func() { delete(s.Notifications, ak) st.NeedAck = false } isUnknown := st.AbnormalStatus() == StUnknown isError := st.AbnormalStatus() == StError timestamp := time.Now().UTC() switch t { case ActionAcknowledge: if !st.NeedAck { return fmt.Errorf("alert already acknowledged") } if !st.Open { return fmt.Errorf("cannot acknowledge closed alert") } ack() case ActionClose: if st.NeedAck { ack() } if st.IsActive() && !isError { return fmt.Errorf("cannot close active alert") } st.Open = false last := st.Last() if last.IncidentId != 0 { s.incidentLock.Lock() if incident, ok := s.Incidents[last.IncidentId]; ok { incident.End = ×tamp } s.incidentLock.Unlock() } case ActionForget: if !isUnknown { return fmt.Errorf("can only forget unknowns") } if st.NeedAck { ack() } st.Open = false st.Forgotten = true delete(s.status, ak) default: return fmt.Errorf("unknown action type: %v", t) } st.Action(user, message, t, timestamp) // Would like to also track the alert group, but I believe this is impossible because any character // that could be used as a delimiter could also be a valid tag key or tag value character if err := collect.Add("actions", opentsdb.TagSet{"user": user, "alert": ak.Name(), "type": t.String()}, 1); err != nil { log.Println(err) } return nil }
func (s *Search) Index(mdp opentsdb.MultiDataPoint) { for _, dp := range mdp { s.Lock() mmap := s.last[dp.Metric] if mmap == nil { mmap = make(map[string]*database.LastInfo) s.last[dp.Metric] = mmap } p := mmap[dp.Tags.String()] if p == nil { p = &database.LastInfo{} mmap[dp.Tags.String()] = p } if p.Timestamp < dp.Timestamp { if fv, err := getFloat(dp.Value); err == nil { p.DiffFromPrev = (fv - p.LastVal) / float64(dp.Timestamp-p.Timestamp) p.LastVal = fv } else { slog.Error(err) } p.Timestamp = dp.Timestamp } s.Unlock() select { case s.indexQueue <- dp: default: collect.Add("search.dropped", opentsdb.TagSet{}, 1) } } }
func (s *Search) Index(mdp opentsdb.MultiDataPoint) { for _, dp := range mdp { s.Lock() metric := dp.Metric key := metric + dp.Tags.String() p := s.Last[key] if p == nil { p = &lastInfo{} s.Last[key] = p } if p.timestamp < dp.Timestamp { if fv, err := getFloat(dp.Value); err == nil { p.diffFromPrev = (fv - p.lastVal) / float64(dp.Timestamp-p.timestamp) p.lastVal = fv } else { slog.Error(err) } p.timestamp = dp.Timestamp } s.Unlock() select { case s.indexQueue <- dp: default: collect.Add("search.dropped", opentsdb.TagSet{}, 1) } } }
func (rp *relayProxy) relayPut(responseWriter http.ResponseWriter, r *http.Request, parse bool) { isRelayed := r.Header.Get(relayHeader) != "" reader := &passthru{ReadCloser: r.Body} r.Body = reader w := &relayWriter{ResponseWriter: responseWriter} rp.TSDBProxy.ServeHTTP(w, r) if w.code/100 != 2 { verbose("got status %d", w.code) return } verbose("relayed to tsdb") collect.Add("puts.relayed", opentsdb.TagSet{}, 1) // Send to bosun in a separate go routine so we can end the source's request. go func() { body := bytes.NewBuffer(reader.buf.Bytes()) req, err := http.NewRequest(r.Method, bosunIndexURL, body) if err != nil { verbose("%v", err) return } resp, err := http.DefaultClient.Do(req) if err != nil { verbose("bosun relay error: %v", err) return } resp.Body.Close() verbose("bosun relay success") }() // Parse and denormalize datapoints if !isRelayed && parse && denormalizationRules != nil { go rp.denormalize(bytes.NewReader(reader.buf.Bytes())) } if !isRelayed && len(relayPutUrls) > 0 { go func() { for _, relayUrl := range relayPutUrls { body := bytes.NewBuffer(reader.buf.Bytes()) req, err := http.NewRequest(r.Method, relayUrl, body) if err != nil { verbose("%v", err) return } req.Header.Set("Content-Type", "application/json") req.Header.Set("Content-Encoding", "gzip") req.Header.Add(relayHeader, myHost) resp, err := http.DefaultClient.Do(req) if err != nil { verbose("secondary relay error: %v", err) return } resp.Body.Close() verbose("secondary relay success") } }() } }
func (n *Notification) DoEmail(subject, body []byte, c *Conf, ak string, attachments ...*Attachment) { e := email.NewEmail() e.From = c.EmailFrom for _, a := range n.Email { e.To = append(e.To, a.Address) } e.Subject = string(subject) e.HTML = body for _, a := range attachments { e.Attach(bytes.NewBuffer(a.Data), a.Filename, a.ContentType) } if err := Send(e, c.SMTPHost, c.SMTPUsername, c.SMTPPassword); err != nil { collect.Add("email.sent_failed", nil, 1) log.Printf("failed to send alert %v to %v %v\n", ak, e.To, err) return } collect.Add("email.sent", nil, 1) log.Printf("relayed alert %v to %v sucessfully\n", ak, e.To) }
func (n *Notification) DoEmail(subject, body []byte, c *Conf, ak string, attachments ...*Attachment) { e := email.NewEmail() e.From = c.EmailFrom for _, a := range n.Email { e.To = append(e.To, a.Address) } e.Subject = string(subject) e.HTML = body for _, a := range attachments { e.Attach(bytes.NewBuffer(a.Data), a.Filename, a.ContentType) } e.Headers.Add("X-Bosun-Server", util.Hostname) if err := Send(e, c.SMTPHost, c.SMTPUsername, c.SMTPPassword); err != nil { collect.Add("email.sent_failed", nil, 1) slog.Errorf("failed to send alert %v to %v %v\n", ak, e.To, err) return } collect.Add("email.sent", nil, 1) slog.Infof("relayed alert %v to %v sucessfully. Subject: %d bytes. Body: %d bytes.", ak, e.To, len(subject), len(body)) }
func indexTSDB(r *http.Request, body []byte) { clean := func(s string) string { return opentsdb.MustReplace(s, "_") } if r, err := gzip.NewReader(bytes.NewReader(body)); err == nil { body, _ = ioutil.ReadAll(r) r.Close() } var dp opentsdb.DataPoint var mdp opentsdb.MultiDataPoint if err := json.Unmarshal(body, &mdp); err == nil { } else if err = json.Unmarshal(body, &dp); err == nil { mdp = opentsdb.MultiDataPoint{&dp} } if len(mdp) > 0 { ra := strings.Split(r.RemoteAddr, ":")[0] tags := opentsdb.TagSet{"remote": clean(ra)} collect.Add("search.puts_relayed", tags, 1) collect.Add("search.datapoints_relayed", tags, int64(len(mdp))) schedule.Search.Index(mdp) } }
func (rp *relayProxy) relayMetadata(responseWriter http.ResponseWriter, r *http.Request) { reader := &passthru{ReadCloser: r.Body} r.Body = reader w := &relayWriter{ResponseWriter: responseWriter} rp.BosunProxy.ServeHTTP(w, r) if w.code != 204 { verbose("got status %d", w.code) return } verbose("relayed metadata to bosun") collect.Add("metadata.relayed", opentsdb.TagSet{}, 1) if r.Header.Get(relayHeader) != "" { return } if len(relayPutUrls) != 0 { go func() { for _, relayUrl := range relayPutUrls { relayUrl = strings.Replace(relayUrl, "/put", "/metadata/put", 1) body := bytes.NewBuffer(reader.buf.Bytes()) req, err := http.NewRequest(r.Method, relayUrl, body) if err != nil { verbose("%v", err) return } req.Header.Set("Content-Type", "application/json") req.Header.Add(relayHeader, myHost) resp, err := http.DefaultClient.Do(req) if err != nil { verbose("secondary relay error: %v", err) return } resp.Body.Close() verbose("secondary relay success") } }() } }
func (s *Schedule) CheckExpr(T miniprofiler.Timer, rh *RunHistory, a *conf.Alert, e *expr.Expr, checkStatus models.Status, ignore models.AlertKeys) (alerts models.AlertKeys, err error) { if e == nil { return } defer func() { if err == nil { return } collect.Add("check.errs", opentsdb.TagSet{"metric": a.Name}, 1) slog.Errorln(err) }() results, err := s.executeExpr(T, rh, a, e) if err != nil { return nil, err } Loop: for _, r := range results.Results { if s.Conf.Squelched(a, r.Group) { continue } ak := models.NewAlertKey(a.Name, r.Group) for _, v := range ignore { if ak == v { continue Loop } } var n float64 n, err = valueToFloat(r.Value) if err != nil { return } event := rh.Events[ak] if event == nil { event = new(models.Event) rh.Events[ak] = event } result := &models.Result{ Computations: r.Computations, Value: models.Float(n), Expr: e.String(), } switch checkStatus { case models.StWarning: event.Warn = result case models.StCritical: event.Crit = result } status := checkStatus if math.IsNaN(n) { status = checkStatus } else if n == 0 { status = models.StNormal } if status != models.StNormal { alerts = append(alerts, ak) } if status > rh.Events[ak].Status { event.Status = status } } return }
func (s *Schedule) CheckExpr(T miniprofiler.Timer, rh *RunHistory, a *conf.Alert, e *expr.Expr, checkStatus Status, ignore expr.AlertKeys) (alerts expr.AlertKeys, err error) { if e == nil { return } defer func() { if err == nil { return } collect.Add("check.errs", opentsdb.TagSet{"metric": a.Name}, 1) slog.Errorln(err) }() results, err := s.executeExpr(T, rh, a, e) if err != nil { return nil, err } Loop: for _, r := range results.Results { if s.Conf.Squelched(a, r.Group) { continue } ak := expr.NewAlertKey(a.Name, r.Group) for _, v := range ignore { if ak == v { continue Loop } } var n float64 switch v := r.Value.(type) { case expr.Number: n = float64(v) case expr.Scalar: n = float64(v) default: err = fmt.Errorf("expected number or scalar") return } event := rh.Events[ak] if event == nil { event = new(Event) rh.Events[ak] = event } result := &Result{ Result: r, Expr: e.String(), } switch checkStatus { case StWarning: event.Warn = result case StCritical: event.Crit = result } status := checkStatus if math.IsNaN(n) { status = StError } else if n == 0 { status = StNormal } if status != StNormal { alerts = append(alerts, ak) } if status > rh.Events[ak].Status { event.Status = status } } return }
func (s *Schedule) CheckExpr(T miniprofiler.Timer, rh *RunHistory, a *conf.Alert, e *expr.Expr, checkStatus models.Status, ignore models.AlertKeys) (alerts models.AlertKeys, err error, cancelled bool) { if e == nil { return } defer func() { if err == nil { return } collect.Add("check.errs", opentsdb.TagSet{"metric": a.Name}, 1) slog.Errorln(err) }() type res struct { results *expr.Results error error } // See s.CheckAlert for an explanation of execution and cancellation with this channel rc := make(chan res, 1) var results *expr.Results go func() { results, err := s.executeExpr(T, rh, a, e) rc <- res{results, err} }() select { case res := <-rc: results = res.results err = res.error case <-s.runnerContext.Done(): return nil, nil, true } if err != nil { return } Loop: for _, r := range results.Results { if s.RuleConf.Squelched(a, r.Group) { continue } ak := models.NewAlertKey(a.Name, r.Group) for _, v := range ignore { if ak == v { continue Loop } } var n float64 n, err = valueToFloat(r.Value) if err != nil { return } event := rh.Events[ak] if event == nil { event = new(models.Event) rh.Events[ak] = event } result := &models.Result{ Computations: r.Computations, Value: models.Float(n), Expr: e.String(), } switch checkStatus { case models.StWarning: event.Warn = result case models.StCritical: event.Crit = result } status := checkStatus if math.IsNaN(n) { status = checkStatus } else if n == 0 { status = models.StNormal } if status != models.StNormal { alerts = append(alerts, ak) } if status > rh.Events[ak].Status { event.Status = status } } return }