func (s *Schedule) Action(user, message string, t models.ActionType, ak models.AlertKey) error { if err := collect.Add("actions", opentsdb.TagSet{"user": user, "alert": ak.Name(), "type": t.String()}, 1); err != nil { slog.Errorln(err) } st, err := s.DataAccess.State().GetLatestIncident(ak) if err != nil { return err } if st == nil { return fmt.Errorf("no such alert key: %v", ak) } isUnknown := st.LastAbnormalStatus == models.StUnknown timestamp := utcNow() switch t { case models.ActionAcknowledge: if !st.NeedAck { return fmt.Errorf("alert already acknowledged") } if !st.Open { return fmt.Errorf("cannot acknowledge closed alert") } st.NeedAck = false if err := s.DataAccess.Notifications().ClearNotifications(ak); err != nil { return err } case models.ActionClose: if st.IsActive() { return fmt.Errorf("cannot close active alert") } fallthrough case models.ActionForceClose: st.Open = false st.End = &timestamp case models.ActionForget: if !isUnknown { return fmt.Errorf("can only forget unknowns") } fallthrough case models.ActionPurge: return s.DataAccess.State().Forget(ak) default: return fmt.Errorf("unknown action type: %v", t) } // Would like to also track the alert group, but I believe this is impossible because any character // that could be used as a delimiter could also be a valid tag key or tag value character if err := collect.Add("actions", opentsdb.TagSet{"user": user, "alert": ak.Name(), "type": t.String()}, 1); err != nil { slog.Errorln(err) } st.Actions = append(st.Actions, models.Action{ Message: message, Time: timestamp, Type: t, User: user, }) _, err = s.DataAccess.State().UpdateIncidentState(st) return err }
func (s *Schedule) action(user, message string, t models.ActionType, st *models.IncidentState) (ak models.AlertKey, e error) { if err := collect.Add("actions", opentsdb.TagSet{"user": user, "alert": st.AlertKey.Name(), "type": t.String()}, 1); err != nil { slog.Errorln(err) } defer func() { if e == nil { if err := collect.Add("actions", opentsdb.TagSet{"user": user, "alert": st.AlertKey.Name(), "type": t.String()}, 1); err != nil { slog.Errorln(err) } if err := s.DataAccess.Notifications().ClearNotifications(st.AlertKey); err != nil { e = err } } }() isUnknown := st.LastAbnormalStatus == models.StUnknown timestamp := utcNow() switch t { case models.ActionAcknowledge: if !st.NeedAck { return "", fmt.Errorf("alert already acknowledged") } if !st.Open { return "", fmt.Errorf("cannot acknowledge closed alert") } st.NeedAck = false case models.ActionClose: if st.IsActive() { return "", fmt.Errorf("cannot close active alert") } fallthrough case models.ActionForceClose: st.Open = false st.End = &timestamp case models.ActionForget: if !isUnknown { return "", fmt.Errorf("can only forget unknowns") } fallthrough case models.ActionPurge: return st.AlertKey, s.DataAccess.State().Forget(st.AlertKey) case models.ActionNote: // pass default: return "", fmt.Errorf("unknown action type: %v", t) } st.Actions = append(st.Actions, models.Action{ Message: message, Time: timestamp, Type: t, User: user, }) _, err := s.DataAccess.State().UpdateIncidentState(st) return st.AlertKey, err }
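// Hedged sketch (standalone, not from the Bosun source): a minimal illustration of the pattern
// action() uses above — a named error return plus a defer that only performs follow-up work
// (here, clearing a pending queue) when the function is returning success.
package main

import (
	"errors"
	"fmt"
)

var pending = []string{"notify-a", "notify-b"}

func ack(already bool) (e error) {
	defer func() {
		if e == nil {
			// Only clear pending notifications if the action itself succeeded.
			pending = pending[:0]
		}
	}()
	if already {
		return errors.New("alert already acknowledged")
	}
	return nil
}

func main() {
	fmt.Println(ack(true), len(pending))  // error, queue untouched
	fmt.Println(ack(false), len(pending)) // nil, queue cleared
}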
func (p *sqlplusParser) ParseAndAdd(line string) error { parsed, n := p.parsedQuery, len(sqlplusParsers) // query result separator is blank line if line == "" { return nil } // handle feed, end of one query if line == "no rows selected" || strings.HasSuffix(line, " rows selected.") || strings.HasSuffix(line, " row selected.") { p.parsedQuery++ return nil } // finished all queries if parsed == n { return nil } // process actual queries if err := sqlplusParsers[parsed].parse(line, p.md, p.prefix, p.common); err != nil { slog.Errorln("oracle sqlplus parser error:", err) } return nil }
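// Hedged sketch (standalone, hypothetical query names): how ParseAndAdd above advances from one
// query's output to the next — blank lines are separators, "rows selected" footers bump the
// query index, and any lines past the last known parser are ignored.
package main

import (
	"fmt"
	"strings"
)

var queries = []string{"q0", "q1"} // stand-in for sqlplusParsers

func main() {
	parsed := 0
	lines := []string{"row A", "2 rows selected.", "", "row B", "no rows selected", "trailing noise"}
	for _, line := range lines {
		switch {
		case line == "":
			continue // query result separator
		case line == "no rows selected",
			strings.HasSuffix(line, " rows selected."),
			strings.HasSuffix(line, " row selected."):
			parsed++ // end of one query's output
		case parsed >= len(queries):
			continue // finished all queries; ignore the rest
		default:
			fmt.Printf("feed %q to parser for %s\n", line, queries[parsed])
		}
	}
}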
func (s *Schedule) Action(user, message string, t ActionType, ak expr.AlertKey) error { s.Lock("Action") defer s.Unlock() st := s.status[ak] if st == nil { return fmt.Errorf("no such alert key: %v", ak) } ack := func() { delete(s.Notifications, ak) st.NeedAck = false } isUnknown := st.AbnormalStatus() == StUnknown isError := st.AbnormalStatus() == StError timestamp := time.Now().UTC() switch t { case ActionAcknowledge: if !st.NeedAck { return fmt.Errorf("alert already acknowledged") } if !st.Open { return fmt.Errorf("cannot acknowledge closed alert") } ack() case ActionClose: if st.NeedAck { ack() } if st.IsActive() && !isError { return fmt.Errorf("cannot close active alert") } st.Open = false last := st.Last() if last.IncidentId != 0 { s.incidentLock.Lock() if incident, ok := s.Incidents[last.IncidentId]; ok { incident.End = &timestamp } s.incidentLock.Unlock() } case ActionForget: if !isUnknown { return fmt.Errorf("can only forget unknowns") } if st.NeedAck { ack() } st.Open = false st.Forgotten = true delete(s.status, ak) default: return fmt.Errorf("unknown action type: %v", t) } st.Action(user, message, t, timestamp) // Would like to also track the alert group, but I believe this is impossible because any character // that could be used as a delimiter could also be a valid tag key or tag value character if err := collect.Add("actions", opentsdb.TagSet{"user": user, "alert": ak.Name(), "type": t.String()}, 1); err != nil { slog.Errorln(err) } return nil }
func sendBatch(batch []*opentsdb.DataPoint) { if Print { for _, d := range batch { j, err := d.MarshalJSON() if err != nil { slog.Error(err) } slog.Info(string(j)) } recordSent(len(batch)) return } now := time.Now() resp, err := SendDataPoints(batch, tsdbURLs[currentTsdbURL]) if err == nil { defer resp.Body.Close() } d := time.Since(now).Nanoseconds() / 1e6 Sample("collect.post.duration", Tags, float64(d)) Add("collect.post.total_duration", Tags, d) Add("collect.post.count", Tags, 1) // Some problem with connecting to the server; retry later. if err != nil || (resp.StatusCode != http.StatusNoContent && resp.StatusCode != http.StatusOK) { if err != nil { Add("collect.post.error", Tags, 1) slog.Error(err) // Switch endpoint if possible currentTsdbURL = (currentTsdbURL + 1) % len(tsdbURLs) } else if resp.StatusCode != http.StatusNoContent && resp.StatusCode != http.StatusOK { Add("collect.post.bad_status", Tags, 1) slog.Errorln(resp.Status) body, err := ioutil.ReadAll(resp.Body) if err != nil { slog.Error(err) } if len(body) > 0 { slog.Error(string(body)) } // Switch endpoint if possible currentTsdbURL = (currentTsdbURL + 1) % len(tsdbURLs) } restored := 0 for _, msg := range batch { restored++ tchan <- msg } d := time.Second * 5 Add("collect.post.restore", Tags, int64(restored)) slog.Infof("restored %d, sleeping %s", restored, d) time.Sleep(d) return } recordSent(len(batch)) }
func (n *Notification) DoPost(subject []byte) { if n.Body != nil { buf := new(bytes.Buffer) if err := n.Body.Execute(buf, string(subject)); err != nil { slog.Errorln(err) return } subject = buf.Bytes() } resp, err := http.Post(n.Post.String(), n.ContentType, bytes.NewBuffer(subject)) if resp != nil && resp.Body != nil { defer resp.Body.Close() } if err != nil { slog.Error(err) return } if resp.StatusCode >= 300 { slog.Errorln("bad response on notification post:", resp.Status) } }
func c_snmp_ips(community, host string) (opentsdb.MultiDataPoint, error) { ifIPAdEntAddrRaw, err := snmp_subtree(host, community, ifIPAdEntAddr) if err != nil { return nil, err } ipAdEnts := make(map[string]*ipAdEntAddr) for id, value := range ifIPAdEntAddrRaw { // Split entry type id from ip address sp := strings.SplitN(id, ".", 2) if len(sp) != 2 { slog.Errorln("unexpected length of snmp response") continue } typeId := sp[0] address := sp[1] if _, ok := ipAdEnts[address]; !ok { ipAdEnts[address] = &ipAdEntAddr{} } switch typeId { case "1": if v, ok := value.([]byte); ok { ipAdEnts[address].IP = v } case "2": if v, ok := value.(int64); ok { ipAdEnts[address].InterfaceId = v } case "3": if v, ok := value.([]byte); ok { ipAdEnts[address].Mask = v } } } ipsByInt := make(map[int64][]net.IPNet) for _, ipNet := range ipAdEnts { ipsByInt[ipNet.InterfaceId] = append(ipsByInt[ipNet.InterfaceId], ipNet.IPNet) } for intId, ipNets := range ipsByInt { var ips []string for _, ipNet := range ipNets { ips = append(ips, ipNet.String()) } sort.Strings(ips) j, err := json.Marshal(ips) if err != nil { slog.Errorf("error marshaling ips for host %v: %v", host, err) } metadata.AddMeta("", opentsdb.TagSet{"host": host, "iface": fmt.Sprintf("%v", intId)}, "addresses", string(j), false) } return nil, nil }
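// Hedged sketch (standalone, hypothetical sub-OID): the id handling above splits the first
// component (the column/type id) from the rest of the OID, which for ipAdEntAddr is the IP
// address itself.
package main

import (
	"fmt"
	"strings"
)

func main() {
	id := "2.10.16.0.5" // column 2 (interface index) for address 10.16.0.5
	sp := strings.SplitN(id, ".", 2)
	if len(sp) != 2 {
		fmt.Println("unexpected length of snmp response")
		return
	}
	fmt.Println("type id:", sp[0], "address:", sp[1]) // type id: 2 address: 10.16.0.5
}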
func (n *Notification) DoPost(payload []byte, ak string) { if n.Body != nil { buf := new(bytes.Buffer) if err := n.Body.Execute(buf, string(payload)); err != nil { slog.Errorln(err) return } payload = buf.Bytes() } resp, err := http.Post(n.Post.String(), n.ContentType, bytes.NewBuffer(payload)) if resp != nil && resp.Body != nil { defer resp.Body.Close() } if err != nil { slog.Error(err) return } if resp.StatusCode >= 300 { slog.Errorln("bad response on notification post:", resp.Status) } else { slog.Infof("post notification successful for alert %s. Response code %d.", ak, resp.StatusCode) } }
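// Hedged sketch (standalone, made-up template): what DoPost does when a body template is
// configured — the raw payload becomes the template's dot, and the rendered result is what is
// actually POSTed. A local httptest server stands in for the real endpoint.
package main

import (
	"bytes"
	"fmt"
	"io/ioutil"
	"net/http"
	"net/http/httptest"
	"text/template"
)

func main() {
	body := template.Must(template.New("body").Parse(`{"text": {{. | printf "%q"}}}`))
	payload := []byte("alert xyz is critical")

	buf := new(bytes.Buffer)
	if err := body.Execute(buf, string(payload)); err != nil {
		panic(err)
	}

	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		b, _ := ioutil.ReadAll(r.Body)
		fmt.Println("received:", string(b))
		w.WriteHeader(http.StatusNoContent)
	}))
	defer srv.Close()

	resp, err := http.Post(srv.URL, "application/json", bytes.NewBuffer(buf.Bytes()))
	if err != nil {
		panic(err)
	}
	resp.Body.Close()
	fmt.Println("status:", resp.StatusCode)
}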
func sendMetadata(ms []Metasend) { b, err := json.Marshal(&ms) if err != nil { slog.Error(err) return } resp, err := http.Post(metahost, "application/json", bytes.NewBuffer(b)) if err != nil { slog.Error(err) return } defer resp.Body.Close() if resp.StatusCode != 204 { slog.Errorln("bad metadata return:", resp.Status) return } }
func (s *Schedule) ExecuteBody(rh *RunHistory, a *conf.Alert, st *State, isEmail bool) ([]byte, []*conf.Attachment, error) { t := a.Template if t == nil || t.Body == nil { return nil, nil, nil } c := s.Data(rh, st, a, isEmail) buf := new(bytes.Buffer) if err := t.Body.Execute(buf, c); err != nil { return nil, nil, err } if inline, err := inliner.Inline(buf.String()); err == nil { buf = bytes.NewBufferString(inline) } else { slog.Errorln(err) } return buf.Bytes(), c.Attachments, nil }
func sendMetadata(ms []Metasend) { b, err := json.Marshal(&ms) if err != nil { slog.Error(err) return } resp, err := http.Post(metahosts[currentmetahost], "application/json", bytes.NewBuffer(b)) if err != nil { slog.Error(err) currentmetahost = (currentmetahost + 1) % len(metahosts) return } defer resp.Body.Close() if resp.StatusCode != http.StatusNoContent && resp.StatusCode != http.StatusOK { slog.Errorln("bad metadata return:", resp.Status) return } }
// utnotify combines N unknown alert groups into a single notification func (s *Schedule) utnotify(groups map[string]models.AlertKeys, n *conf.Notification) { var total int now := utcNow() for _, group := range groups { // Don't know what the following line does, just copied from unotify s.Group[now] = group total += len(group) } subject := fmt.Sprintf("%v unknown alert instances suppressed", total) body := new(bytes.Buffer) if err := unknownMultiGroup.Execute(body, struct { Groups map[string]models.AlertKeys Threshold int }{ groups, s.SystemConf.GetUnknownThreshold(), }); err != nil { slog.Errorln(err) } n.Notify(subject, body.String(), []byte(subject), body.Bytes(), s.SystemConf, "unknown_treshold") }
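// Hedged sketch (standalone, with a made-up template and plain string alert keys): the mechanism
// utnotify relies on — rendering a multi-group summary template into a buffer before handing the
// result to the notification.
package main

import (
	"bytes"
	"fmt"
	"text/template"
)

var unknownMultiGroup = template.Must(template.New("unknownMultiGroup").Parse(
	`threshold {{.Threshold}}; {{range $g, $aks := .Groups}}{{$g}}: {{len $aks}} alert keys
{{end}}`))

func main() {
	body := new(bytes.Buffer)
	err := unknownMultiGroup.Execute(body, struct {
		Groups    map[string][]string
		Threshold int
	}{
		Groups:    map[string][]string{"host=web01": {"a{host=web01}", "b{host=web01}"}},
		Threshold: 5,
	})
	if err != nil {
		fmt.Println(err)
		return
	}
	fmt.Print(body.String())
}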
func watch(root, pattern string, f func()) { watcher, err := fsnotify.NewWatcher() if err != nil { slog.Fatal(err) } filepath.Walk(root, func(path string, info os.FileInfo, err error) error { if matched, err := filepath.Match(pattern, info.Name()); err != nil { slog.Fatal(err) } else if !matched { return nil } err = watcher.Add(path) if err != nil { slog.Fatal(err) } return nil }) slog.Infoln("watching", pattern, "in", root) wait := time.Now() go func() { for { select { case event := <-watcher.Events: if wait.After(time.Now()) { continue } if event.Op&fsnotify.Write == fsnotify.Write { f() wait = time.Now().Add(time.Second * 2) } case err := <-watcher.Errors: slog.Errorln("error:", err) } } }() }
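// Hedged usage sketch (assumes this package's watch and slog; the path and reload logic are
// hypothetical): re-run a loader whenever any *.conf file under a directory is written.
func exampleWatchConf() {
	reload := func() {
		slog.Infoln("conf changed, reloading")
		// re-read and re-apply configuration here
	}
	watch("/opt/bosun/conf", "*.conf", reload)
}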
func pingHost(host string) { p := fastping.NewPinger() tags := opentsdb.TagSet{"dst_host": host} resolved := 0 defer func() { collect.Put("ping.resolved", tags, resolved) }() ra, err := net.ResolveIPAddr("ip4:icmp", host) if err != nil { return } resolved = 1 p.AddIPAddr(ra) p.MaxRTT = time.Second * 5 timeout := 1 p.OnRecv = func(addr *net.IPAddr, t time.Duration) { collect.Put("ping.rtt", tags, float64(t)/float64(time.Millisecond)) timeout = 0 } if err := p.Run(); err != nil { slog.Errorln(err) } collect.Put("ping.timeout", tags, timeout) }
// RunHistory for a single alert key. Returns true if notifications were altered. func (s *Schedule) runHistory(r *RunHistory, ak models.AlertKey, event *Event, silenced map[models.AlertKey]models.Silence) bool { checkNotify := false // get existing state object for alert key. add to schedule status if doesn't already exist state := s.GetStatus(ak) if state == nil { state = NewStatus(ak) s.SetStatus(ak, state) } defer s.SetStatus(ak, state) // make sure we always touch the state. state.Touched = r.Start // set state.Result according to event result if event.Crit != nil { state.Result = event.Crit } else if event.Warn != nil { state.Result = event.Warn } // if event is unevaluated, we are done. state.Unevaluated = event.Unevaluated if event.Unevaluated { return checkNotify } // assign incident id to new event if applicable prev := state.Last() worst := StNormal event.Time = r.Start if prev.IncidentId != 0 { // If last event has incident id and is not closed, we continue it. incident, err := s.DataAccess.Incidents().GetIncident(prev.IncidentId) if err != nil { slog.Error(err) } else if incident.End == nil { event.IncidentId = prev.IncidentId worst = state.WorstThisIncident() } } if event.IncidentId == 0 && event.Status != StNormal { incident, err := s.createIncident(ak, event.Time) if err != nil { slog.Error("Error creating incident", err) } else { event.IncidentId = incident.Id } } state.Append(event) a := s.Conf.Alerts[ak.Name()] // render templates and open alert key if abnormal if event.Status > StNormal { s.executeTemplates(state, event, a, r) state.Open = true if a.Log { worst = StNormal state.Open = false } } // On state increase, clear old notifications and notify current. // If the old alert was not acknowledged, do nothing. // Do nothing if state did not change. notify := func(ns *conf.Notifications) { if a.Log { lastLogTime := state.LastLogTime now := time.Now() if now.Before(lastLogTime.Add(a.MaxLogFrequency)) { return } state.LastLogTime = now } nots := ns.Get(s.Conf, state.Group) for _, n := range nots { s.Notify(state, n) checkNotify = true } } notifyCurrent := func() { // Auto close ignoreUnknowns. if a.IgnoreUnknown && event.Status == StUnknown { state.Open = false state.Forgotten = true state.NeedAck = false state.Action("bosun", "Auto close because alert has ignoreUnknown.", ActionClose, event.Time) slog.Infof("auto close %s because alert has ignoreUnknown", ak) return } else if silenced[ak].Forget && event.Status == StUnknown { state.Open = false state.Forgotten = true state.NeedAck = false state.Action("bosun", "Auto close because alert is silenced and marked auto forget.", ActionClose, event.Time) slog.Infof("auto close %s because alert is silenced and marked auto forget", ak) return } state.NeedAck = true switch event.Status { case StCritical, StUnknown: notify(a.CritNotification) case StWarning: notify(a.WarnNotification) } } clearOld := func() { state.NeedAck = false delete(s.Notifications, ak) } // lock while we change notifications. s.Lock("RunHistory") if event.Status > worst { clearOld() notifyCurrent() } else if _, ok := silenced[ak]; ok && event.Status == StNormal { go func(ak models.AlertKey) { slog.Infof("auto close %s because was silenced", ak) err := s.Action("bosun", "Auto close because was silenced.", ActionClose, ak) if err != nil { slog.Errorln(err) } }(ak) } s.Unlock() return checkNotify }
// RestoreState restores notification and alert state from the file on disk. func (s *Schedule) RestoreState() error { defer func() { bosunStartupTime = time.Now() }() slog.Infoln("RestoreState") start := time.Now() s.Lock("RestoreState") defer s.Unlock() s.Search.Lock() defer s.Search.Unlock() s.Notifications = nil decode := func(name string, dst interface{}) error { var data []byte err := s.db.View(func(tx *bolt.Tx) error { b := tx.Bucket([]byte(dbBucket)) if b == nil { return fmt.Errorf("unknown bucket: %v", dbBucket) } data = b.Get([]byte(name)) return nil }) if err != nil { return err } gr, err := gzip.NewReader(bytes.NewReader(data)) if err != nil { return err } defer gr.Close() return gob.NewDecoder(gr).Decode(dst) } if err := decode(dbMetadata, &s.Metadata); err != nil { slog.Errorln(dbMetadata, err) } if err := decode(dbMetricMetadata, &s.metricMetadata); err != nil { slog.Errorln(dbMetricMetadata, err) } for k, v := range s.Metadata { if k.Name == "desc" || k.Name == "rate" || k.Name == "unit" { s.PutMetadata(k, v.Value) delete(s.Metadata, k) } } if err := decode(dbMetric, &s.Search.Metric); err != nil { slog.Errorln(dbMetric, err) } if err := decode(dbTagk, &s.Search.Tagk); err != nil { slog.Errorln(dbTagk, err) } if err := decode(dbTagv, &s.Search.Tagv); err != nil { slog.Errorln(dbTagv, err) } if err := decode(dbMetricTags, &s.Search.MetricTags); err != nil { slog.Errorln(dbMetricTags, err) } notifications := make(map[expr.AlertKey]map[string]time.Time) if err := decode(dbNotifications, ¬ifications); err != nil { slog.Errorln(dbNotifications, err) } if err := decode(dbSilence, &s.Silence); err != nil { slog.Errorln(dbSilence, err) } if err := decode(dbIncidents, &s.Incidents); err != nil { slog.Errorln(dbIncidents, err) } if err := decode(dbErrors, &s.AlertStatuses); err != nil { slog.Errorln(dbErrors, err) } // Calculate next incident id. for _, i := range s.Incidents { if i.Id > s.maxIncidentId { s.maxIncidentId = i.Id } } status := make(States) if err := decode(dbStatus, &status); err != nil { slog.Errorln(dbStatus, err) } clear := func(r *Result) { if r == nil { return } r.Computations = nil } for ak, st := range status { a, present := s.Conf.Alerts[ak.Name()] if !present { slog.Errorln("sched: alert no longer present, ignoring:", ak) continue } else if s.Conf.Squelched(a, st.Group) { slog.Infoln("sched: alert now squelched:", ak) continue } else { t := a.Unknown if t == 0 { t = s.Conf.CheckFrequency } if t == 0 && st.Last().Status == StUnknown { st.Append(&Event{Status: StNormal, IncidentId: st.Last().IncidentId}) } } clear(st.Result) newHistory := []Event{} for _, e := range st.History { clear(e.Warn) clear(e.Crit) // Remove error events which no longer are a thing. if e.Status <= StUnknown { newHistory = append(newHistory, e) } } st.History = newHistory s.status[ak] = st if a.Log && st.Open { st.Open = false slog.Infof("sched: alert %s is now log, closing, was %s", ak, st.Status()) } for name, t := range notifications[ak] { n, present := s.Conf.Notifications[name] if !present { slog.Infoln("sched: notification not present during restore:", name) continue } if a.Log { slog.Infoln("sched: alert is now log, removing notification:", ak) continue } s.AddNotification(ak, n, t) } } if s.maxIncidentId == 0 { s.createHistoricIncidents() } s.Search.Copy() slog.Infoln("RestoreState done in", time.Since(start)) return nil }
func (c *Conf) loadNotification(s *parse.SectionNode) { name := s.Name.Text if _, ok := c.Notifications[name]; ok { c.errorf("duplicate notification name: %s", name) } n := Notification{ Vars: make(map[string]string), ContentType: "application/x-www-form-urlencoded", Name: name, RunOnActions: true, } n.Text = s.RawText funcs := ttemplate.FuncMap{ "V": func(v string) string { return c.Expand(v, n.Vars, false) }, "json": func(v interface{}) string { b, err := json.Marshal(v) if err != nil { slog.Errorln(err) } return string(b) }, } c.Notifications[name] = &n pairs := c.getPairs(s, n.Vars, sNormal) for _, p := range pairs { c.at(p.node) v := p.val switch k := p.key; k { case "email": if c.SMTPHost == "" || c.EmailFrom == "" { c.errorf("email notifications require both smtpHost and emailFrom to be set") } n.email = v email, err := mail.ParseAddressList(n.email) if err != nil { c.error(err) } n.Email = email case "post": n.post = v post, err := url.Parse(n.post) if err != nil { c.error(err) } n.Post = post case "get": n.get = v get, err := url.Parse(n.get) if err != nil { c.error(err) } n.Get = get case "print": n.Print = true case "contentType": n.ContentType = v case "next": n.next = v next, ok := c.Notifications[n.next] if !ok { c.errorf("unknown notification %s", n.next) } n.Next = next case "timeout": d, err := opentsdb.ParseDuration(v) if err != nil { c.error(err) } n.Timeout = time.Duration(d) case "body": n.body = v tmpl := ttemplate.New(name).Funcs(funcs) _, err := tmpl.Parse(n.body) if err != nil { c.error(err) } n.Body = tmpl case "runOnActions": n.RunOnActions = v == "true" default: c.errorf("unknown key %s", k) } } c.at(s) if n.Timeout > 0 && n.Next == nil { c.errorf("timeout specified without next") } }
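// Hedged sketch (standalone): the two template funcs installed by loadNotification above — "V"
// (variable expansion, stubbed here instead of c.Expand) and "json" — as a notification body
// template would use them. The variable name and template text are illustrative only.
package main

import (
	"encoding/json"
	"fmt"
	"os"
	ttemplate "text/template"
)

func main() {
	vars := map[string]string{"$slack_channel": "#ops"}
	funcs := ttemplate.FuncMap{
		"V": func(v string) string { return vars[v] }, // stand-in for c.Expand
		"json": func(v interface{}) string {
			b, err := json.Marshal(v)
			if err != nil {
				fmt.Fprintln(os.Stderr, err)
			}
			return string(b)
		},
	}
	tmpl := ttemplate.Must(ttemplate.New("body").Funcs(funcs).Parse(
		`{"channel": {{json (V "$slack_channel")}}, "text": {{json .}}}`))
	if err := tmpl.Execute(os.Stdout, "alert fired"); err != nil {
		fmt.Fprintln(os.Stderr, err)
	}
}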
func (s *Schedule) CheckExpr(T miniprofiler.Timer, rh *RunHistory, a *conf.Alert, e *expr.Expr, checkStatus Status, ignore expr.AlertKeys) (alerts expr.AlertKeys, err error) { if e == nil { return } defer func() { if err == nil { return } collect.Add("check.errs", opentsdb.TagSet{"metric": a.Name}, 1) slog.Errorln(err) }() results, err := s.executeExpr(T, rh, a, e) if err != nil { return nil, err } Loop: for _, r := range results.Results { if s.Conf.Squelched(a, r.Group) { continue } ak := expr.NewAlertKey(a.Name, r.Group) for _, v := range ignore { if ak == v { continue Loop } } var n float64 switch v := r.Value.(type) { case expr.Number: n = float64(v) case expr.Scalar: n = float64(v) default: err = fmt.Errorf("expected number or scalar") return } event := rh.Events[ak] if event == nil { event = new(Event) rh.Events[ak] = event } result := &Result{ Result: r, Expr: e.String(), } switch checkStatus { case StWarning: event.Warn = result case StCritical: event.Crit = result } status := checkStatus if math.IsNaN(n) { status = StError } else if n == 0 { status = StNormal } if status != StNormal { alerts = append(alerts, ak) } if status > rh.Events[ak].Status { event.Status = status } } return }
// CollectStates sends various state information to bosun with collect. func (s *Schedule) CollectStates() { // [AlertName][Severity]Count severityCounts := make(map[string]map[string]int64) abnormalCounts := make(map[string]map[string]int64) ackStatusCounts := make(map[string]map[bool]int64) activeStatusCounts := make(map[string]map[bool]int64) // Initialize the Counts for _, alert := range s.Conf.Alerts { severityCounts[alert.Name] = make(map[string]int64) abnormalCounts[alert.Name] = make(map[string]int64) var i Status for i = 1; i.String() != "none"; i++ { severityCounts[alert.Name][i.String()] = 0 abnormalCounts[alert.Name][i.String()] = 0 } ackStatusCounts[alert.Name] = make(map[bool]int64) activeStatusCounts[alert.Name] = make(map[bool]int64) ackStatusCounts[alert.Name][false] = 0 activeStatusCounts[alert.Name][false] = 0 ackStatusCounts[alert.Name][true] = 0 activeStatusCounts[alert.Name][true] = 0 } for _, state := range s.status { if !state.Open { continue } severity := state.Status().String() lastAbnormal := state.AbnormalStatus().String() severityCounts[state.Alert][severity]++ abnormalCounts[state.Alert][lastAbnormal]++ ackStatusCounts[state.Alert][state.NeedAck]++ activeStatusCounts[state.Alert][state.IsActive()]++ } for alertName := range severityCounts { ts := opentsdb.TagSet{"alert": alertName} // The tagset of the alert is not included because there is no way to // store the string of a group in OpenTSDB in a parsable way. This is // because any delimiter we chose could also be part of a tag key or tag // value. for severity := range severityCounts[alertName] { err := collect.Put("alerts.current_severity", ts.Copy().Merge(opentsdb.TagSet{"severity": severity}), severityCounts[alertName][severity]) if err != nil { slog.Errorln(err) } err = collect.Put("alerts.last_abnormal_severity", ts.Copy().Merge(opentsdb.TagSet{"severity": severity}), abnormalCounts[alertName][severity]) if err != nil { slog.Errorln(err) } } err := collect.Put("alerts.acknowledgement_status", ts.Copy().Merge(opentsdb.TagSet{"status": "unacknowledged"}), ackStatusCounts[alertName][true]) if err != nil { slog.Errorln(err) } err = collect.Put("alerts.acknowledgement_status", ts.Copy().Merge(opentsdb.TagSet{"status": "acknowledged"}), ackStatusCounts[alertName][false]) if err != nil { slog.Errorln(err) } err = collect.Put("alerts.active_status", ts.Copy().Merge(opentsdb.TagSet{"status": "active"}), activeStatusCounts[alertName][true]) if err != nil { slog.Errorln(err) } err = collect.Put("alerts.active_status", ts.Copy().Merge(opentsdb.TagSet{"status": "inactive"}), activeStatusCounts[alertName][false]) if err != nil { slog.Errorln(err) } } }
func (s *Schedule) CheckExpr(T miniprofiler.Timer, rh *RunHistory, a *conf.Alert, e *expr.Expr, checkStatus models.Status, ignore models.AlertKeys) (alerts models.AlertKeys, err error, cancelled bool) { if e == nil { return } defer func() { if err == nil { return } collect.Add("check.errs", opentsdb.TagSet{"metric": a.Name}, 1) slog.Errorln(err) }() type res struct { results *expr.Results error error } // See s.CheckAlert for an explanation of execution and cancellation with this channel rc := make(chan res, 1) var results *expr.Results go func() { results, err := s.executeExpr(T, rh, a, e) rc <- res{results, err} }() select { case res := <-rc: results = res.results err = res.error case <-s.runnerContext.Done(): return nil, nil, true } if err != nil { return } Loop: for _, r := range results.Results { if s.RuleConf.Squelched(a, r.Group) { continue } ak := models.NewAlertKey(a.Name, r.Group) for _, v := range ignore { if ak == v { continue Loop } } var n float64 n, err = valueToFloat(r.Value) if err != nil { return } event := rh.Events[ak] if event == nil { event = new(models.Event) rh.Events[ak] = event } result := &models.Result{ Computations: r.Computations, Value: models.Float(n), Expr: e.String(), } switch checkStatus { case models.StWarning: event.Warn = result case models.StCritical: event.Crit = result } status := checkStatus if math.IsNaN(n) { status = checkStatus } else if n == 0 { status = models.StNormal } if status != models.StNormal { alerts = append(alerts, ak) } if status > rh.Events[ak].Status { event.Status = status } } return }
func c_procstats_linux() (opentsdb.MultiDataPoint, error) { var md opentsdb.MultiDataPoint var Error error var sys unix.Sysinfo_t unix.Sysinfo(&sys) Add(&md, "linux.uptime_total", sys.Uptime, nil, metadata.Gauge, metadata.Second, osSystemUptimeDesc) Add(&md, osSystemUptime, sys.Uptime, nil, metadata.Gauge, metadata.Second, osSystemUptimeDesc) if err := readLine("/proc/meminfo", func(s string) error { s = strings.TrimSuffix(s, " kB") m := strings.Split(s, ":") if m == nil { return nil } m[1] = strings.TrimSpace(m[1]) Add(&md, "linux.mem."+strings.ToLower(m[0]), m[1], nil, metadata.Gauge, metadata.KBytes, "") return nil }); err != nil { Error = err slog.Errorln(err) } Add(&md, osMemTotal, sys.Totalram*uint64(sys.Unit), nil, metadata.Gauge, metadata.Bytes, osMemTotalDesc) Add(&md, osMemFree, sys.Freeram*uint64(sys.Unit), nil, metadata.Gauge, metadata.Bytes, osMemFreeDesc) Add(&md, osMemUsed, (sys.Totalram-sys.Freeram)*uint64(sys.Unit), nil, metadata.Gauge, metadata.Bytes, osMemUsedDesc) Add(&md, "linux.loadavg.1_min", sys.Loads[0], nil, metadata.Gauge, metadata.Load, "") Add(&md, "linux.loadavg.5_min", sys.Loads[1], nil, metadata.Gauge, metadata.Load, "") Add(&md, "linux.loadavg.15_min", sys.Loads[2], nil, metadata.Gauge, metadata.Load, "") Add(&md, "linux.loadavg.total_threads", sys.Procs, nil, metadata.Gauge, metadata.Process, "") if sys.Totalram != 0 { Add(&md, osMemPctFree, float64(sys.Freeram)/float64(sys.Totalram)*100, nil, metadata.Gauge, metadata.Pct, osMemFreeDesc) } if err := readLine("/proc/vmstat", func(s string) error { m := strings.Split(s, " ") if m == nil { return nil } switch m[0] { case "pgpgin", "pgpgout", "pswpin", "pswpout": switch { case strings.HasSuffix(m[0], "in"): Add(&md, "linux.mem."+strings.TrimSuffix(m[0], "in"), m[1], opentsdb.TagSet{"direction": "in"}, metadata.Counter, metadata.Page, "") case strings.HasSuffix(m[0], "out"): Add(&md, "linux.mem."+strings.TrimSuffix(m[0], "out"), m[1], opentsdb.TagSet{"direction": "out"}, metadata.Counter, metadata.Page, "") } case "pgfault", "pgmajfault": Add(&md, "linux.mem."+m[0], m[1], nil, metadata.Counter, metadata.Page, "") default: Add(&md, "linux.mem."+m[0], m[1], nil, metadata.Counter, metadata.None, "") } return nil }); err != nil { slog.Errorln(err) Error = err } num_cores := 0 var t_util int if err := readLine("/proc/stat", func(s string) error { m := strings.Fields(s) if m == nil { return nil } switch { case strings.HasPrefix(m[0], "cpu"): tag_cpu := strings.TrimPrefix(m[0], "cpu") if tag_cpu != "" { num_cores++ } for i, value := range m[1:] { if i >= len(cpu_fields) { break } tags := opentsdb.TagSet{ "type": cpu_fields[i], } if tag_cpu != "" { tags["cpu"] = tag_cpu Add(&md, "linux.cpu.percpu", value, tags, metadata.Counter, metadata.CHz, cpu_stat_desc[i]) } else { Add(&md, "linux.cpu", value, tags, metadata.Counter, metadata.CHz, cpu_stat_desc[i]) } } if tag_cpu == "" { if len(m[1:]) < 3 { return nil } user, err := strconv.Atoi(m[1]) if err != nil { slog.Errorln(err) return nil } nice, err := strconv.Atoi(m[2]) if err != nil { slog.Errorln(err) return nil } system, err := strconv.Atoi(m[3]) if err != nil { slog.Errorln(err) return nil } t_util = user + nice + system } case m[0] == "intr": Add(&md, "linux.intr", m[1], nil, metadata.Counter, metadata.Interupt, "") case m[0] == "ctxt": Add(&md, "linux.ctxt", m[1], nil, metadata.Counter, metadata.ContextSwitch, "") case m[0] == "processes": Add(&md, "linux.processes", m[1], nil, metadata.Counter, metadata.Process, "The number of processes and threads created, which includes (but is not limited to) those created by calls to the fork() and clone() system calls.") case m[0] == "procs_blocked": Add(&md, "linux.procs_blocked", m[1], nil, metadata.Gauge, metadata.Process, "The number of processes currently blocked, waiting for I/O to complete.") } return nil }); err != nil { slog.Errorln(err) Error = err } if num_cores != 0 && t_util != 0 { Add(&md, osCPU, t_util/num_cores, nil, metadata.Counter, metadata.Pct, "") } cpuinfo_index := 0 if err := readLine("/proc/cpuinfo", func(s string) error { m := strings.Split(s, ":") if len(m) < 2 { return nil } m[0] = strings.TrimSpace(m[0]) m[1] = strings.TrimSpace(m[1]) if m[0] != "cpu MHz" { return nil } tags := opentsdb.TagSet{"cpu": strconv.Itoa(cpuinfo_index)} Add(&md, osCPUClock, m[1], tags, metadata.Gauge, metadata.MHz, osCPUClockDesc) Add(&md, "linux.cpu.clock", m[1], tags, metadata.Gauge, metadata.MHz, osCPUClockDesc) cpuinfo_index++ return nil }); err != nil { slog.Errorln(err) Error = err } if err := readLine("/proc/sys/kernel/random/entropy_avail", func(s string) error { Add(&md, "linux.entropy_avail", strings.TrimSpace(s), nil, metadata.Gauge, metadata.Entropy, "The remaining amount of entropy available to the system. If it is low or hitting zero, processes might be blocked waiting for entropy.") return nil }); err != nil { slog.Errorln(err) Error = err } irq_type_desc := map[string]string{ "NMI": "Non-maskable interrupts.", "LOC": "Local timer interrupts.", "SPU": "Spurious interrupts.", "PMI": "Performance monitoring interrupts.", "IWI": "IRQ work interrupts.", "RES": "Rescheduling interrupts.", "CAL": "Function call interrupts.", "TLB": "TLB (translation lookaside buffer) shootdowns.", "TRM": "Thermal event interrupts.", "THR": "Threshold APIC interrupts.", "MCE": "Machine check exceptions.", "MCP": "Machine Check polls.", } num_cpus := 0 if err := readLine("/proc/interrupts", func(s string) error { cols := strings.Fields(s) if num_cpus == 0 { num_cpus = len(cols) return nil } else if len(cols) < 2 { return nil } tags := opentsdb.TagSet{} irq := strings.TrimRight(cols[0], ":") tags["irq"] = irq if len(cols) == 2 { Add(&md, "linux.interrupts", cols[1], tags, metadata.Counter, metadata.Interupt, "") return nil } device := "" if _, err := strconv.Atoi(irq); err == nil { if len(cols) >= num_cpus+3 && (strings.HasPrefix(cols[num_cpus+1], "IR-") || strings.HasPrefix(cols[num_cpus+1], "IO-") || strings.HasPrefix(cols[num_cpus+1], "PCI-")) { device = strings.ToLower(strings.Join([]string{"linux.interrupts", cols[num_cpus+1]}, ".")) tags["dev"] = strings.Join(cols[len(cols)-1:], " ") } } for i, val := range cols[1:num_cpus] { tags["cpu"] = strconv.Itoa(i) if device == "" { Add(&md, "linux.interrupts", val, tags, metadata.Counter, metadata.Interupt, irq_type_desc[irq]) } else { Add(&md, device, val, tags, metadata.Counter, metadata.Interupt, irq_type_desc[irq]) } } return nil }); err != nil { slog.Errorln(err) Error = err } if err := readLine("/proc/net/sockstat", func(s string) error { cols := strings.Fields(s) switch cols[0] { case "sockets:": if len(cols) < 3 { return fmt.Errorf("sockstat: error parsing sockets line") } Add(&md, "linux.net.sockets.used", cols[2], nil, metadata.Gauge, metadata.Socket, "") case "TCP:": if len(cols) < 11 { return fmt.Errorf("sockstat: error parsing tcp line") } Add(&md, "linux.net.sockets.tcp_in_use", cols[2], nil, metadata.Gauge, metadata.Socket, "") Add(&md, "linux.net.sockets.tcp_orphaned", cols[4], nil, metadata.Gauge, metadata.Socket, "") Add(&md, "linux.net.sockets.tcp_time_wait", cols[6], nil, metadata.Gauge, metadata.Socket, "") Add(&md, "linux.net.sockets.tcp_allocated", cols[8], nil, metadata.Gauge, metadata.None, "") Add(&md, "linux.net.sockets.tcp_mem", cols[10], nil, metadata.Gauge, metadata.None, "") case "UDP:": if len(cols) < 5 { return fmt.Errorf("sockstat: error parsing udp line") } Add(&md, "linux.net.sockets.udp_in_use", cols[2], nil, metadata.Gauge, metadata.Socket, "") Add(&md, "linux.net.sockets.udp_mem", cols[4], nil, metadata.Gauge, metadata.Page, "") case "UDPLITE:": if len(cols) < 3 { return fmt.Errorf("sockstat: error parsing udplite line") } Add(&md, "linux.net.sockets.udplite_in_use", cols[2], nil, metadata.Gauge, metadata.Socket, "") case "RAW:": if len(cols) < 3 { return fmt.Errorf("sockstat: error parsing raw line") } Add(&md, "linux.net.sockets.raw_in_use", cols[2], nil, metadata.Gauge, metadata.Socket, "") case "FRAG:": if len(cols) < 5 { return fmt.Errorf("sockstat: error parsing frag line") } Add(&md, "linux.net.sockets.frag_in_use", cols[2], nil, metadata.Gauge, metadata.Socket, "") Add(&md, "linux.net.sockets.frag_mem", cols[4], nil, metadata.Gauge, metadata.Bytes, "") } return nil }); err != nil { slog.Errorln(err) Error = err } ln := 0 var headers []string if err := readLine("/proc/net/netstat", func(s string) error { cols := strings.Fields(s) if ln%2 == 0 { headers = cols } else { if len(cols) < 1 || len(cols) != len(headers) { return fmt.Errorf("netstat: parsing failed") } root := strings.ToLower(strings.TrimSuffix(headers[0], "Ext:")) for i, v := range cols[1:] { i++ m := "linux.net.stat." + root + "." + strings.TrimPrefix(strings.ToLower(headers[i]), "tcp") Add(&md, m, v, nil, metadata.Counter, metadata.None, "") } } ln++ return nil }); err != nil { slog.Errorln(err) Error = err } ln = 0 if err := readLine("/proc/net/snmp", func(s string) error { ln++ if ln%2 != 0 { f := strings.Fields(s) if len(f) < 2 { return fmt.Errorf("Failed to parse header line") } headers = f } else { values := strings.Fields(s) if len(values) != len(headers) { return fmt.Errorf("Mismatched header and value length") } proto := strings.ToLower(strings.TrimSuffix(values[0], ":")) for i, v := range values { if i == 0 { continue } var stype metadata.RateType = metadata.Counter stat := strings.ToLower(headers[i]) if strings.HasPrefix(stat, "rto") { stype = metadata.Gauge } Add(&md, "linux.net.stat."+proto+"."+stat, v, nil, stype, metadata.None, "") } } return nil }); err != nil { slog.Errorln(err) Error = err } if err := readLine("/proc/sys/fs/file-nr", func(s string) error { f := strings.Fields(s) if len(f) != 3 { return fmt.Errorf("unexpected number of fields") } v, err := strconv.ParseInt(f[0], 10, 64) if err != nil { slog.Errorln(err) return err } Add(&md, "linux.fs.open", v, nil, metadata.Gauge, metadata.Count, "The number of files presently open.") return nil }); err != nil { slog.Errorln(err) Error = err } return md, Error }
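// Hedged sketch (standalone, with hard-coded sample lines instead of the real file): the
// alternating header-line / value-line format that the /proc/net/snmp loop above parses.
package main

import (
	"fmt"
	"strings"
)

func main() {
	headerLine := "Udp: InDatagrams NoPorts InErrors OutDatagrams"
	valueLine := "Udp: 7491 12 0 7320"
	headers := strings.Fields(headerLine)
	values := strings.Fields(valueLine)
	if len(values) != len(headers) {
		fmt.Println("mismatched header and value length")
		return
	}
	proto := strings.ToLower(strings.TrimSuffix(values[0], ":"))
	for i, v := range values {
		if i == 0 {
			continue
		}
		stat := strings.ToLower(headers[i])
		fmt.Printf("linux.net.stat.%s.%s = %s\n", proto, stat, v)
	}
}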
func c_varnish_unix() (opentsdb.MultiDataPoint, error) { var md opentsdb.MultiDataPoint const metric = "varnish." r, err := util.Command(5*time.Second, nil, "varnishstat", "-j") if err != nil { return nil, err } var stats varnishStats if err := json.NewDecoder(r).Decode(&stats); err != nil { return nil, err } for name, raw := range stats { if name == "timestamp" { continue } var v varnishStat if err := json.Unmarshal(raw, &v); err != nil { slog.Errorln("varnish parser error:", name, err) continue } ts := opentsdb.TagSet{} // special case for backend stats. extract backend name, host and port, put // them in tags and remove them in name. // the format is like "name(host,,port)" for the "ident" field of "VBE" type if v.Type == "VBE" { subtype := v.SubType name = strings.Replace(name, "."+subtype, "", -1) idx := strings.Index(subtype, "(") if idx < 0 || len(subtype)-idx < 4 { // output format changed, ignore continue } ss := strings.Split(subtype[idx+1:len(subtype)-1], ",") if len(ss) != 3 { // output format changed, ignore continue } ts.Merge(opentsdb.TagSet{"backend": subtype[:idx]}) ts.Merge(opentsdb.TagSet{"endpoint": ss[0] + "_" + ss[2]}) } rate := metadata.RateType(metadata.Gauge) if flag := v.Flag; flag == "a" || flag == "c" { rate = metadata.Counter } unit := metadata.Unit(metadata.Count) if v.Format == "B" { unit = metadata.Bytes } Add(&md, metric+strings.ToLower(name), v.Value, ts, rate, unit, v.Desc) } return md, nil }
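// Hedged sketch (standalone, with made-up sample output; the JSON field names are assumptions
// about varnishstat -j): the two-stage decode used above — first into map[string]json.RawMessage,
// then each counter into its own struct, skipping the "timestamp" entry.
package main

import (
	"encoding/json"
	"fmt"
)

type stat struct {
	Value  int64  `json:"value"`
	Flag   string `json:"flag"`
	Format string `json:"format"`
	Desc   string `json:"description"`
}

func main() {
	sample := []byte(`{
		"timestamp": "2016-01-01T00:00:00",
		"MAIN.client_req": {"value": 12345, "flag": "c", "format": "i", "description": "Good client requests received"}
	}`)
	var stats map[string]json.RawMessage
	if err := json.Unmarshal(sample, &stats); err != nil {
		panic(err)
	}
	for name, raw := range stats {
		if name == "timestamp" {
			continue
		}
		var v stat
		if err := json.Unmarshal(raw, &v); err != nil {
			fmt.Println("varnish parser error:", name, err)
			continue
		}
		fmt.Printf("varnish.%s = %d (counter=%v, desc=%q)\n", name, v.Value, v.Flag == "a" || v.Flag == "c", v.Desc)
	}
}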
// CollectStates sends various state information to bosun with collect. func (s *Schedule) CollectStates() { // [AlertName][Severity]Count severityCounts := make(map[string]map[string]int64) abnormalCounts := make(map[string]map[string]int64) ackStatusCounts := make(map[string]map[bool]int64) ackByNotificationCounts := make(map[string]map[bool]int64) unAckOldestByNotification := make(map[string]time.Time) activeStatusCounts := make(map[string]map[bool]int64) // Initialize the Counts for _, alert := range s.Conf.Alerts { severityCounts[alert.Name] = make(map[string]int64) abnormalCounts[alert.Name] = make(map[string]int64) var i models.Status for i = 1; i.String() != "none"; i++ { severityCounts[alert.Name][i.String()] = 0 abnormalCounts[alert.Name][i.String()] = 0 } ackStatusCounts[alert.Name] = make(map[bool]int64) activeStatusCounts[alert.Name] = make(map[bool]int64) ackStatusCounts[alert.Name][false] = 0 activeStatusCounts[alert.Name][false] = 0 ackStatusCounts[alert.Name][true] = 0 activeStatusCounts[alert.Name][true] = 0 } for notificationName := range s.Conf.Notifications { unAckOldestByNotification[notificationName] = time.Unix(1<<63-62135596801, 999999999) ackByNotificationCounts[notificationName] = make(map[bool]int64) ackByNotificationCounts[notificationName][false] = 0 ackByNotificationCounts[notificationName][true] = 0 } //TODO: // for _, state := range s.status { // if !state.Open { // continue // } // name := state.AlertKey.Name() // alertDef := s.Conf.Alerts[name] // nots := make(map[string]bool) // for name := range alertDef.WarnNotification.Get(s.Conf, state.Group) { // nots[name] = true // } // for name := range alertDef.CritNotification.Get(s.Conf, state.Group) { // nots[name] = true // } // incident, err := s.GetIncident(state.Last().IncidentId) // if err != nil { // slog.Errorln(err) // } // for notificationName := range nots { // ackByNotificationCounts[notificationName][state.NeedAck]++ // if incident != nil && incident.Start.Before(unAckOldestByNotification[notificationName]) && state.NeedAck { // unAckOldestByNotification[notificationName] = incident.Start // } // } // severity := state.CurrentStatus.String() // lastAbnormal := state.LastAbnormalStatus.String() // severityCounts[state.Alert][severity]++ // abnormalCounts[state.Alert][lastAbnormal]++ // ackStatusCounts[state.Alert][state.NeedAck]++ // activeStatusCounts[state.Alert][state.IsActive()]++ // } for notification := range ackByNotificationCounts { ts := opentsdb.TagSet{"notification": notification} err := collect.Put("alerts.acknowledgement_status_by_notification", ts.Copy().Merge(opentsdb.TagSet{"status": "unacknowledged"}), ackByNotificationCounts[notification][true]) if err != nil { slog.Errorln(err) } err = collect.Put("alerts.acknowledgement_status_by_notification", ts.Copy().Merge(opentsdb.TagSet{"status": "acknowledged"}), ackByNotificationCounts[notification][false]) if err != nil { slog.Errorln(err) } } for notification, timeStamp := range unAckOldestByNotification { ts := opentsdb.TagSet{"notification": notification} var ago time.Duration if !timeStamp.Equal(time.Unix(1<<63-62135596801, 999999999)) { ago = utcNow().Sub(timeStamp) } err := collect.Put("alerts.oldest_unacked_by_notification", ts, ago.Seconds()) if err != nil { slog.Errorln(err) } } for alertName := range severityCounts { ts := opentsdb.TagSet{"alert": alertName} // The tagset of the alert is not included because there is no way to // store the string of a group in OpenTSDB in a parsable way. This is // because any delimiter we chose could also be part of a tag key or tag // value. for severity := range severityCounts[alertName] { err := collect.Put("alerts.current_severity", ts.Copy().Merge(opentsdb.TagSet{"severity": severity}), severityCounts[alertName][severity]) if err != nil { slog.Errorln(err) } err = collect.Put("alerts.last_abnormal_severity", ts.Copy().Merge(opentsdb.TagSet{"severity": severity}), abnormalCounts[alertName][severity]) if err != nil { slog.Errorln(err) } } err := collect.Put("alerts.acknowledgement_status", ts.Copy().Merge(opentsdb.TagSet{"status": "unacknowledged"}), ackStatusCounts[alertName][true]) if err != nil { slog.Errorln(err) } err = collect.Put("alerts.acknowledgement_status", ts.Copy().Merge(opentsdb.TagSet{"status": "acknowledged"}), ackStatusCounts[alertName][false]) if err != nil { slog.Errorln(err) } err = collect.Put("alerts.active_status", ts.Copy().Merge(opentsdb.TagSet{"status": "active"}), activeStatusCounts[alertName][true]) if err != nil { slog.Errorln(err) } err = collect.Put("alerts.active_status", ts.Copy().Merge(opentsdb.TagSet{"status": "inactive"}), activeStatusCounts[alertName][false]) if err != nil { slog.Errorln(err) } } }
func c_cisco_ios(host, community string, cpuIntegrator tsIntegrator) (opentsdb.MultiDataPoint, error) { var md opentsdb.MultiDataPoint ts := opentsdb.TagSet{"host": host} // CPU if err := ciscoCPU(host, community, ts, cpuIntegrator, &md); err != nil { return md, err } // Memory memRaw, err := snmp_subtree(host, community, ciscoBaseOID+ciscoMemoryPoolTable) if err != nil { return md, fmt.Errorf("failed to get ciscoMemoryPoolTable for host %v: %v", host, err) } idToPoolEntry := make(map[string]*ciscoMemoryPoolEntry) for id, value := range memRaw { sp := strings.SplitN(id, ".", 2) if len(sp) != 2 { slog.Errorf("unexpected length of snmp sub OID (%v) for ciscoMemoryPoolTable for host %v", id, host) continue } columnID := sp[0] entryID := sp[1] if _, ok := idToPoolEntry[entryID]; !ok { idToPoolEntry[entryID] = &ciscoMemoryPoolEntry{} } switch columnID { case "2": if v, ok := value.([]byte); ok { if m, ok := idToPoolEntry[entryID]; ok { m.PoolType = string(v) } else { slog.Errorf("failed to find cisco memory pool entry for entry id %v on host %v for memory pool type", entryID, host) } } else { slog.Errorf("failed to convert memory pool label %v to []byte for host %v", value, host) } case "5": if v, ok := value.(int64); ok { if m, ok := idToPoolEntry[entryID]; ok { m.Used = v } else { slog.Errorf("failed to find cisco memory pool entry for entry id %v on host %v for used memory", entryID, host) } } else { slog.Errorf("failed to convert used memory value %v to int64 for host %v", value, host) } case "6": if v, ok := value.(int64); ok { if m, ok := idToPoolEntry[entryID]; ok { m.Free = v } else { slog.Errorf("failed to find cisco memory pool entry for entry id %v on host %v for free memory", entryID, host) } } else { slog.Errorf("failed to convert free memory value %v to int64 for host %v", value, host) } } } var totalFreeMem int64 var totalUsedMem int64 for _, entry := range idToPoolEntry { ts := ts.Copy().Merge(opentsdb.TagSet{"name": entry.PoolType}) Add(&md, "cisco.mem.used", entry.Used, ts, metadata.Gauge, metadata.Bytes, ciscoMemoryPoolUsedDesc) Add(&md, "cisco.mem.free", entry.Free, ts, metadata.Gauge, metadata.Bytes, ciscoMemoryPoolFreeDesc) totalFreeMem += entry.Free totalUsedMem += entry.Used } Add(&md, osMemFree, totalFreeMem, ts, metadata.Gauge, metadata.Bytes, osMemFreeDesc) Add(&md, osMemUsed, totalUsedMem, ts, metadata.Gauge, metadata.Bytes, osMemUsedDesc) totalMem := totalFreeMem + totalUsedMem Add(&md, osMemTotal, totalMem, ts, metadata.Gauge, metadata.Bytes, osMemTotalDesc) Add(&md, osMemPctFree, int64(float64(totalFreeMem)/float64(totalMem)*100), ts, metadata.Gauge, metadata.Pct, osMemPctFreeDesc) return md, nil }
// RestoreState restores notification and alert state from the file on disk. func (s *Schedule) RestoreState() error { defer func() { bosunStartupTime = time.Now() }() slog.Infoln("RestoreState") start := time.Now() s.Lock("RestoreState") defer s.Unlock() s.Search.Lock() defer s.Search.Unlock() s.Notifications = nil db := s.db notifications := make(map[expr.AlertKey]map[string]time.Time) if err := decode(db, dbNotifications, ¬ifications); err != nil { slog.Errorln(dbNotifications, err) } if err := decode(db, dbSilence, &s.Silence); err != nil { slog.Errorln(dbSilence, err) } if err := decode(db, dbIncidents, &s.Incidents); err != nil { slog.Errorln(dbIncidents, err) } // Calculate next incident id. for _, i := range s.Incidents { if i.Id > s.maxIncidentId { s.maxIncidentId = i.Id } } status := make(States) if err := decode(db, dbStatus, &status); err != nil { slog.Errorln(dbStatus, err) } clear := func(r *Result) { if r == nil { return } r.Computations = nil } for ak, st := range status { a, present := s.Conf.Alerts[ak.Name()] if !present { slog.Errorln("sched: alert no longer present, ignoring:", ak) continue } else if s.Conf.Squelched(a, st.Group) { slog.Infoln("sched: alert now squelched:", ak) continue } else { t := a.Unknown if t == 0 { t = s.Conf.CheckFrequency } if t == 0 && st.Last().Status == StUnknown { st.Append(&Event{Status: StNormal, IncidentId: st.Last().IncidentId}) } } clear(st.Result) newHistory := []Event{} for _, e := range st.History { clear(e.Warn) clear(e.Crit) // Remove error events which no longer are a thing. if e.Status <= StUnknown { newHistory = append(newHistory, e) } } st.History = newHistory s.status[ak] = st if a.Log && st.Open { st.Open = false slog.Infof("sched: alert %s is now log, closing, was %s", ak, st.Status()) } for name, t := range notifications[ak] { n, present := s.Conf.Notifications[name] if !present { slog.Infoln("sched: notification not present during restore:", name) continue } if a.Log { slog.Infoln("sched: alert is now log, removing notification:", ak) continue } s.AddNotification(ak, n, t) } } if s.maxIncidentId == 0 { s.createHistoricIncidents() } migrateOldDataToRedis(db, s.DataAccess) // delete metrictags if they exist. deleteKey(s.db, "metrictags") slog.Infoln("RestoreState done in", time.Since(start)) return nil }
func sendBatch(batch []json.RawMessage) { if Print { for _, d := range batch { slog.Info(string(d)) } recordSent(len(batch)) return } var buf bytes.Buffer g := gzip.NewWriter(&buf) if err := json.NewEncoder(g).Encode(batch); err != nil { slog.Error(err) return } if err := g.Close(); err != nil { slog.Error(err) return } req, err := http.NewRequest("POST", tsdbURL, &buf) if err != nil { slog.Error(err) return } req.Header.Set("Content-Type", "application/json") req.Header.Set("Content-Encoding", "gzip") now := time.Now() resp, err := client.Do(req) d := time.Since(now).Nanoseconds() / 1e6 if err == nil { defer resp.Body.Close() } Add("collect.post.total_duration", Tags, d) Add("collect.post.count", Tags, 1) // Some problem with connecting to the server; retry later. if err != nil || resp.StatusCode != http.StatusNoContent { if err != nil { Add("collect.post.error", Tags, 1) slog.Error(err) } else if resp.StatusCode != http.StatusNoContent { Add("collect.post.bad_status", Tags, 1) slog.Errorln(resp.Status) body, err := ioutil.ReadAll(resp.Body) if err != nil { slog.Error(err) } if len(body) > 0 { slog.Error(string(body)) } } restored := 0 for _, msg := range batch { var dp opentsdb.DataPoint if err := json.Unmarshal(msg, &dp); err != nil { slog.Error(err) continue } restored++ tchan <- &dp } d := time.Second * 5 Add("collect.post.restore", Tags, int64(restored)) slog.Infof("restored %d, sleeping %s", restored, d) time.Sleep(d) return } recordSent(len(batch)) }
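// Hedged sketch (standalone; metric names and the /api/put path are illustrative): the
// gzip-encoded JSON POST that sendBatch performs above, received by a local test server that
// decompresses the body and reports the batch size.
package main

import (
	"bytes"
	"compress/gzip"
	"encoding/json"
	"fmt"
	"net/http"
	"net/http/httptest"
)

type point struct {
	Metric string  `json:"metric"`
	Value  float64 `json:"value"`
}

func main() {
	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		gr, err := gzip.NewReader(r.Body)
		if err != nil {
			http.Error(w, err.Error(), http.StatusBadRequest)
			return
		}
		var batch []point
		if err := json.NewDecoder(gr).Decode(&batch); err != nil {
			http.Error(w, err.Error(), http.StatusBadRequest)
			return
		}
		fmt.Println("server got", len(batch), "datapoints")
		w.WriteHeader(http.StatusNoContent)
	}))
	defer srv.Close()

	var buf bytes.Buffer
	g := gzip.NewWriter(&buf)
	if err := json.NewEncoder(g).Encode([]point{{"os.cpu", 12}, {"os.mem.used", 34}}); err != nil {
		panic(err)
	}
	if err := g.Close(); err != nil {
		panic(err)
	}
	req, _ := http.NewRequest("POST", srv.URL+"/api/put", &buf)
	req.Header.Set("Content-Type", "application/json")
	req.Header.Set("Content-Encoding", "gzip")
	resp, err := http.DefaultClient.Do(req)
	if err != nil {
		panic(err)
	}
	resp.Body.Close()
	fmt.Println("status:", resp.StatusCode)
}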
// RunHistory for a single alert key. Returns true if notifications were altered. func (s *Schedule) runHistory(r *RunHistory, ak models.AlertKey, event *models.Event, silenced SilenceTester) (checkNotify bool, err error) { event.Time = r.Start a := s.Conf.Alerts[ak.Name()] if a.UnknownsNormal && event.Status == models.StUnknown { event.Status = models.StNormal } data := s.DataAccess.State() err = data.TouchAlertKey(ak, utcNow()) if err != nil { return } si := silenced(ak) // get existing open incident if exists var incident *models.IncidentState incident, err = data.GetOpenIncident(ak) if err != nil { return } defer func() { // save unless incident is new and closed (log alert) if incident != nil && (incident.Id != 0 || incident.Open) { _, err = data.UpdateIncidentState(incident) } else { err = data.SetUnevaluated(ak, event.Unevaluated) // if nothing to save, at least store the unevaluated state } }() // If nothing is out of the ordinary we are done if event.Status <= models.StNormal && incident == nil { return } // if event is unevaluated, we are done also. if incident != nil { incident.Unevaluated = event.Unevaluated } if event.Unevaluated { return } shouldNotify := false newIncident := false if incident == nil { incident = NewIncident(ak) newIncident = true shouldNotify = true } // VICTOROPS INTEGRATION: Enables notification of incidents which have returned to normal (Sends normNotification defined in config) if event.Status <= models.StNormal && (incident.CurrentStatus == models.StWarning || incident.CurrentStatus == models.StCritical) { slog.Infof("TRIGGER_RESOLVED: from %s to %s", incident.CurrentStatus, event.Status) shouldNotify = true } // VICTOROPS INTEGRATION: Enables notification of Incidents which have returned to normal but are now back to warning or critical. i.e. enable Flapping if incident.CurrentStatus == models.StNormal && (event.Status == models.StCritical || event.Status == models.StWarning) { slog.Infof("TRIGGER_REALERT: from %s to %s", incident.CurrentStatus, event.Status) shouldNotify = true } // set state.Result according to event result if event.Status == models.StCritical { incident.Result = event.Crit } else if event.Status == models.StWarning { incident.Result = event.Warn } if event.Status > models.StNormal { incident.LastAbnormalStatus = event.Status incident.LastAbnormalTime = event.Time.UTC().Unix() } if event.Status > incident.WorstStatus { incident.WorstStatus = event.Status shouldNotify = true } if event.Status != incident.CurrentStatus { incident.Events = append(incident.Events, *event) } incident.CurrentStatus = event.Status //run a preliminary save on new incidents to get an id if newIncident { if a.Log || silencedOrIgnored(a, event, si) { //a log or silenced/ignored alert will not need to be saved } else { incident.Id, err = s.DataAccess.State().UpdateIncidentState(incident) if err != nil { return } } } //render templates and open alert key if abnormal if event.Status > models.StNormal { s.executeTemplates(incident, event, a, r) incident.Open = true if a.Log { incident.Open = false } } // On state increase, clear old notifications and notify current. // Do nothing if state did not change. notify := func(ns *conf.Notifications) { if a.Log { lastLogTime := s.lastLogTimes[ak] now := utcNow() if now.Before(lastLogTime.Add(a.MaxLogFrequency)) { return } s.lastLogTimes[ak] = now } nots := ns.Get(s.Conf, incident.AlertKey.Group()) for _, n := range nots { s.Notify(incident, n) checkNotify = true } } notifyCurrent := func() { //Auto close ignoreUnknowns for new incident. if silencedOrIgnored(a, event, si) { incident.Open = false return } // VICTOROPS INTEGRATION incident.NeedAck = false switch event.Status { case models.StCritical, models.StUnknown: notify(a.CritNotification) case models.StWarning: notify(a.WarnNotification) case models.StNormal: // VICTOROPS INTEGRATION incident.NeedAck = false notify(a.NormNotification) } } // lock while we change notifications. s.Lock("RunHistory") if shouldNotify { incident.NeedAck = false if err = s.DataAccess.Notifications().ClearNotifications(ak); err != nil { return } notifyCurrent() } // finally close an open alert with silence once it goes back to normal. if si := silenced(ak); si != nil && event.Status == models.StNormal { go func(ak models.AlertKey) { slog.Infof("auto close %s because was silenced", ak) err := s.Action("bosun", "Auto close because was silenced.", models.ActionClose, ak) if err != nil { slog.Errorln(err) } }(ak) } s.Unlock() return checkNotify, nil }
// RunHistory for a single alert key. Returns true if notifications were altered. func (s *Schedule) runHistory(r *RunHistory, ak models.AlertKey, event *models.Event, silenced SilenceTester) (checkNotify bool, err error) { event.Time = r.Start data := s.DataAccess.State() err = data.TouchAlertKey(ak, time.Now()) if err != nil { return } // get existing open incident if exists incident, err := data.GetOpenIncident(ak) if err != nil { return } defer func() { // save unless incident is new and closed (log alert) if incident != nil && (incident.Id != 0 || incident.Open) { err = data.UpdateIncidentState(incident) } else { err = data.SetUnevaluated(ak, event.Unevaluated) // if nothing to save, at least store the unevaluated state } }() // If nothing is out of the ordinary we are done if event.Status <= models.StNormal && incident == nil { return } // if event is unevaluated, we are done also. if incident != nil { incident.Unevaluated = event.Unevaluated } if event.Unevaluated { return } shouldNotify := false if incident == nil { incident = NewIncident(ak) shouldNotify = true } // set state.Result according to event result if event.Status == models.StCritical { incident.Result = event.Crit } else if event.Status == models.StWarning { incident.Result = event.Warn } if event.Status > models.StNormal { incident.LastAbnormalStatus = event.Status incident.LastAbnormalTime = event.Time.UTC().Unix() } if event.Status > incident.WorstStatus { incident.WorstStatus = event.Status shouldNotify = true } if event.Status != incident.CurrentStatus { incident.Events = append(incident.Events, *event) } incident.CurrentStatus = event.Status a := s.Conf.Alerts[ak.Name()] //render templates and open alert key if abnormal if event.Status > models.StNormal { s.executeTemplates(incident, event, a, r) incident.Open = true if a.Log { incident.Open = false } } // On state increase, clear old notifications and notify current. // Do nothing if state did not change. notify := func(ns *conf.Notifications) { if a.Log { lastLogTime := s.lastLogTimes[ak] now := time.Now() if now.Before(lastLogTime.Add(a.MaxLogFrequency)) { return } s.lastLogTimes[ak] = now } nots := ns.Get(s.Conf, incident.AlertKey.Group()) for _, n := range nots { s.Notify(incident, n) checkNotify = true } } notifyCurrent := func() { si := silenced(ak) //Auto close ignoreUnknowns for new incident. if a.IgnoreUnknown && event.Status == models.StUnknown { incident.Open = false return } else if si != nil && si.Forget && event.Status == models.StUnknown { incident.Open = false return } incident.NeedAck = true switch event.Status { case models.StCritical, models.StUnknown: notify(a.CritNotification) case models.StWarning: notify(a.WarnNotification) } } clearOld := func() { incident.NeedAck = false delete(s.Notifications, ak) } // lock while we change notifications. s.Lock("RunHistory") if shouldNotify { clearOld() notifyCurrent() } // finally close an open alert with silence once it goes back to normal. if si := silenced(ak); si != nil && event.Status == models.StNormal { go func(ak models.AlertKey) { slog.Infof("auto close %s because was silenced", ak) err := s.Action("bosun", "Auto close because was silenced.", models.ActionClose, ak) if err != nil { slog.Errorln(err) } }(ak) } s.Unlock() return checkNotify, nil }
func (s *Schedule) CheckExpr(T miniprofiler.Timer, rh *RunHistory, a *conf.Alert, e *expr.Expr, checkStatus models.Status, ignore models.AlertKeys) (alerts models.AlertKeys, err error) { if e == nil { return } defer func() { if err == nil { return } collect.Add("check.errs", opentsdb.TagSet{"metric": a.Name}, 1) slog.Errorln(err) }() results, err := s.executeExpr(T, rh, a, e) if err != nil { return nil, err } Loop: for _, r := range results.Results { if s.Conf.Squelched(a, r.Group) { continue } ak := models.NewAlertKey(a.Name, r.Group) for _, v := range ignore { if ak == v { continue Loop } } var n float64 n, err = valueToFloat(r.Value) if err != nil { return } event := rh.Events[ak] if event == nil { event = new(models.Event) rh.Events[ak] = event } result := &models.Result{ Computations: r.Computations, Value: models.Float(n), Expr: e.String(), } switch checkStatus { case models.StWarning: event.Warn = result case models.StCritical: event.Crit = result } status := checkStatus if math.IsNaN(n) { status = checkStatus } else if n == 0 { status = models.StNormal } if status != models.StNormal { alerts = append(alerts, ak) } if status > rh.Events[ak].Status { event.Status = status } } return }
// RestoreState restores notification and alert state from the file on disk. func (s *Schedule) RestoreState() error { defer func() { bosunStartupTime = time.Now() }() slog.Infoln("RestoreState") start := time.Now() s.Lock("RestoreState") defer s.Unlock() s.Search.Lock() defer s.Search.Unlock() s.Notifications = nil db := s.db notifications := make(map[models.AlertKey]map[string]time.Time) if err := decode(db, dbNotifications, ¬ifications); err != nil { slog.Errorln(dbNotifications, err) } //status := make(States) // if err := decode(db, dbStatus, &status); err != nil { // slog.Errorln(dbStatus, err) // } // clear := func(r *models.Result) { // if r == nil { // return // } // r.Computations = nil //} //TODO: ??? // for ak, st := range status { // a, present := s.Conf.Alerts[ak.Name()] // if !present { // slog.Errorln("sched: alert no longer present, ignoring:", ak) // continue // } else if s.Conf.Squelched(a, st.Group) { // slog.Infoln("sched: alert now squelched:", ak) // continue // } else { // t := a.Unknown // if t == 0 { // t = s.Conf.CheckFrequency // } // if t == 0 && st.Last().Status == StUnknown { // st.Append(&Event{Status: StNormal, IncidentId: st.Last().IncidentId}) // } // } // clear(st.Result) // newHistory := []Event{} // for _, e := range st.History { // clear(e.Warn) // clear(e.Crit) // // Remove error events which no longer are a thing. // if e.Status <= StUnknown { // newHistory = append(newHistory, e) // } // } // st.History = newHistory // s.status[ak] = st // if a.Log && st.Open { // st.Open = false // slog.Infof("sched: alert %s is now log, closing, was %s", ak, st.Status()) // } // for name, t := range notifications[ak] { // n, present := s.Conf.Notifications[name] // if !present { // slog.Infoln("sched: notification not present during restore:", name) // continue // } // if a.Log { // slog.Infoln("sched: alert is now log, removing notification:", ak) // continue // } // s.AddNotification(ak, n, t) // } //} if err := migrateOldDataToRedis(db, s.DataAccess); err != nil { return err } // delete metrictags if they exist. deleteKey(s.db, "metrictags") slog.Infoln("RestoreState done in", time.Since(start)) return nil }