Example #1
0
// Action applies action t, requested by user with the given message, to the
// latest incident stored for ak.
//
// Acknowledge, close and force-close mutate the incident, append an entry to
// its action log and persist it. Forget and purge delete the stored state for
// ak and return immediately without recording an action. An error is returned
// when the incident cannot be loaded, does not exist, the action is not allowed
// in the incident's current state, or t is not a known action type.
//
// The "actions" counter is incremented once on entry and once more after the
// action has been validated and applied.
func (s *Schedule) Action(user, message string, t models.ActionType, ak models.AlertKey) error {
	// Would like to also track the alert group, but I believe this is impossible because any character
	// that could be used as a delimiter could also be a valid tag key or tag value character
	count := func() {
		tags := opentsdb.TagSet{"user": user, "alert": ak.Name(), "type": t.String()}
		if err := collect.Add("actions", tags, 1); err != nil {
			// Metrics are best effort: log and carry on.
			slog.Errorln(err)
		}
	}
	count()

	incident, err := s.DataAccess.State().GetLatestIncident(ak)
	if err != nil {
		return err
	}
	if incident == nil {
		return fmt.Errorf("no such alert key: %v", ak)
	}
	now := utcNow()

	switch t {
	case models.ActionAcknowledge:
		switch {
		case !incident.NeedAck:
			return fmt.Errorf("alert already acknowledged")
		case !incident.Open:
			return fmt.Errorf("cannot acknowledge closed alert")
		}
		incident.NeedAck = false
		if err := s.DataAccess.Notifications().ClearNotifications(ak); err != nil {
			return err
		}
	case models.ActionClose, models.ActionForceClose:
		// Only a plain close is refused while the alert is still active.
		if t == models.ActionClose && incident.IsActive() {
			return fmt.Errorf("cannot close active alert")
		}
		incident.Open = false
		incident.End = &now
	case models.ActionForget, models.ActionPurge:
		// Forget is restricted to unknowns; purge is unconditional.
		if t == models.ActionForget && incident.LastAbnormalStatus != models.StUnknown {
			return fmt.Errorf("can only forget unknowns")
		}
		return s.DataAccess.State().Forget(ak)
	default:
		return fmt.Errorf("unknown action type: %v", t)
	}

	count()
	entry := models.Action{
		Message: message,
		Time:    now,
		Type:    t,
		User:    user,
	}
	incident.Actions = append(incident.Actions, entry)
	_, err = s.DataAccess.State().UpdateIncidentState(incident)
	return err
}
Example #2
0
// action applies action t, requested by user with the given message, to the
// incident st and returns the alert key that was affected.
//
// Acknowledge, close, force-close and note append an entry to st.Actions and
// persist the incident. Forget and purge delete the stored state for the alert
// key instead. On validation failure the returned key is empty.
//
// The "actions" counter is incremented on entry. When the action succeeds it is
// incremented again and the notifications queued for the alert key are cleared;
// a failure to clear them becomes the returned error.
func (s *Schedule) action(user, message string, t models.ActionType, st *models.IncidentState) (ak models.AlertKey, e error) {
	count := func() {
		tags := opentsdb.TagSet{"user": user, "alert": st.AlertKey.Name(), "type": t.String()}
		if err := collect.Add("actions", tags, 1); err != nil {
			// Metrics are best effort: log and carry on.
			slog.Errorln(err)
		}
	}
	count()

	// Post-processing runs only for successful actions and may replace the
	// named error result.
	defer func() {
		if e != nil {
			return
		}
		count()
		if err := s.DataAccess.Notifications().ClearNotifications(st.AlertKey); err != nil {
			e = err
		}
	}()

	now := utcNow()
	switch t {
	case models.ActionAcknowledge:
		switch {
		case !st.NeedAck:
			return "", fmt.Errorf("alert already acknowledged")
		case !st.Open:
			return "", fmt.Errorf("cannot acknowledge closed alert")
		}
		st.NeedAck = false
	case models.ActionClose, models.ActionForceClose:
		// Only a plain close is refused while the alert is still active.
		if t == models.ActionClose && st.IsActive() {
			return "", fmt.Errorf("cannot close active alert")
		}
		st.Open = false
		st.End = &now
	case models.ActionForget, models.ActionPurge:
		// Forget is restricted to unknowns; purge is unconditional.
		if t == models.ActionForget && st.LastAbnormalStatus != models.StUnknown {
			return "", fmt.Errorf("can only forget unknowns")
		}
		return st.AlertKey, s.DataAccess.State().Forget(st.AlertKey)
	case models.ActionNote:
		// A note changes nothing besides the action log entry below.
	default:
		return "", fmt.Errorf("unknown action type: %v", t)
	}

	entry := models.Action{
		Message: message,
		Time:    now,
		Type:    t,
		User:    user,
	}
	st.Actions = append(st.Actions, entry)
	_, err := s.DataAccess.State().UpdateIncidentState(st)
	return st.AlertKey, err
}
Example #3
0
// ParseAndAdd consumes one line of sqlplus output.
//
// Blank lines separate query results and are ignored. A feedback line
// ("no rows selected", "N rows selected.", "1 row selected.") marks the end of
// the current query's output and advances p.parsedQuery to the next parser in
// sqlplusParsers. Any other line is handed to the parser for the current query;
// parse errors are logged and never returned, so the result is always nil.
func (p *sqlplusParser) ParseAndAdd(line string) error {
	// query result separator is blank line
	if line == "" {
		return nil
	}

	// handle feed, end of one query
	if line == "no rows selected" || strings.HasSuffix(line, " rows selected.") ||
		strings.HasSuffix(line, " row selected.") {
		p.parsedQuery++
		return nil
	}

	// finished all queries. The counter is bumped on every feedback line, so
	// it can run past the number of parsers when the output contains more
	// result sets than expected; >= keeps the index below in range.
	if p.parsedQuery >= len(sqlplusParsers) {
		return nil
	}

	// process actual queries
	if err := sqlplusParsers[p.parsedQuery].parse(line, p.md, p.prefix, p.common); err != nil {
		slog.Errorln("oracle sqlplus parser error:", err)
	}
	return nil
}
Example #4
0
// Action applies action t, requested by user with the given message, to the
// in-memory state of the alert identified by ak. The schedule lock is held for
// the whole call.
//
// Acknowledging requires an open alert that still needs an ack. Closing
// acknowledges the alert if needed, is refused for active alerts unless their
// abnormal status is an error, and stamps the end time on the alert's current
// incident when one exists. Forgetting is only allowed for unknowns and removes
// the state from the schedule. Any other action type is an error.
//
// After a successful action it is recorded on the state and the "actions"
// counter is incremented.
func (s *Schedule) Action(user, message string, t ActionType, ak expr.AlertKey) error {
	s.Lock("Action")
	defer s.Unlock()

	state := s.status[ak]
	if state == nil {
		return fmt.Errorf("no such alert key: %v", ak)
	}
	// acknowledge drops the pending notifications for ak and clears the flag.
	acknowledge := func() {
		delete(s.Notifications, ak)
		state.NeedAck = false
	}
	unknown := state.AbnormalStatus() == StUnknown
	errored := state.AbnormalStatus() == StError
	now := time.Now().UTC()

	switch t {
	case ActionAcknowledge:
		switch {
		case !state.NeedAck:
			return fmt.Errorf("alert already acknowledged")
		case !state.Open:
			return fmt.Errorf("cannot acknowledge closed alert")
		}
		acknowledge()
	case ActionClose:
		// The acknowledgement happens before the active check, so it sticks
		// even when the close itself is refused.
		if state.NeedAck {
			acknowledge()
		}
		if active := state.IsActive(); active && !errored {
			return fmt.Errorf("cannot close active alert")
		}
		state.Open = false
		if id := state.Last().IncidentId; id != 0 {
			s.incidentLock.Lock()
			if incident, found := s.Incidents[id]; found {
				incident.End = &now
			}
			s.incidentLock.Unlock()
		}
	case ActionForget:
		if !unknown {
			return fmt.Errorf("can only forget unknowns")
		}
		if state.NeedAck {
			acknowledge()
		}
		state.Open = false
		state.Forgotten = true
		delete(s.status, ak)
	default:
		return fmt.Errorf("unknown action type: %v", t)
	}

	state.Action(user, message, t, now)
	// Would like to also track the alert group, but I believe this is impossible because any character
	// that could be used as a delimiter could also be a valid tag key or tag value character
	tags := opentsdb.TagSet{"user": user, "alert": ak.Name(), "type": t.String()}
	if err := collect.Add("actions", tags, 1); err != nil {
		slog.Errorln(err)
	}
	return nil
}
Example #5
0
// sendBatch delivers one batch of data points.
//
// In Print mode every point is logged as JSON and the batch is counted as
// sent. Otherwise the batch is posted to the currently selected TSDB URL and
// post duration and count metrics are recorded. If the request fails or the
// server answers with anything other than 200 or 204, the failure is logged,
// the next URL in tsdbURLs is selected, every point is pushed back onto tchan
// and the function sleeps for five seconds before returning.
func sendBatch(batch []*opentsdb.DataPoint) {
	if Print {
		for _, dp := range batch {
			encoded, err := dp.MarshalJSON()
			if err != nil {
				slog.Error(err)
			}
			slog.Info(string(encoded))
		}
		recordSent(len(batch))
		return
	}

	started := time.Now()
	resp, err := SendDataPoints(batch, tsdbURLs[currentTsdbURL])
	if err == nil {
		defer resp.Body.Close()
	}
	elapsedMs := time.Since(started).Nanoseconds() / 1e6
	Sample("collect.post.duration", Tags, float64(elapsedMs))
	Add("collect.post.total_duration", Tags, elapsedMs)
	Add("collect.post.count", Tags, 1)

	if err == nil && (resp.StatusCode == http.StatusNoContent || resp.StatusCode == http.StatusOK) {
		recordSent(len(batch))
		return
	}

	// Some problem with connecting to the server; retry later.
	if err != nil {
		Add("collect.post.error", Tags, 1)
		slog.Error(err)
	} else {
		Add("collect.post.bad_status", Tags, 1)
		slog.Errorln(resp.Status)
		body, readErr := ioutil.ReadAll(resp.Body)
		if readErr != nil {
			slog.Error(readErr)
		}
		if len(body) > 0 {
			slog.Error(string(body))
		}
	}
	// Switch endpoint if possible
	currentTsdbURL = (currentTsdbURL + 1) % len(tsdbURLs)

	// Hand every point back to the queue so it is retried later.
	for _, dp := range batch {
		tchan <- dp
	}
	restored := len(batch)
	backoff := time.Second * 5
	Add("collect.post.restore", Tags, int64(restored))
	slog.Infof("restored %d, sleeping %s", restored, backoff)
	time.Sleep(backoff)
}
Example #6
0
// DoPost sends subject to the notification's Post URL using the configured
// content type. When a Body template is set, it is executed with subject (as a
// string) and its output is posted instead. Template errors, transport errors
// and responses with a status of 300 or above are logged; nothing is returned.
func (n *Notification) DoPost(subject []byte) {
	if n.Body != nil {
		var rendered bytes.Buffer
		if err := n.Body.Execute(&rendered, string(subject)); err != nil {
			slog.Errorln(err)
			return
		}
		subject = rendered.Bytes()
	}

	resp, err := http.Post(n.Post.String(), n.ContentType, bytes.NewBuffer(subject))
	// Close the body whenever one was handed back, even alongside an error.
	if resp != nil && resp.Body != nil {
		defer resp.Body.Close()
	}
	switch {
	case err != nil:
		slog.Error(err)
	case resp.StatusCode >= 300:
		slog.Errorln("bad response on notification post:", resp.Status)
	}
}
Example #7
0
// c_snmp_ips walks the ifIPAdEntAddr SNMP subtree of host using community and
// publishes, for every interface id found, an "addresses" metadata entry
// containing the sorted JSON list of that interface's IP networks.
//
// Each SNMP id is expected to look like "<type>.<address>", where type "1"
// carries the IP, "2" the interface id and "3" the netmask. Ids that do not
// have that shape are logged and skipped. Values whose Go type does not match
// the expected one are silently ignored.
//
// No data points are produced: the result is always a nil MultiDataPoint, with
// a non-nil error only when the SNMP walk itself fails.
func c_snmp_ips(community, host string) (opentsdb.MultiDataPoint, error) {
	ifIPAdEntAddrRaw, err := snmp_subtree(host, community, ifIPAdEntAddr)
	if err != nil {
		return nil, err
	}
	ipAdEnts := make(map[string]*ipAdEntAddr)
	for id, value := range ifIPAdEntAddrRaw {
		// Split entry type id from ip address
		sp := strings.SplitN(id, ".", 2)
		if len(sp) != 2 {
			slog.Errorln("unexpected length of snmp resonse")
			// Without a separator there is no address part; indexing sp[1]
			// would panic, so skip the malformed entry.
			continue
		}
		typeId, address := sp[0], sp[1]
		entry, ok := ipAdEnts[address]
		if !ok {
			entry = &ipAdEntAddr{}
			ipAdEnts[address] = entry
		}
		switch typeId {
		case "1":
			if v, ok := value.([]byte); ok {
				entry.IP = v
			}
		case "2":
			if v, ok := value.(int64); ok {
				entry.InterfaceId = v
			}
		case "3":
			if v, ok := value.([]byte); ok {
				entry.Mask = v
			}
		}
	}
	// Group the collected networks by the interface they belong to.
	ipsByInt := make(map[int64][]net.IPNet)
	for _, ipNet := range ipAdEnts {
		ipsByInt[ipNet.InterfaceId] = append(ipsByInt[ipNet.InterfaceId], ipNet.IPNet)
	}
	for intId, ipNets := range ipsByInt {
		ips := make([]string, 0, len(ipNets))
		for _, ipNet := range ipNets {
			ips = append(ips, ipNet.String())
		}
		// Sort so the published value is stable between collections.
		sort.Strings(ips)
		j, err := json.Marshal(ips)
		if err != nil {
			slog.Errorf("error marshaling ips for host %v: %v", host, err)
			// Do not publish an empty address list for this interface.
			continue
		}
		metadata.AddMeta("", opentsdb.TagSet{"host": host, "iface": fmt.Sprintf("%v", intId)}, "addresses", string(j), false)
	}
	return nil, nil
}
Example #8
0
// DoPost sends payload to the notification's Post URL using the configured
// content type; ak is only used to identify the alert in the success log line.
// When a Body template is set, it is executed with payload (as a string) and
// its output is posted instead. Template errors, transport errors and
// responses with a status of 300 or above are logged; nothing is returned.
func (n *Notification) DoPost(payload []byte, ak string) {
	if n.Body != nil {
		var rendered bytes.Buffer
		if err := n.Body.Execute(&rendered, string(payload)); err != nil {
			slog.Errorln(err)
			return
		}
		payload = rendered.Bytes()
	}

	resp, err := http.Post(n.Post.String(), n.ContentType, bytes.NewBuffer(payload))
	// Close the body whenever one was handed back, even alongside an error.
	if resp != nil && resp.Body != nil {
		defer resp.Body.Close()
	}
	switch {
	case err != nil:
		slog.Error(err)
	case resp.StatusCode >= 300:
		slog.Errorln("bad response on notification post:", resp.Status)
	default:
		slog.Infof("post notification successful for alert %s. Response code %d.", ak, resp.StatusCode)
	}
}
Example #9
0
// sendMetadata posts ms as a JSON array to metahost. Encoding errors,
// transport errors and any response status other than 204 are logged; nothing
// is returned to the caller.
func sendMetadata(ms []Metasend) {
	b, err := json.Marshal(&ms)
	if err != nil {
		slog.Error(err)
		return
	}
	resp, err := http.Post(metahost, "application/json", bytes.NewBuffer(b))
	if err != nil {
		slog.Error(err)
		return
	}
	// The body must be closed on every path, otherwise the underlying
	// connection is leaked.
	defer resp.Body.Close()
	if resp.StatusCode != 204 {
		slog.Errorln("bad metadata return:", resp.Status)
	}
}
Example #10
0
// ExecuteBody renders the body template of alert a for state st and returns the
// rendered bytes together with the attachments collected on the template
// context. All results are nil when the alert has no template or the template
// has no body. A template execution error is returned to the caller.
//
// The rendered output is passed through inliner.Inline; if that fails the
// error is only logged and the un-inlined output is returned.
func (s *Schedule) ExecuteBody(rh *RunHistory, a *conf.Alert, st *State, isEmail bool) ([]byte, []*conf.Attachment, error) {
	tmpl := a.Template
	if tmpl == nil || tmpl.Body == nil {
		return nil, nil, nil
	}

	data := s.Data(rh, st, a, isEmail)
	var rendered bytes.Buffer
	if err := tmpl.Body.Execute(&rendered, data); err != nil {
		return nil, nil, err
	}

	out := rendered.Bytes()
	inlined, err := inliner.Inline(rendered.String())
	if err != nil {
		slog.Errorln(err)
	} else {
		out = []byte(inlined)
	}
	return out, data.Attachments, nil
}
Example #11
0
// sendMetadata posts ms as a JSON array to the currently selected entry of
// metahosts. A transport error is logged and rotates currentmetahost to the
// next host. A response status other than 200 or 204 is logged. Nothing is
// returned to the caller.
func sendMetadata(ms []Metasend) {
	payload, err := json.Marshal(&ms)
	if err != nil {
		slog.Error(err)
		return
	}

	target := metahosts[currentmetahost]
	resp, err := http.Post(target, "application/json", bytes.NewBuffer(payload))
	if err != nil {
		slog.Error(err)
		// Try the next host on the following send.
		currentmetahost = (currentmetahost + 1) % len(metahosts)
		return
	}
	defer resp.Body.Close()

	switch resp.StatusCode {
	case http.StatusNoContent, http.StatusOK:
		// Accepted.
	default:
		slog.Errorln("bad metadata return:", resp.Status)
	}
}
Example #12
0
// utnotify sends one notification through n that summarises all of the given
// unknown groups, instead of one notification per group. The subject carries
// the total number of alert keys across the groups, and the body is rendered
// from the unknownMultiGroup template with the groups and the configured
// unknown threshold. A template error is logged and the notification is still
// sent with whatever was rendered.
func (s *Schedule) utnotify(groups map[string]models.AlertKeys, n *conf.Notification) {
	stamp := utcNow()
	count := 0
	for _, keys := range groups {
		// Don't know what the following line does, just copied from unotify
		s.Group[stamp] = keys
		count += len(keys)
	}

	subject := fmt.Sprintf("%v unknown alert instances suppressed", count)
	data := struct {
		Groups    map[string]models.AlertKeys
		Threshold int
	}{
		Groups:    groups,
		Threshold: s.SystemConf.GetUnknownThreshold(),
	}
	rendered := new(bytes.Buffer)
	if err := unknownMultiGroup.Execute(rendered, data); err != nil {
		slog.Errorln(err)
	}
	n.Notify(subject, rendered.String(), []byte(subject), rendered.Bytes(), s.SystemConf, "unknown_treshold")
}
Example #13
0
// watch registers every entry under root whose base name matches pattern with
// a new fsnotify watcher, then starts a goroutine that calls f whenever a
// write event arrives. After f has run, further events are ignored for two
// seconds. Watcher errors are logged.
//
// Any failure while setting up (creating the watcher, walking root, matching
// the pattern, adding a path) is reported through slog.Fatal. The goroutine
// and the watcher are never stopped by this function.
func watch(root, pattern string, f func()) {
	watcher, err := fsnotify.NewWatcher()
	if err != nil {
		slog.Fatal(err)
	}
	walkErr := filepath.Walk(root, func(path string, info os.FileInfo, err error) error {
		// When Walk could not stat or read path it passes the error in and
		// info may be nil, so it must be checked before info is touched.
		if err != nil {
			return err
		}
		if matched, err := filepath.Match(pattern, info.Name()); err != nil {
			slog.Fatal(err)
		} else if !matched {
			return nil
		}
		err = watcher.Add(path)
		if err != nil {
			slog.Fatal(err)
		}
		return nil
	})
	if walkErr != nil {
		slog.Fatal(walkErr)
	}
	slog.Infoln("watching", pattern, "in", root)
	// wait is the earliest time at which the next event may trigger f.
	wait := time.Now()
	go func() {
		for {
			select {
			case event := <-watcher.Events:
				if wait.After(time.Now()) {
					continue
				}
				if event.Op&fsnotify.Write == fsnotify.Write {
					f()
					wait = time.Now().Add(time.Second * 2)
				}
			case err := <-watcher.Errors:
				slog.Errorln("error:", err)
			}
		}
	}()
}
Example #14
0
// pingHost resolves host as an IPv4 ICMP address, pings it once with a five
// second maximum round trip time and records three metrics tagged with
// dst_host: ping.resolved (1 if the name resolved, else 0), ping.rtt in
// milliseconds for each reply received, and ping.timeout (0 if any reply was
// received, else 1). If resolution fails only ping.resolved is recorded.
func pingHost(host string) {
	pinger := fastping.NewPinger()
	tags := opentsdb.TagSet{"dst_host": host}

	// Reported on every exit path; the closure reads the final value.
	resolved := 0
	defer func() {
		collect.Put("ping.resolved", tags, resolved)
	}()

	addr, err := net.ResolveIPAddr("ip4:icmp", host)
	if err != nil {
		return
	}
	resolved = 1

	pinger.AddIPAddr(addr)
	pinger.MaxRTT = time.Second * 5

	// Assume a timeout until a reply proves otherwise.
	timedOut := 1
	pinger.OnRecv = func(_ *net.IPAddr, rtt time.Duration) {
		collect.Put("ping.rtt", tags, float64(rtt)/float64(time.Millisecond))
		timedOut = 0
	}
	if err := pinger.Run(); err != nil {
		slog.Errorln(err)
	}
	collect.Put("ping.timeout", tags, timedOut)
}
Example #15
0
File: check.go Project: eswdd/bosun
// runHistory folds one check result (event) for alert key ak into the stored
// state of that key and decides whether notifications must be sent.
//
// It loads or creates the state, records the event's result, continues or
// creates an incident for the event, appends the event to the state's history,
// renders templates for abnormal events and, while holding the schedule lock,
// notifies when the event's status is worse than the worst status seen so far
// in the current incident. It returns true if at least one notification was
// issued through s.Notify.
//
// silenced holds the active silences per alert key; it is consulted for the
// auto-forget and auto-close behaviour below.
func (s *Schedule) runHistory(r *RunHistory, ak models.AlertKey, event *Event, silenced map[models.AlertKey]models.Silence) bool {
	checkNotify := false
	// get existing state object for alert key. add to schedule status if doesn't already exist
	state := s.GetStatus(ak)
	if state == nil {
		state = NewStatus(ak)
		s.SetStatus(ak, state)
	}
	// Store the state again on every exit path. The arguments are evaluated
	// here, at the defer statement.
	// NOTE(review): presumably state is a pointer, so later mutations are
	// visible to the deferred call — confirm against NewStatus/GetStatus.
	defer s.SetStatus(ak, state)
	// make sure we always touch the state.
	state.Touched = r.Start
	// set state.Result according to event result
	if event.Crit != nil {
		state.Result = event.Crit
	} else if event.Warn != nil {
		state.Result = event.Warn
	}
	// if event is unevaluated, we are done.
	state.Unevaluated = event.Unevaluated
	if event.Unevaluated {
		return checkNotify
	}
	// assign incident id to new event if applicable
	prev := state.Last()
	// worst is the status the new event must exceed to trigger a notification.
	worst := StNormal
	event.Time = r.Start
	if prev.IncidentId != 0 {
		// If last event has incident id and is not closed, we continue it.
		incident, err := s.DataAccess.Incidents().GetIncident(prev.IncidentId)
		if err != nil {
			slog.Error(err)
		} else if incident.End == nil {
			event.IncidentId = prev.IncidentId
			worst = state.WorstThisIncident()
		}
	}
	// An abnormal event that did not continue an incident starts a new one.
	if event.IncidentId == 0 && event.Status != StNormal {
		incident, err := s.createIncident(ak, event.Time)
		if err != nil {
			slog.Error("Error creating incident", err)
		} else {
			event.IncidentId = incident.Id
		}
	}

	state.Append(event)
	// NOTE(review): a is used below without a nil check; this relies on the
	// alert named by ak still being present in s.Conf.Alerts — confirm.
	a := s.Conf.Alerts[ak.Name()]
	// render templates and open alert key if abnormal
	if event.Status > StNormal {
		s.executeTemplates(state, event, a, r)
		state.Open = true
		if a.Log {
			// Log alerts are never left open, and resetting worst means every
			// abnormal event passes the "event.Status > worst" test below.
			worst = StNormal
			state.Open = false
		}
	}
	// On state increase, clear old notifications and notify current.
	// If the old alert was not acknowledged, do nothing.
	// Do nothing if state did not change.
	notify := func(ns *conf.Notifications) {
		if a.Log {
			// Rate limit log alerts to one notification per MaxLogFrequency.
			lastLogTime := state.LastLogTime
			now := time.Now()
			if now.Before(lastLogTime.Add(a.MaxLogFrequency)) {
				return
			}
			state.LastLogTime = now
		}
		nots := ns.Get(s.Conf, state.Group)
		for _, n := range nots {
			s.Notify(state, n)
			checkNotify = true
		}
	}
	notifyCurrent := func() {
		// Auto close ignoreUnknowns.
		if a.IgnoreUnknown && event.Status == StUnknown {
			state.Open = false
			state.Forgotten = true
			state.NeedAck = false
			state.Action("bosun", "Auto close because alert has ignoreUnknown.", ActionClose, event.Time)
			slog.Infof("auto close %s because alert has ignoreUnknown", ak)
			return
		} else if silenced[ak].Forget && event.Status == StUnknown {
			// A missing map entry yields the zero Silence, so Forget is false
			// when ak is not silenced.
			state.Open = false
			state.Forgotten = true
			state.NeedAck = false
			state.Action("bosun", "Auto close because alert is silenced and marked auto forget.", ActionClose, event.Time)
			slog.Infof("auto close %s because alert is silenced and marked auto forget", ak)
			return
		}
		state.NeedAck = true
		// Unknown events use the critical notification chain.
		switch event.Status {
		case StCritical, StUnknown:
			notify(a.CritNotification)
		case StWarning:
			notify(a.WarnNotification)
		}
	}
	clearOld := func() {
		state.NeedAck = false
		delete(s.Notifications, ak)
	}

	// lock while we change notifications.
	s.Lock("RunHistory")
	if event.Status > worst {
		clearOld()
		notifyCurrent()
	} else if _, ok := silenced[ak]; ok && event.Status == StNormal {
		// The close runs in its own goroutine rather than inline.
		// NOTE(review): presumably because s.Action needs the schedule lock
		// that is held here — confirm against s.Action.
		go func(ak models.AlertKey) {
			slog.Infof("auto close %s because was silenced", ak)
			err := s.Action("bosun", "Auto close because was silenced.", ActionClose, ak)
			if err != nil {
				slog.Errorln(err)
			}
		}(ak)
	}

	s.Unlock()
	return checkNotify
}
Example #16
0
// RestoreState restores notification and alert state from the file on disk.
//
// Each piece of state is read from its own key in the bolt bucket dbBucket,
// where it is stored as gzip-compressed gob data. A failure to decode any
// single piece is logged and restoration continues with the remaining pieces,
// so the returned error is always nil. The schedule lock and the search lock
// are held for the whole call, and bosunStartupTime is set when it returns.
func (s *Schedule) RestoreState() error {
	defer func() {
		bosunStartupTime = time.Now()
	}()
	slog.Infoln("RestoreState")
	start := time.Now()
	s.Lock("RestoreState")
	defer s.Unlock()
	s.Search.Lock()
	defer s.Search.Unlock()
	s.Notifications = nil
	// decode reads the value stored under name and gob-decodes its gunzipped
	// contents into dst.
	decode := func(name string, dst interface{}) error {
		var data []byte
		err := s.db.View(func(tx *bolt.Tx) error {
			b := tx.Bucket([]byte(dbBucket))
			if b == nil {
				return fmt.Errorf("unknown bucket: %v", dbBucket)
			}
			data = b.Get([]byte(name))
			return nil
		})
		if err != nil {
			return err
		}
		gr, err := gzip.NewReader(bytes.NewReader(data))
		if err != nil {
			return err
		}
		defer gr.Close()
		return gob.NewDecoder(gr).Decode(dst)
	}
	if err := decode(dbMetadata, &s.Metadata); err != nil {
		slog.Errorln(dbMetadata, err)
	}
	if err := decode(dbMetricMetadata, &s.metricMetadata); err != nil {
		slog.Errorln(dbMetricMetadata, err)
	}
	// Entries named desc, rate or unit are re-added through PutMetadata and
	// removed from the generic metadata map.
	for k, v := range s.Metadata {
		if k.Name == "desc" || k.Name == "rate" || k.Name == "unit" {
			s.PutMetadata(k, v.Value)
			delete(s.Metadata, k)
		}
	}
	if err := decode(dbMetric, &s.Search.Metric); err != nil {
		slog.Errorln(dbMetric, err)
	}
	if err := decode(dbTagk, &s.Search.Tagk); err != nil {
		slog.Errorln(dbTagk, err)
	}
	if err := decode(dbTagv, &s.Search.Tagv); err != nil {
		slog.Errorln(dbTagv, err)
	}
	if err := decode(dbMetricTags, &s.Search.MetricTags); err != nil {
		slog.Errorln(dbMetricTags, err)
	}
	// Notifications are decoded into a temporary map and re-registered per
	// alert key in the status loop below.
	notifications := make(map[expr.AlertKey]map[string]time.Time)
	if err := decode(dbNotifications, &notifications); err != nil {
		slog.Errorln(dbNotifications, err)
	}
	if err := decode(dbSilence, &s.Silence); err != nil {
		slog.Errorln(dbSilence, err)
	}
	if err := decode(dbIncidents, &s.Incidents); err != nil {
		slog.Errorln(dbIncidents, err)
	}
	if err := decode(dbErrors, &s.AlertStatuses); err != nil {
		slog.Errorln(dbErrors, err)
	}

	// Calculate next incident id.
	for _, i := range s.Incidents {
		if i.Id > s.maxIncidentId {
			s.maxIncidentId = i.Id
		}
	}
	status := make(States)
	if err := decode(dbStatus, &status); err != nil {
		slog.Errorln(dbStatus, err)
	}
	// clear drops the stored computations of a result; nil results are ignored.
	clear := func(r *Result) {
		if r == nil {
			return
		}
		r.Computations = nil
	}
	for ak, st := range status {
		a, present := s.Conf.Alerts[ak.Name()]
		if !present {
			slog.Errorln("sched: alert no longer present, ignoring:", ak)
			continue
		} else if s.Conf.Squelched(a, st.Group) {
			slog.Infoln("sched: alert now squelched:", ak)
			continue
		} else {
			// t is the alert's unknown duration, falling back to the global
			// check frequency. Only when both are zero is a trailing unknown
			// event followed by a synthetic normal event.
			t := a.Unknown
			if t == 0 {
				t = s.Conf.CheckFrequency
			}
			if t == 0 && st.Last().Status == StUnknown {
				st.Append(&Event{Status: StNormal, IncidentId: st.Last().IncidentId})
			}
		}
		clear(st.Result)
		newHistory := []Event{}
		for _, e := range st.History {
			clear(e.Warn)
			clear(e.Crit)
			// Remove error events which no longer are a thing.
			if e.Status <= StUnknown {
				newHistory = append(newHistory, e)
			}
		}
		st.History = newHistory
		s.status[ak] = st
		if a.Log && st.Open {
			st.Open = false
			slog.Infof("sched: alert %s is now log, closing, was %s", ak, st.Status())
		}
		// Re-register the saved notifications for this key, skipping ones whose
		// notification no longer exists and all of them for log alerts.
		for name, t := range notifications[ak] {
			n, present := s.Conf.Notifications[name]
			if !present {
				slog.Infoln("sched: notification not present during restore:", name)
				continue
			}
			if a.Log {
				slog.Infoln("sched: alert is now log, removing notification:", ak)
				continue
			}
			s.AddNotification(ak, n, t)
		}
	}
	// No incident ids were restored, so build incidents from history instead.
	if s.maxIncidentId == 0 {
		s.createHistoricIncidents()
	}

	s.Search.Copy()
	slog.Infoln("RestoreState done in", time.Since(start))
	return nil
}
Example #17
0
File: conf.go Project: rajder/bosun
// loadNotification builds a Notification from the config section s and
// registers it in c.Notifications under the section's name.
//
// The notification starts with a form-urlencoded content type and
// RunOnActions enabled, then each key/value pair of the section overrides or
// extends it. Problems (duplicate name, unknown key, unparsable value, unknown
// "next" notification, timeout without next) are reported through c.errorf or
// c.error.
// NOTE(review): the code after each c.errorf/c.error call only makes sense if
// those helpers abort loading (for example by panicking) — confirm.
func (c *Conf) loadNotification(s *parse.SectionNode) {
	name := s.Name.Text
	if _, ok := c.Notifications[name]; ok {
		c.errorf("duplicate notification name: %s", name)
	}
	n := Notification{
		Vars:         make(map[string]string),
		ContentType:  "application/x-www-form-urlencoded",
		Name:         name,
		RunOnActions: true,
	}
	n.Text = s.RawText
	// Functions made available to the notification's body template.
	funcs := ttemplate.FuncMap{
		// V expands a variable reference using this notification's variables.
		"V": func(v string) string {
			return c.Expand(v, n.Vars, false)
		},
		// json marshals v; a marshal error is logged and whatever was
		// produced is returned.
		"json": func(v interface{}) string {
			b, err := json.Marshal(v)
			if err != nil {
				slog.Errorln(err)
			}
			return string(b)
		},
	}
	// Registered before the keys are parsed, so the "next" lookup below can
	// already see this notification.
	c.Notifications[name] = &n
	pairs := c.getPairs(s, n.Vars, sNormal)
	for _, p := range pairs {
		c.at(p.node)
		v := p.val
		switch k := p.key; k {
		case "email":
			if c.SMTPHost == "" || c.EmailFrom == "" {
				c.errorf("email notifications require both smtpHost and emailFrom to be set")
			}
			n.email = v
			email, err := mail.ParseAddressList(n.email)
			if err != nil {
				c.error(err)
			}
			n.Email = email
		case "post":
			n.post = v
			post, err := url.Parse(n.post)
			if err != nil {
				c.error(err)
			}
			n.Post = post
		case "get":
			n.get = v
			get, err := url.Parse(n.get)
			if err != nil {
				c.error(err)
			}
			n.Get = get
		case "print":
			// The value is ignored; the key's presence enables printing.
			n.Print = true
		case "contentType":
			n.ContentType = v
		case "next":
			n.next = v
			next, ok := c.Notifications[n.next]
			if !ok {
				c.errorf("unknown notification %s", n.next)
			}
			n.Next = next
		case "timeout":
			d, err := opentsdb.ParseDuration(v)
			if err != nil {
				c.error(err)
			}
			n.Timeout = time.Duration(d)
		case "body":
			n.body = v
			tmpl := ttemplate.New(name).Funcs(funcs)
			_, err := tmpl.Parse(n.body)
			if err != nil {
				c.error(err)
			}
			n.Body = tmpl
		case "runOnActions":
			// Only the exact string "true" keeps it enabled.
			n.RunOnActions = v == "true"
		default:
			c.errorf("unknown key %s", k)
		}
	}
	c.at(s)
	if n.Timeout > 0 && n.Next == nil {
		c.errorf("timeout specified without next")
	}
}
Example #18
0
// CheckExpr evaluates expression e for alert a and records the outcome of
// every result group in rh.Events, creating events as needed. checkStatus is
// the severity being checked (warning or critical) and selects which result
// slot of the event is filled. Groups that are squelched or whose alert key is
// listed in ignore are skipped.
//
// A NaN value maps to the error status, zero maps to normal and any other
// value maps to checkStatus. An event's status is only ever raised. The alert
// keys of all non-normal results are returned.
//
// A nil e yields no alerts and no error. Any error (execution failure or a
// result that is neither a number nor a scalar) is counted in the
// "check.errs" metric, logged and returned.
func (s *Schedule) CheckExpr(T miniprofiler.Timer, rh *RunHistory, a *conf.Alert, e *expr.Expr, checkStatus Status, ignore expr.AlertKeys) (alerts expr.AlertKeys, err error) {
	if e == nil {
		return
	}
	// Report whatever error the function ends up returning.
	defer func() {
		if err != nil {
			collect.Add("check.errs", opentsdb.TagSet{"metric": a.Name}, 1)
			slog.Errorln(err)
		}
	}()

	results, err := s.executeExpr(T, rh, a, e)
	if err != nil {
		return nil, err
	}

	for _, r := range results.Results {
		if s.Conf.Squelched(a, r.Group) {
			continue
		}
		ak := expr.NewAlertKey(a.Name, r.Group)

		skip := false
		for _, ignored := range ignore {
			if ak == ignored {
				skip = true
				break
			}
		}
		if skip {
			continue
		}

		var value float64
		switch v := r.Value.(type) {
		case expr.Number:
			value = float64(v)
		case expr.Scalar:
			value = float64(v)
		default:
			err = fmt.Errorf("expected number or scalar")
			return alerts, err
		}

		event := rh.Events[ak]
		if event == nil {
			event = new(Event)
			rh.Events[ak] = event
		}
		result := &Result{
			Result: r,
			Expr:   e.String(),
		}
		switch checkStatus {
		case StWarning:
			event.Warn = result
		case StCritical:
			event.Crit = result
		}

		status := checkStatus
		switch {
		case math.IsNaN(value):
			status = StError
		case value == 0:
			status = StNormal
		}
		if status != StNormal {
			alerts = append(alerts, ak)
		}
		// Never lower a status that an earlier check already raised.
		if status > event.Status {
			event.Status = status
		}
	}
	return alerts, err
}
Example #19
0
// CollectStates sends various state information to bosun with collect.
//
// For every configured alert it reports, counted over the currently open
// states only:
//   - alerts.current_severity and alerts.last_abnormal_severity per severity,
//   - alerts.acknowledgement_status for acknowledged and unacknowledged,
//   - alerts.active_status for active and inactive.
//
// Every counter is initialised to zero first so that a value is sent even when
// no state matches. Errors from collect.Put are logged individually.
func (s *Schedule) CollectStates() {
	// [AlertName][Severity]Count
	severityCounts := make(map[string]map[string]int64)
	abnormalCounts := make(map[string]map[string]int64)
	ackStatusCounts := make(map[string]map[bool]int64)
	activeStatusCounts := make(map[string]map[bool]int64)
	// Initialize the Counts
	for _, alert := range s.Conf.Alerts {
		severityCounts[alert.Name] = make(map[string]int64)
		abnormalCounts[alert.Name] = make(map[string]int64)
		// Walk the statuses from 1 upwards until the name "none" is reached.
		var i Status
		for i = 1; i.String() != "none"; i++ {
			severityCounts[alert.Name][i.String()] = 0
			abnormalCounts[alert.Name][i.String()] = 0
		}
		ackStatusCounts[alert.Name] = make(map[bool]int64)
		activeStatusCounts[alert.Name] = make(map[bool]int64)
		ackStatusCounts[alert.Name][false] = 0
		activeStatusCounts[alert.Name][false] = 0
		ackStatusCounts[alert.Name][true] = 0
		activeStatusCounts[alert.Name][true] = 0
	}
	for _, state := range s.status {
		if !state.Open {
			continue
		}
		severity := state.Status().String()
		lastAbnormal := state.AbnormalStatus().String()
		severityCounts[state.Alert][severity]++
		abnormalCounts[state.Alert][lastAbnormal]++
		ackStatusCounts[state.Alert][state.NeedAck]++
		activeStatusCounts[state.Alert][state.IsActive()]++
	}
	for alertName := range severityCounts {
		ts := opentsdb.TagSet{"alert": alertName}
		// The tagset of the alert is not included because there is no way to
		// store the string of a group in OpenTSDB in a parsable way. This is
		// because any delimiter we chose could also be part of a tag key or tag
		// value.

		// put sends one counter tagged with the alert name plus one extra
		// tag, and logs a failure. Routing every send through it guarantees
		// that no error is dropped.
		put := func(metric, tagKey, tagValue string, count int64) {
			err := collect.Put(metric, ts.Copy().Merge(opentsdb.TagSet{tagKey: tagValue}), count)
			if err != nil {
				slog.Errorln(err)
			}
		}
		for severity := range severityCounts[alertName] {
			put("alerts.current_severity", "severity", severity, severityCounts[alertName][severity])
			put("alerts.last_abnormal_severity", "severity", severity, abnormalCounts[alertName][severity])
		}
		// The ack counters are keyed by NeedAck: true means still unacknowledged.
		put("alerts.acknowledgement_status", "status", "unacknowledged", ackStatusCounts[alertName][true])
		put("alerts.acknowledgement_status", "status", "acknowledged", ackStatusCounts[alertName][false])
		put("alerts.active_status", "status", "active", activeStatusCounts[alertName][true])
		put("alerts.active_status", "status", "inactive", activeStatusCounts[alertName][false])
	}
}
Example #20
0
// CheckExpr evaluates expression e for alert a and records the outcome of
// every result group in rh.Events, creating events as needed. checkStatus is
// the severity being checked (warning or critical) and selects which result
// slot of the event is filled. Groups that are squelched or whose alert key is
// listed in ignore are skipped.
//
// The expression runs in its own goroutine. If s.runnerContext is done before
// it finishes, CheckExpr returns immediately with cancelled set to true and no
// alerts or error.
//
// A zero value maps to normal; any other value, including NaN, maps to
// checkStatus. An event's status is only ever raised. The alert keys of all
// non-normal results are returned. A nil e yields no alerts and no error. Any
// returned error is also counted in the "check.errs" metric and logged.
func (s *Schedule) CheckExpr(T miniprofiler.Timer, rh *RunHistory, a *conf.Alert, e *expr.Expr, checkStatus models.Status, ignore models.AlertKeys) (alerts models.AlertKeys, err error, cancelled bool) {
	if e == nil {
		return
	}
	// Report whatever error the function ends up returning (named result).
	defer func() {
		if err == nil {
			return
		}
		collect.Add("check.errs", opentsdb.TagSet{"metric": a.Name}, 1)
		slog.Errorln(err)
	}()
	type res struct {
		results *expr.Results
		error   error
	}
	// See s.CheckAlert for an explanation of execution and cancellation with this channel
	// The buffer of one lets the goroutine send its result and exit even when
	// nobody receives because the select below took the cancellation branch.
	rc := make(chan res, 1)
	var results *expr.Results
	go func() {
		// These are new variables local to the goroutine; they shadow the
		// outer results and err, which are only set from the channel below.
		results, err := s.executeExpr(T, rh, a, e)
		rc <- res{results, err}
	}()
	select {
	case res := <-rc:
		results = res.results
		err = res.error
	case <-s.runnerContext.Done():
		return nil, nil, true
	}
	if err != nil {
		return
	}
Loop:
	for _, r := range results.Results {
		if s.RuleConf.Squelched(a, r.Group) {
			continue
		}
		ak := models.NewAlertKey(a.Name, r.Group)
		for _, v := range ignore {
			if ak == v {
				continue Loop
			}
		}
		var n float64
		n, err = valueToFloat(r.Value)
		if err != nil {
			// Naked return: alerts collected so far are returned with err.
			return
		}
		event := rh.Events[ak]
		if event == nil {
			event = new(models.Event)
			rh.Events[ak] = event
		}
		result := &models.Result{
			Computations: r.Computations,
			Value:        models.Float(n),
			Expr:         e.String(),
		}
		switch checkStatus {
		case models.StWarning:
			event.Warn = result
		case models.StCritical:
			event.Crit = result
		}
		status := checkStatus
		if math.IsNaN(n) {
			// NaN keeps the severity being checked; this assignment does not
			// change status.
			status = checkStatus
		} else if n == 0 {
			status = models.StNormal
		}
		if status != models.StNormal {
			alerts = append(alerts, ak)
		}
		// Never lower a status that an earlier check already raised.
		if status > rh.Events[ak].Status {
			event.Status = status
		}
	}
	return
}
Example #21
0
// c_procstats_linux collects host-wide statistics from sysinfo(2) and from
// several /proc files (meminfo, vmstat, stat, cpuinfo, entropy_avail,
// interrupts, net/sockstat, net/netstat, net/snmp and sys/fs/file-nr).
//
// Each source is processed independently: a failure is logged and remembered,
// and collection continues with the next source. The returned error is the
// last failure seen (nil if none); the datapoints gathered from the sources
// that did succeed are returned alongside it.
func c_procstats_linux() (opentsdb.MultiDataPoint, error) {
	var md opentsdb.MultiDataPoint
	var Error error
	var sys unix.Sysinfo_t

	// Metrics derived from sys are only emitted when the call succeeded, so a
	// failed syscall is not reported as zero uptime, memory and load.
	sysOK := true
	if err := unix.Sysinfo(&sys); err != nil {
		sysOK = false
		Error = err
		slog.Errorln(err)
	}

	if sysOK {
		Add(&md, "linux.uptime_total", sys.Uptime, nil, metadata.Gauge, metadata.Second, osSystemUptimeDesc)
		Add(&md, osSystemUptime, sys.Uptime, nil, metadata.Gauge, metadata.Second, osSystemUptimeDesc)
	}
	if err := readLine("/proc/meminfo", func(s string) error {
		s = strings.TrimSuffix(s, " kB")
		m := strings.Split(s, ":")
		// Lines without a "name: value" shape carry nothing to report.
		if len(m) < 2 {
			return nil
		}
		m[1] = strings.TrimSpace(m[1])
		Add(&md, "linux.mem."+strings.ToLower(m[0]), m[1], nil, metadata.Gauge, metadata.KBytes, "")
		return nil
	}); err != nil {
		Error = err
		slog.Errorln(err)
	}
	if sysOK {
		Add(&md, osMemTotal, sys.Totalram*uint64(sys.Unit), nil, metadata.Gauge, metadata.Bytes, osMemTotalDesc)
		Add(&md, osMemFree, sys.Freeram*uint64(sys.Unit), nil, metadata.Gauge, metadata.Bytes, osMemFreeDesc)
		Add(&md, osMemUsed, (sys.Totalram-sys.Freeram)*uint64(sys.Unit), nil, metadata.Gauge, metadata.Bytes, osMemUsedDesc)
		Add(&md, "linux.loadavg.1_min", sys.Loads[0], nil, metadata.Gauge, metadata.Load, "")
		Add(&md, "linux.loadavg.5_min", sys.Loads[1], nil, metadata.Gauge, metadata.Load, "")
		Add(&md, "linux.loadavg.15_min", sys.Loads[2], nil, metadata.Gauge, metadata.Load, "")
		Add(&md, "linux.loadavg.total_threads", sys.Procs, nil, metadata.Gauge, metadata.Process, "")
		if sys.Totalram != 0 {
			// Divide as floats: integer division would yield 0 for every
			// host whose free memory is below its total memory.
			pctFree := float64(sys.Freeram) / float64(sys.Totalram) * 100
			Add(&md, osMemPctFree, pctFree, nil, metadata.Gauge, metadata.Pct, osMemFreeDesc)
		}
	}
	if err := readLine("/proc/vmstat", func(s string) error {
		m := strings.Split(s, " ")
		// Every branch below reads m[1]; skip lines without a value.
		if len(m) < 2 {
			return nil
		}
		switch m[0] {
		case "pgpgin", "pgpgout", "pswpin", "pswpout":
			// Fold the in/out pairs into one metric with a direction tag.
			switch {
			case strings.HasSuffix(m[0], "in"):
				Add(&md, "linux.mem."+strings.TrimSuffix(m[0], "in"), m[1], opentsdb.TagSet{"direction": "in"}, metadata.Counter, metadata.Page, "")
			case strings.HasSuffix(m[0], "out"):
				Add(&md, "linux.mem."+strings.TrimSuffix(m[0], "out"), m[1], opentsdb.TagSet{"direction": "out"}, metadata.Counter, metadata.Page, "")
			}
		case "pgfault", "pgmajfault":
			Add(&md, "linux.mem."+m[0], m[1], nil, metadata.Counter, metadata.Page, "")
		default:
			Add(&md, "linux.mem."+m[0], m[1], nil, metadata.Counter, metadata.None, "")
		}
		return nil
	}); err != nil {
		slog.Errorln(err)
		Error = err
	}
	num_cores := 0
	var t_util int
	if err := readLine("/proc/stat", func(s string) error {
		m := strings.Fields(s)
		// Blank or value-less lines would otherwise index out of range.
		if len(m) < 2 {
			return nil
		}
		switch {
		case strings.HasPrefix(m[0], "cpu"):
			// "cpu" is the aggregate line, "cpuN" a single core.
			tag_cpu := strings.TrimPrefix(m[0], "cpu")
			if tag_cpu != "" {
				num_cores++
			}
			for i, value := range m[1:] {
				if i >= len(cpu_fields) {
					break
				}
				tags := opentsdb.TagSet{
					"type": cpu_fields[i],
				}
				if tag_cpu != "" {
					tags["cpu"] = tag_cpu
					Add(&md, "linux.cpu.percpu", value, tags, metadata.Counter, metadata.CHz, cpu_stat_desc[i])
				} else {
					Add(&md, "linux.cpu", value, tags, metadata.Counter, metadata.CHz, cpu_stat_desc[i])
				}
			}
			if tag_cpu == "" {
				// user, nice and system are the first three values.
				if len(m) < 4 {
					return nil
				}
				user, err := strconv.Atoi(m[1])
				if err != nil {
					slog.Errorln(err)
					return nil
				}
				nice, err := strconv.Atoi(m[2])
				if err != nil {
					slog.Errorln(err)
					return nil
				}
				system, err := strconv.Atoi(m[3])
				if err != nil {
					slog.Errorln(err)
					return nil
				}
				t_util = user + nice + system
			}
		case m[0] == "intr":
			Add(&md, "linux.intr", m[1], nil, metadata.Counter, metadata.Interupt, "")
		case m[0] == "ctxt":
			Add(&md, "linux.ctxt", m[1], nil, metadata.Counter, metadata.ContextSwitch, "")
		case m[0] == "processes":
			Add(&md, "linux.processes", m[1], nil, metadata.Counter, metadata.Process,
				"The number  of processes and threads created, which includes (but  is not limited  to) those  created by  calls to the  fork() and clone() system calls.")
		case m[0] == "procs_blocked":
			Add(&md, "linux.procs_blocked", m[1], nil, metadata.Gauge, metadata.Process, "The  number of  processes currently blocked, waiting for I/O to complete.")
		}
		return nil
	}); err != nil {
		slog.Errorln(err)
		Error = err
	}
	if num_cores != 0 && t_util != 0 {
		Add(&md, osCPU, t_util/num_cores, nil, metadata.Counter, metadata.Pct, "")
	}
	cpuinfo_index := 0
	if err := readLine("/proc/cpuinfo", func(s string) error {
		m := strings.Split(s, ":")
		if len(m) < 2 {
			return nil
		}
		m[0] = strings.TrimSpace(m[0])
		m[1] = strings.TrimSpace(m[1])
		if m[0] != "cpu MHz" {
			return nil
		}
		// The cpu tag is the ordinal of the "cpu MHz" line within the file.
		tags := opentsdb.TagSet{"cpu": strconv.Itoa(cpuinfo_index)}
		Add(&md, osCPUClock, m[1], tags, metadata.Gauge, metadata.MHz, osCPUClockDesc)
		Add(&md, "linux.cpu.clock", m[1], tags, metadata.Gauge, metadata.MHz, osCPUClockDesc)
		cpuinfo_index++
		return nil
	}); err != nil {
		slog.Errorln(err)
		Error = err
	}
	if err := readLine("/proc/sys/kernel/random/entropy_avail", func(s string) error {
		Add(&md, "linux.entropy_avail", strings.TrimSpace(s), nil, metadata.Gauge, metadata.Entropy, "The remaing amount of entropy available to the system. If it is low or hitting zero processes might be blocked waiting for extropy")
		return nil
	}); err != nil {
		slog.Errorln(err)
		Error = err
	}
	irq_type_desc := map[string]string{
		"NMI": "Non-maskable interrupts.",
		"LOC": "Local timer interrupts.",
		"SPU": "Spurious interrupts.",
		"PMI": "Performance monitoring interrupts.",
		"IWI": "IRQ work interrupts.",
		"RES": "Rescheduling interrupts.",
		"CAL": "Funcation call interupts.",
		"TLB": "TLB (translation lookaside buffer) shootdowns.",
		"TRM": "Thermal event interrupts.",
		"THR": "Threshold APIC interrupts.",
		"MCE": "Machine check exceptions.",
		"MCP": "Machine Check polls.",
	}
	num_cpus := 0
	if err := readLine("/proc/interrupts", func(s string) error {
		cols := strings.Fields(s)
		// The first line is the header; its column count is the CPU count.
		if num_cpus == 0 {
			num_cpus = len(cols)
			return nil
		} else if len(cols) < 2 {
			return nil
		}
		tags := opentsdb.TagSet{}
		irq := strings.TrimRight(cols[0], ":")
		tags["irq"] = irq
		if len(cols) == 2 {
			// Rows such as a single system-wide counter have no per-CPU split.
			Add(&md, "linux.interrupts", cols[1], tags, metadata.Counter, metadata.Interupt, "")
			return nil
		}
		device := ""
		if _, err := strconv.Atoi(irq); err == nil {
			// Numbered IRQs: the column after the per-CPU counts names the
			// controller and the last column names the device.
			if len(cols) >= num_cpus+3 && (strings.HasPrefix(cols[num_cpus+1], "IR-") || strings.HasPrefix(cols[num_cpus+1], "IO-") || strings.HasPrefix(cols[num_cpus+1], "PCI-")) {
				device = strings.ToLower(strings.Join([]string{"linux.interrupts", cols[num_cpus+1]}, "."))
				tags["dev"] = strings.Join(cols[len(cols)-1:], " ")
			}
		}
		// Columns 1..num_cpus hold the per-CPU counts. Iterating the tail and
		// stopping after num_cpus values covers every CPU (including the last
		// one) and cannot slice past the end of a short row.
		for i, val := range cols[1:] {
			if i >= num_cpus {
				break
			}
			tags["cpu"] = strconv.Itoa(i)
			if device == "" {
				Add(&md, "linux.interrupts", val, tags, metadata.Counter, metadata.Interupt, irq_type_desc[irq])
			} else {
				Add(&md, device, val, tags, metadata.Counter, metadata.Interupt, irq_type_desc[irq])
			}
		}
		return nil
	}); err != nil {
		slog.Errorln(err)
		Error = err
	}
	if err := readLine("/proc/net/sockstat", func(s string) error {
		cols := strings.Fields(s)
		// A blank line has no protocol label to switch on.
		if len(cols) == 0 {
			return nil
		}
		switch cols[0] {
		case "sockets:":
			if len(cols) < 3 {
				return fmt.Errorf("sockstat: error parsing sockets line")
			}
			Add(&md, "linux.net.sockets.used", cols[2], nil, metadata.Gauge, metadata.Socket, "")
		case "TCP:":
			if len(cols) < 11 {
				return fmt.Errorf("sockstat: error parsing tcp line")
			}
			Add(&md, "linux.net.sockets.tcp_in_use", cols[2], nil, metadata.Gauge, metadata.Socket, "")
			Add(&md, "linux.net.sockets.tcp_orphaned", cols[4], nil, metadata.Gauge, metadata.Socket, "")
			Add(&md, "linux.net.sockets.tcp_time_wait", cols[6], nil, metadata.Gauge, metadata.Socket, "")
			Add(&md, "linux.net.sockets.tcp_allocated", cols[8], nil, metadata.Gauge, metadata.None, "")
			Add(&md, "linux.net.sockets.tcp_mem", cols[10], nil, metadata.Gauge, metadata.None, "")
		case "UDP:":
			if len(cols) < 5 {
				return fmt.Errorf("sockstat: error parsing udp line")
			}
			Add(&md, "linux.net.sockets.udp_in_use", cols[2], nil, metadata.Gauge, metadata.Socket, "")
			Add(&md, "linux.net.sockets.udp_mem", cols[4], nil, metadata.Gauge, metadata.Page, "")
		case "UDPLITE:":
			if len(cols) < 3 {
				return fmt.Errorf("sockstat: error parsing udplite line")
			}
			Add(&md, "linux.net.sockets.udplite_in_use", cols[2], nil, metadata.Gauge, metadata.Socket, "")
		case "RAW:":
			if len(cols) < 3 {
				return fmt.Errorf("sockstat: error parsing raw line")
			}
			Add(&md, "linux.net.sockets.raw_in_use", cols[2], nil, metadata.Gauge, metadata.Socket, "")
		case "FRAG:":
			if len(cols) < 5 {
				return fmt.Errorf("sockstat: error parsing frag line")
			}
			Add(&md, "linux.net.sockets.frag_in_use", cols[2], nil, metadata.Gauge, metadata.Socket, "")
			Add(&md, "linux.net.sockets.frag_mem", cols[4], nil, metadata.Gauge, metadata.Bytes, "")
		}
		return nil
	}); err != nil {
		slog.Errorln(err)
		Error = err
	}
	ln := 0
	var headers []string
	if err := readLine("/proc/net/netstat", func(s string) error {
		cols := strings.Fields(s)
		// Lines alternate: a header line followed by its value line.
		if ln%2 == 0 {
			headers = cols
		} else {
			if len(cols) < 1 || len(cols) != len(headers) {
				return fmt.Errorf("netstat: parsing failed")
			}
			root := strings.ToLower(strings.TrimSuffix(headers[0], "Ext:"))
			for i, v := range cols[1:] {
				// i indexes cols[1:]; shift it to line up with headers.
				i++
				m := "linux.net.stat." + root + "." + strings.TrimPrefix(strings.ToLower(headers[i]), "tcp")
				Add(&md, m, v, nil, metadata.Counter, metadata.None, "")
			}
		}
		ln++
		return nil
	}); err != nil {
		slog.Errorln(err)
		Error = err
	}
	ln = 0
	if err := readLine("/proc/net/snmp", func(s string) error {
		ln++
		// Odd lines are headers, even lines the matching values.
		if ln%2 != 0 {
			f := strings.Fields(s)
			if len(f) < 2 {
				return fmt.Errorf("Failed to parse header line")
			}
			headers = f
		} else {
			values := strings.Fields(s)
			if len(values) != len(headers) {
				return fmt.Errorf("Mismatched header and value length")
			}
			proto := strings.ToLower(strings.TrimSuffix(values[0], ":"))
			for i, v := range values {
				if i == 0 {
					continue
				}
				// Everything is a counter except the "rto*" settings.
				var stype metadata.RateType = metadata.Counter
				stat := strings.ToLower(headers[i])
				if strings.HasPrefix(stat, "rto") {
					stype = metadata.Gauge
				}
				Add(&md, "linux.net.stat."+proto+"."+stat, v, nil, stype, metadata.None, "")
			}
		}
		return nil
	}); err != nil {
		slog.Errorln(err)
		Error = err
	}
	if err := readLine("/proc/sys/fs/file-nr", func(s string) error {
		f := strings.Fields(s)
		if len(f) != 3 {
			return fmt.Errorf("unexpected number of fields")
		}
		v, err := strconv.ParseInt(f[0], 10, 64)
		if err != nil {
			slog.Errorln(err)
			return err
		}
		Add(&md, "linux.fs.open", v, nil, metadata.Gauge, metadata.Count, "The number of files presently open.")
		return nil
	}); err != nil {
		slog.Errorln(err)
		Error = err
	}
	return md, Error
}
Example #22
0
// c_varnish_unix runs "varnishstat -j" (5 second timeout), decodes its JSON
// output and emits one "varnish.<lowercased name>" datapoint per statistic.
//
// The "timestamp" entry is skipped, and entries that fail to decode are
// logged and skipped. Entries of type "VBE" have their ident removed from the
// metric name and turned into "backend" and "endpoint" tags; if the ident
// does not have the expected "name(host,,port)" shape the entry is dropped.
// Flags "a" and "c" are reported as counters, everything else as gauges;
// format "B" is reported in bytes, everything else as a count.
func c_varnish_unix() (opentsdb.MultiDataPoint, error) {
	const metric = "varnish."

	out, err := util.Command(5*time.Second, nil, "varnishstat", "-j")
	if err != nil {
		return nil, err
	}

	var stats varnishStats
	if err = json.NewDecoder(out).Decode(&stats); err != nil {
		return nil, err
	}

	var md opentsdb.MultiDataPoint
	for name, raw := range stats {
		if name == "timestamp" {
			continue
		}

		var v varnishStat
		if decodeErr := json.Unmarshal(raw, &v); decodeErr != nil {
			slog.Errorln("varnish parser error:", name, decodeErr)
			continue
		}

		ts := opentsdb.TagSet{}

		// Backend stats carry the backend name, host and port in the "ident"
		// field, formatted like "name(host,,port)". Move them into tags and
		// strip them from the metric name.
		if v.Type == "VBE" {
			ident := v.SubType
			name = strings.Replace(name, "."+ident, "", -1)

			paren := strings.Index(ident, "(")
			if paren < 0 || len(ident)-paren < 4 {
				// output format changed, ignore
				continue
			}

			fields := strings.Split(ident[paren+1:len(ident)-1], ",")
			if len(fields) != 3 {
				// output format changed, ignore
				continue
			}

			ts.Merge(opentsdb.TagSet{"backend": ident[:paren]})
			ts.Merge(opentsdb.TagSet{"endpoint": fields[0] + "_" + fields[2]})
		}

		rate := metadata.RateType(metadata.Gauge)
		switch v.Flag {
		case "a", "c":
			rate = metadata.Counter
		}

		unit := metadata.Unit(metadata.Count)
		if v.Format == "B" {
			unit = metadata.Bytes
		}

		Add(&md, metric+strings.ToLower(name), v.Value, ts, rate, unit, v.Desc)
	}
	return md, nil
}
Example #23
0
// CollectStates sends various state information to bosun with collect.
//
// For every configured alert it reports counts by current severity, by last
// abnormal severity, by acknowledgement status and by active status; for
// every configured notification it reports counts by acknowledgement status
// and the age in seconds of the oldest unacknowledged incident. Failures of
// collect.Put are logged and do not stop the remaining metrics.
//
// The loop that tallies the live alert states is currently commented out
// (see the TODO below), so every count is reported with its initial value of
// zero.
func (s *Schedule) CollectStates() {
	// Sentinel meaning "no unacknowledged incident seen" for a notification.
	noUnacked := time.Unix(1<<63-62135596801, 999999999)

	// [AlertName][Severity]Count
	severityCounts := make(map[string]map[string]int64)
	abnormalCounts := make(map[string]map[string]int64)
	ackStatusCounts := make(map[string]map[bool]int64)
	ackByNotificationCounts := make(map[string]map[bool]int64)
	unAckOldestByNotification := make(map[string]time.Time)
	activeStatusCounts := make(map[string]map[bool]int64)
	// Initialize the counts so that every series is emitted even when zero.
	for _, alert := range s.Conf.Alerts {
		severityCounts[alert.Name] = make(map[string]int64)
		abnormalCounts[alert.Name] = make(map[string]int64)
		// Walk the status values starting at 1 until the name becomes "none".
		var i models.Status
		for i = 1; i.String() != "none"; i++ {
			severityCounts[alert.Name][i.String()] = 0
			abnormalCounts[alert.Name][i.String()] = 0
		}
		ackStatusCounts[alert.Name] = make(map[bool]int64)
		activeStatusCounts[alert.Name] = make(map[bool]int64)
		ackStatusCounts[alert.Name][false] = 0
		activeStatusCounts[alert.Name][false] = 0
		ackStatusCounts[alert.Name][true] = 0
		activeStatusCounts[alert.Name][true] = 0
	}
	for notificationName := range s.Conf.Notifications {
		unAckOldestByNotification[notificationName] = noUnacked
		ackByNotificationCounts[notificationName] = make(map[bool]int64)
		ackByNotificationCounts[notificationName][false] = 0
		ackByNotificationCounts[notificationName][true] = 0
	}
	//TODO:
	//	for _, state := range s.status {
	//		if !state.Open {
	//			continue
	//		}
	//		name := state.AlertKey.Name()
	//		alertDef := s.Conf.Alerts[name]
	//		nots := make(map[string]bool)
	//		for name := range alertDef.WarnNotification.Get(s.Conf, state.Group) {
	//			nots[name] = true
	//		}
	//		for name := range alertDef.CritNotification.Get(s.Conf, state.Group) {
	//			nots[name] = true
	//		}
	//		incident, err := s.GetIncident(state.Last().IncidentId)
	//		if err != nil {
	//			slog.Errorln(err)
	//		}
	//		for notificationName := range nots {
	//			ackByNotificationCounts[notificationName][state.NeedAck]++
	//			if incident != nil && incident.Start.Before(unAckOldestByNotification[notificationName]) && state.NeedAck {
	//				unAckOldestByNotification[notificationName] = incident.Start
	//			}
	//		}
	//		severity := state.CurrentStatus.String()
	//		lastAbnormal := state.LastAbnormalStatus.String()
	//		severityCounts[state.Alert][severity]++
	//		abnormalCounts[state.Alert][lastAbnormal]++
	//		ackStatusCounts[state.Alert][state.NeedAck]++
	//		activeStatusCounts[state.Alert][state.IsActive()]++
	//	}
	// The bool key of the ack maps is "needs ack": true means unacknowledged.
	for notification := range ackByNotificationCounts {
		ts := opentsdb.TagSet{"notification": notification}
		err := collect.Put("alerts.acknowledgement_status_by_notification",
			ts.Copy().Merge(opentsdb.TagSet{"status": "unacknowledged"}),
			ackByNotificationCounts[notification][true])
		if err != nil {
			slog.Errorln(err)
		}
		err = collect.Put("alerts.acknowledgement_status_by_notification",
			ts.Copy().Merge(opentsdb.TagSet{"status": "acknowledged"}),
			ackByNotificationCounts[notification][false])
		if err != nil {
			slog.Errorln(err)
		}
	}
	for notification, timeStamp := range unAckOldestByNotification {
		ts := opentsdb.TagSet{"notification": notification}
		// Report an age of zero when nothing is waiting for acknowledgement.
		var ago time.Duration
		if !timeStamp.Equal(noUnacked) {
			ago = utcNow().Sub(timeStamp)
		}
		err := collect.Put("alerts.oldest_unacked_by_notification",
			ts,
			ago.Seconds())
		if err != nil {
			slog.Errorln(err)
		}
	}
	for alertName := range severityCounts {
		ts := opentsdb.TagSet{"alert": alertName}
		// The tagset of the alert is not included because there is no way to
		// store the string of a group in OpenTSBD in a parsable way. This is
		// because any delimiter we chose could also be part of a tag key or tag
		// value.
		for severity := range severityCounts[alertName] {
			err := collect.Put("alerts.current_severity",
				ts.Copy().Merge(opentsdb.TagSet{"severity": severity}),
				severityCounts[alertName][severity])
			if err != nil {
				slog.Errorln(err)
			}
			err = collect.Put("alerts.last_abnormal_severity",
				ts.Copy().Merge(opentsdb.TagSet{"severity": severity}),
				abnormalCounts[alertName][severity])
			if err != nil {
				slog.Errorln(err)
			}
		}
		err := collect.Put("alerts.acknowledgement_status",
			ts.Copy().Merge(opentsdb.TagSet{"status": "unacknowledged"}),
			ackStatusCounts[alertName][true])
		// Check this result before err is reused by the next Put; it used to
		// be overwritten without ever being inspected.
		if err != nil {
			slog.Errorln(err)
		}
		err = collect.Put("alerts.acknowledgement_status",
			ts.Copy().Merge(opentsdb.TagSet{"status": "acknowledged"}),
			ackStatusCounts[alertName][false])
		if err != nil {
			slog.Errorln(err)
		}
		err = collect.Put("alerts.active_status",
			ts.Copy().Merge(opentsdb.TagSet{"status": "active"}),
			activeStatusCounts[alertName][true])
		if err != nil {
			slog.Errorln(err)
		}
		err = collect.Put("alerts.active_status",
			ts.Copy().Merge(opentsdb.TagSet{"status": "inactive"}),
			activeStatusCounts[alertName][false])
		if err != nil {
			slog.Errorln(err)
		}
	}
}
Example #24
0
// c_cisco_ios collects CPU and memory-pool metrics from a Cisco IOS device
// over SNMP.
//
// host and community identify the device; cpuIntegrator is handed to
// ciscoCPU. A failure of the CPU step or of the memory pool table walk aborts
// collection and is returned together with whatever datapoints were gathered
// so far. Per-pool used/free values are emitted as "cisco.mem.*" tagged with
// the pool name, and their sums are emitted as the generic OS memory metrics.
// Malformed table entries are logged and skipped.
func c_cisco_ios(host, community string, cpuIntegrator tsIntegrator) (opentsdb.MultiDataPoint, error) {
	var md opentsdb.MultiDataPoint
	ts := opentsdb.TagSet{"host": host}
	// CPU
	if err := ciscoCPU(host, community, ts, cpuIntegrator, &md); err != nil {
		return md, err
	}
	// Memory
	memRaw, err := snmp_subtree(host, community, ciscoBaseOID+ciscoMemoryPoolTable)
	if err != nil {
		return md, fmt.Errorf("failed to get ciscoMemoryPoolTable for host %v: %v", host, err)
	}
	idToPoolEntry := make(map[string]*ciscoMemoryPoolEntry)
	for id, value := range memRaw {
		// Sub OIDs are split as "<column>.<entry id>".
		sp := strings.SplitN(id, ".", 2)
		if len(sp) != 2 {
			slog.Errorf("unexpected length of snmp sub OID (%v) for ciscoMemoryPoolTable for host %v", id, host)
			// Without both parts there is no entry to attribute the value to.
			continue
		}
		columnID, entryID := sp[0], sp[1]
		// Fetch the entry for this row, creating it on first sight.
		entry, ok := idToPoolEntry[entryID]
		if !ok {
			entry = &ciscoMemoryPoolEntry{}
			idToPoolEntry[entryID] = entry
		}
		switch columnID {
		case "2":
			if v, ok := value.([]byte); ok {
				entry.PoolType = string(v)
			} else {
				slog.Errorf("failed to convert memory pool label %v to []byte for host %v", value, host)
			}
		case "5":
			if v, ok := value.(int64); ok {
				entry.Used = v
			} else {
				slog.Errorf("failed to convert used memory value %v to int64 for host %v", value, host)
			}
		case "6":
			if v, ok := value.(int64); ok {
				entry.Free = v
			} else {
				slog.Errorf("failed to convert free memory value %v to int64 for host %v", value, host)
			}
		}
	}
	var totalFreeMem int64
	var totalUsedMem int64
	for _, entry := range idToPoolEntry {
		ts := ts.Copy().Merge(opentsdb.TagSet{"name": entry.PoolType})
		Add(&md, "cisco.mem.used", entry.Used, ts, metadata.Gauge, metadata.Bytes, ciscoMemoryPoolUsedDesc)
		Add(&md, "cisco.mem.free", entry.Free, ts, metadata.Gauge, metadata.Bytes, ciscoMemoryPoolFreeDesc)
		totalFreeMem += entry.Free
		totalUsedMem += entry.Used
	}
	Add(&md, osMemFree, totalFreeMem, ts, metadata.Gauge, metadata.Bytes, osMemFreeDesc)
	Add(&md, osMemUsed, totalUsedMem, ts, metadata.Gauge, metadata.Bytes, osMemUsedDesc)
	totalMem := totalFreeMem + totalUsedMem
	Add(&md, osMemTotal, totalMem, ts, metadata.Gauge, metadata.Bytes, osMemTotalDesc)
	// With no memory reported the ratio is 0/0; skip the percentage rather
	// than emit a meaningless value.
	if totalMem != 0 {
		Add(&md, osMemPctFree, int64(float64(totalFreeMem)/float64(totalMem)*100), ts, metadata.Gauge, metadata.Pct, osMemPctFreeDesc)
	}
	return md, nil
}
Example #25
0
// RestoreState restores notification and alert state from the file on disk.
//
// It holds both the schedule lock and the search lock for its whole duration.
// Decode failures for any of the stored sections are logged and otherwise
// ignored, so the function always returns nil. States whose alert is gone
// from the configuration or is now squelched are dropped. bosunStartupTime is
// set when the function returns.
func (s *Schedule) RestoreState() error {
	defer func() {
		bosunStartupTime = time.Now()
	}()
	slog.Infoln("RestoreState")
	began := time.Now()
	s.Lock("RestoreState")
	defer s.Unlock()
	s.Search.Lock()
	defer s.Search.Unlock()

	s.Notifications = nil
	db := s.db
	notifications := make(map[expr.AlertKey]map[string]time.Time)
	if err := decode(db, dbNotifications, &notifications); err != nil {
		slog.Errorln(dbNotifications, err)
	}
	if err := decode(db, dbSilence, &s.Silence); err != nil {
		slog.Errorln(dbSilence, err)
	}
	if err := decode(db, dbIncidents, &s.Incidents); err != nil {
		slog.Errorln(dbIncidents, err)
	}

	// Calculate next incident id.
	for _, inc := range s.Incidents {
		if inc.Id > s.maxIncidentId {
			s.maxIncidentId = inc.Id
		}
	}
	status := make(States)
	if err := decode(db, dbStatus, &status); err != nil {
		slog.Errorln(dbStatus, err)
	}
	// stripComputations discards the stored computations of a result, if any.
	stripComputations := func(res *Result) {
		if res != nil {
			res.Computations = nil
		}
	}
	for ak, st := range status {
		a, present := s.Conf.Alerts[ak.Name()]
		if !present {
			slog.Errorln("sched: alert no longer present, ignoring:", ak)
			continue
		}
		if s.Conf.Squelched(a, st.Group) {
			slog.Infoln("sched: alert now squelched:", ak)
			continue
		}
		// When neither the alert nor the configuration gives an interval, a
		// trailing unknown status is replaced by a normal event on the same
		// incident.
		interval := a.Unknown
		if interval == 0 {
			interval = s.Conf.CheckFrequency
		}
		if interval == 0 && st.Last().Status == StUnknown {
			st.Append(&Event{Status: StNormal, IncidentId: st.Last().IncidentId})
		}

		stripComputations(st.Result)
		kept := make([]Event, 0, len(st.History))
		for _, ev := range st.History {
			stripComputations(ev.Warn)
			stripComputations(ev.Crit)
			// Remove error events which no longer are a thing.
			if ev.Status > StUnknown {
				continue
			}
			kept = append(kept, ev)
		}
		st.History = kept
		s.status[ak] = st
		if a.Log && st.Open {
			st.Open = false
			slog.Infof("sched: alert %s is now log, closing, was %s", ak, st.Status())
		}
		for name, when := range notifications[ak] {
			n, known := s.Conf.Notifications[name]
			if !known {
				slog.Infoln("sched: notification not present during restore:", name)
				continue
			}
			if a.Log {
				slog.Infoln("sched: alert is now log, removing notification:", ak)
				continue
			}
			s.AddNotification(ak, n, when)
		}
	}
	if s.maxIncidentId == 0 {
		s.createHistoricIncidents()
	}
	migrateOldDataToRedis(db, s.DataAccess)
	// delete metrictags if they exist.
	deleteKey(s.db, "metrictags")
	slog.Infoln("RestoreState done in", time.Since(began))
	return nil
}
Example #26
0
// sendBatch delivers one batch of encoded datapoints.
//
// When Print is set the datapoints are only logged and counted as sent.
// Otherwise the batch is JSON-encoded, gzip-compressed and POSTed to tsdbURL.
// Any response other than 204 No Content, or a transport error, is treated as
// a failure: the datapoints are decoded and pushed back onto tchan, and the
// function sleeps five seconds before returning. Encoding and request
// construction failures are logged and the batch is dropped.
func sendBatch(batch []json.RawMessage) {
	if Print {
		for i := range batch {
			slog.Info(string(batch[i]))
		}
		recordSent(len(batch))
		return
	}

	// Build the gzip-compressed JSON payload.
	var payload bytes.Buffer
	zw := gzip.NewWriter(&payload)
	err := json.NewEncoder(zw).Encode(batch)
	if err == nil {
		err = zw.Close()
	}
	if err != nil {
		slog.Error(err)
		return
	}

	req, err := http.NewRequest("POST", tsdbURL, &payload)
	if err != nil {
		slog.Error(err)
		return
	}
	req.Header.Set("Content-Type", "application/json")
	req.Header.Set("Content-Encoding", "gzip")

	sentAt := time.Now()
	resp, err := client.Do(req)
	elapsedMs := time.Since(sentAt).Nanoseconds() / 1e6
	if err == nil {
		defer resp.Body.Close()
	}
	Add("collect.post.total_duration", Tags, elapsedMs)
	Add("collect.post.count", Tags, 1)

	if err == nil && resp.StatusCode == http.StatusNoContent {
		recordSent(len(batch))
		return
	}

	// Some problem with connecting to the server; retry later.
	if err != nil {
		Add("collect.post.error", Tags, 1)
		slog.Error(err)
	} else {
		Add("collect.post.bad_status", Tags, 1)
		slog.Errorln(resp.Status)
		body, readErr := ioutil.ReadAll(resp.Body)
		if readErr != nil {
			slog.Error(readErr)
		}
		if len(body) > 0 {
			slog.Error(string(body))
		}
	}

	// Put every datapoint that still decodes back on the queue.
	restored := 0
	for _, msg := range batch {
		dp := new(opentsdb.DataPoint)
		if decodeErr := json.Unmarshal(msg, dp); decodeErr != nil {
			slog.Error(decodeErr)
			continue
		}
		restored++
		tchan <- dp
	}
	const backoff = time.Second * 5
	Add("collect.post.restore", Tags, int64(restored))
	slog.Infof("restored %d, sleeping %s", restored, backoff)
	time.Sleep(backoff)
}
Example #27
0
// RunHistory for a single alert key. Returns true if notifications were altered.
//
// The event is stamped with the run's start time, folded into the alert key's
// open incident (creating one when the event is abnormal and none exists),
// and notifications are sent when the incident is new, reaches a new worst
// status, resolves from warning/critical to normal, or re-alerts after
// having gone normal.
//
// On return the incident is saved, or, when there is nothing to save, the
// unevaluated flag is stored for the alert key. An error raised earlier in
// the run takes precedence over the result of that save.
func (s *Schedule) runHistory(r *RunHistory, ak models.AlertKey, event *models.Event, silenced SilenceTester) (checkNotify bool, err error) {
	event.Time = r.Start
	a := s.Conf.Alerts[ak.Name()]
	if a.UnknownsNormal && event.Status == models.StUnknown {
		event.Status = models.StNormal
	}

	data := s.DataAccess.State()
	err = data.TouchAlertKey(ak, utcNow())
	if err != nil {
		return
	}

	si := silenced(ak)

	// get existing open incident if exists
	var incident *models.IncidentState
	incident, err = data.GetOpenIncident(ak)
	if err != nil {
		return
	}
	defer func() {
		var saveErr error
		// save unless incident is new and closed (log alert)
		if incident != nil && (incident.Id != 0 || incident.Open) {
			_, saveErr = data.UpdateIncidentState(incident)
		} else {
			saveErr = data.SetUnevaluated(ak, event.Unevaluated) // if nothing to save, at least store the unevaluated state
		}
		// A successful save must not hide an error that made the run bail
		// out early; report the first failure and log any follow-up one.
		if err == nil {
			err = saveErr
		} else if saveErr != nil {
			slog.Errorln(saveErr)
		}
	}()
	// If nothing is out of the ordinary we are done
	if event.Status <= models.StNormal && incident == nil {
		return
	}

	// if event is unevaluated, we are done also.
	if incident != nil {
		incident.Unevaluated = event.Unevaluated
	}
	if event.Unevaluated {
		return
	}

	shouldNotify := false
	newIncident := false
	if incident == nil {
		incident = NewIncident(ak)
		newIncident = true
		shouldNotify = true
	}

	// VICTOROPS INTEGRATION: Enables notification of incidents which have returned to normal (Sends normNotification defined in config)
	if event.Status <= models.StNormal && (incident.CurrentStatus == models.StWarning || incident.CurrentStatus == models.StCritical) {
		slog.Infof("TRIGGER_RESOLVED: from %s to %s", incident.CurrentStatus, event.Status)
		shouldNotify = true
	}

	// VICTOROPS INTEGRATION:  Enables notification of Incidents which have returned to normal but are now back to warning or critical. i.e. enable Flapping
	if incident.CurrentStatus == models.StNormal && (event.Status == models.StCritical || event.Status == models.StWarning) {
		slog.Infof("TRIGGER_REALERT: from %s to %s", incident.CurrentStatus, event.Status)
		shouldNotify = true
	}

	// set state.Result according to event result
	if event.Status == models.StCritical {
		incident.Result = event.Crit
	} else if event.Status == models.StWarning {
		incident.Result = event.Warn
	}

	if event.Status > models.StNormal {
		incident.LastAbnormalStatus = event.Status
		incident.LastAbnormalTime = event.Time.UTC().Unix()
	}
	if event.Status > incident.WorstStatus {
		incident.WorstStatus = event.Status
		shouldNotify = true
	}
	// Only status transitions are appended to the incident's event list.
	if event.Status != incident.CurrentStatus {
		incident.Events = append(incident.Events, *event)
	}
	incident.CurrentStatus = event.Status

	//run a preliminary save on new incidents to get an id
	if newIncident {
		if a.Log || silencedOrIgnored(a, event, si) {
			//a log or silenced/ignored alert will not need to be saved
		} else {
			incident.Id, err = s.DataAccess.State().UpdateIncidentState(incident)
			if err != nil {
				return
			}
		}
	}

	//render templates and open alert key if abnormal
	if event.Status > models.StNormal {
		s.executeTemplates(incident, event, a, r)
		incident.Open = true
		if a.Log {
			incident.Open = false
		}
	}

	// On state increase, clear old notifications and notify current.
	// Do nothing if state did not change.
	notify := func(ns *conf.Notifications) {
		if a.Log {
			// Log alerts are rate limited by MaxLogFrequency per alert key.
			lastLogTime := s.lastLogTimes[ak]
			now := utcNow()
			if now.Before(lastLogTime.Add(a.MaxLogFrequency)) {
				return
			}
			s.lastLogTimes[ak] = now
		}
		nots := ns.Get(s.Conf, incident.AlertKey.Group())
		for _, n := range nots {
			s.Notify(incident, n)
			checkNotify = true
		}
	}

	notifyCurrent := func() {
		//Auto close ignoreUnknowns for new incident.
		if silencedOrIgnored(a, event, si) {
			incident.Open = false
			return
		}
		// VICTOROPS INTEGRATION
		incident.NeedAck = false
		switch event.Status {
		case models.StCritical, models.StUnknown:
			notify(a.CritNotification)
		case models.StWarning:
			notify(a.WarnNotification)
		case models.StNormal:
			// VICTOROPS INTEGRATION
			incident.NeedAck = false
			notify(a.NormNotification)
		}
	}

	// lock while we change notifications. The unlock is deferred so that the
	// early return below cannot leave the schedule locked; it still runs
	// before the deferred save above, as the explicit unlock used to.
	s.Lock("RunHistory")
	defer s.Unlock()
	if shouldNotify {
		incident.NeedAck = false
		if err = s.DataAccess.Notifications().ClearNotifications(ak); err != nil {
			return
		}
		notifyCurrent()
	}

	// finally close an open alert with silence once it goes back to normal.
	if si := silenced(ak); si != nil && event.Status == models.StNormal {
		// Closed asynchronously; a failure is only logged.
		go func(ak models.AlertKey) {
			slog.Infof("auto close %s because was silenced", ak)
			err := s.Action("bosun", "Auto close because was silenced.", models.ActionClose, ak)
			if err != nil {
				slog.Errorln(err)
			}
		}(ak)
	}
	return checkNotify, nil
}
Example #28
0
// RunHistory for a single alert key. Returns true if notifications were altered.
//
// The event is stamped with the run's start time and folded into the alert
// key's open incident, creating one when the event is abnormal and none
// exists. Notifications are cleared and re-sent when the incident is new or
// reaches a new worst status. On return the incident is saved, or, when
// there is nothing to save, the unevaluated flag is stored; the result of
// that step becomes the returned error.
func (s *Schedule) runHistory(r *RunHistory, ak models.AlertKey, event *models.Event, silenced SilenceTester) (checkNotify bool, err error) {
	event.Time = r.Start
	store := s.DataAccess.State()
	if err = store.TouchAlertKey(ak, time.Now()); err != nil {
		return
	}
	// get existing open incident if exists
	incident, err := store.GetOpenIncident(ak)
	if err != nil {
		return
	}
	defer func() {
		// save unless incident is new and closed (log alert)
		if incident != nil && (incident.Id != 0 || incident.Open) {
			err = store.UpdateIncidentState(incident)
			return
		}
		// if nothing to save, at least store the unevaluated state
		err = store.SetUnevaluated(ak, event.Unevaluated)
	}()
	// If nothing is out of the ordinary we are done
	if incident == nil && event.Status <= models.StNormal {
		return
	}

	// if event is unevaluated, we are done also.
	if incident != nil {
		incident.Unevaluated = event.Unevaluated
	}
	if event.Unevaluated {
		return
	}

	// A brand new incident always notifies.
	shouldNotify := incident == nil
	if shouldNotify {
		incident = NewIncident(ak)
	}
	// set state.Result according to event result
	switch event.Status {
	case models.StCritical:
		incident.Result = event.Crit
	case models.StWarning:
		incident.Result = event.Warn
	}

	if event.Status > models.StNormal {
		incident.LastAbnormalStatus = event.Status
		incident.LastAbnormalTime = event.Time.UTC().Unix()
	}
	if event.Status > incident.WorstStatus {
		incident.WorstStatus = event.Status
		shouldNotify = true
	}
	// Only status transitions are appended to the incident's event list.
	if event.Status != incident.CurrentStatus {
		incident.Events = append(incident.Events, *event)
	}
	incident.CurrentStatus = event.Status

	a := s.Conf.Alerts[ak.Name()]
	//render templates and open alert key if abnormal
	if event.Status > models.StNormal {
		s.executeTemplates(incident, event, a, r)
		// Log alerts never stay open.
		incident.Open = !a.Log
	}

	// On state increase, clear old notifications and notify current.
	// Do nothing if state did not change.
	notify := func(ns *conf.Notifications) {
		if a.Log {
			// Log alerts are rate limited by MaxLogFrequency per alert key.
			now := time.Now()
			if now.Before(s.lastLogTimes[ak].Add(a.MaxLogFrequency)) {
				return
			}
			s.lastLogTimes[ak] = now
		}
		for _, n := range ns.Get(s.Conf, incident.AlertKey.Group()) {
			s.Notify(incident, n)
			checkNotify = true
		}
	}

	notifyCurrent := func() {
		si := silenced(ak)
		//Auto close ignoreUnknowns for new incident.
		if (a.IgnoreUnknown || (si != nil && si.Forget)) && event.Status == models.StUnknown {
			incident.Open = false
			return
		}
		incident.NeedAck = true
		switch event.Status {
		case models.StCritical, models.StUnknown:
			notify(a.CritNotification)
		case models.StWarning:
			notify(a.WarnNotification)
		}
	}

	// lock while we change notifications.
	s.Lock("RunHistory")
	if shouldNotify {
		// Drop whatever was pending for this key before notifying afresh.
		incident.NeedAck = false
		delete(s.Notifications, ak)
		notifyCurrent()
	}

	// finally close an open alert with silence once it goes back to normal.
	if sil := silenced(ak); sil != nil && event.Status == models.StNormal {
		go func(ak models.AlertKey) {
			slog.Infof("auto close %s because was silenced", ak)
			if actErr := s.Action("bosun", "Auto close because was silenced.", models.ActionClose, ak); actErr != nil {
				slog.Errorln(actErr)
			}
		}(ak)
	}
	s.Unlock()
	return checkNotify, nil
}
Example #29
0
// CheckExpr evaluates expression e for alert a and records the outcome in rh.
//
// A nil e is a no-op. For every result whose group is not squelched and whose
// alert key is not listed in ignore, the event for that key in rh.Events is
// fetched (or created on first use) and the result is stored on its Warn or
// Crit field according to checkStatus. The status computed for a result is
// models.StNormal when its value is exactly zero and checkStatus otherwise
// (NaN included); the event's Status is only ever raised to that value, never
// lowered.
//
// The returned alerts are the keys whose computed status is not
// models.StNormal. Any non-nil error is counted under the "check.errs" metric
// and logged before it is returned; if converting a value fails part-way
// through, the keys collected so far are returned alongside the error.
func (s *Schedule) CheckExpr(T miniprofiler.Timer, rh *RunHistory, a *conf.Alert, e *expr.Expr, checkStatus models.Status, ignore models.AlertKeys) (alerts models.AlertKeys, err error) {
	if e == nil {
		return
	}
	// The closure reads the named result err, so it observes whatever error
	// the function finally returns.
	defer func() {
		if err == nil {
			return
		}
		// Metric reporting is best effort, but a failure to record it should
		// be visible in the log rather than dropped.
		if cerr := collect.Add("check.errs", opentsdb.TagSet{"metric": a.Name}, 1); cerr != nil {
			slog.Errorln(cerr)
		}
		slog.Errorln(err)
	}()
	results, err := s.executeExpr(T, rh, a, e)
	if err != nil {
		return nil, err
	}
Loop:
	for _, r := range results.Results {
		if s.Conf.Squelched(a, r.Group) {
			continue
		}
		ak := models.NewAlertKey(a.Name, r.Group)
		for _, v := range ignore {
			if ak == v {
				continue Loop
			}
		}
		var n float64
		n, err = valueToFloat(r.Value)
		if err != nil {
			return
		}
		event := rh.Events[ak]
		if event == nil {
			event = new(models.Event)
			rh.Events[ak] = event
		}
		result := &models.Result{
			Computations: r.Computations,
			Value:        models.Float(n),
			Expr:         e.String(),
		}
		switch checkStatus {
		case models.StWarning:
			event.Warn = result
		case models.StCritical:
			event.Crit = result
		}
		// Only an exact zero counts as normal. NaN never compares equal to
		// zero, so a NaN value keeps checkStatus.
		status := checkStatus
		if n == 0 {
			status = models.StNormal
		}
		if status != models.StNormal {
			alerts = append(alerts, ak)
		}
		// Never downgrade a status set by an earlier check of the same key.
		if status > event.Status {
			event.Status = status
		}
	}
	return
}
Example #30
0
// RestoreState performs the start-up state handling for the schedule while
// holding both the schedule lock and the search lock.
//
// It resets s.Notifications to nil, decodes the stored notifications bucket
// (a decode failure is logged and otherwise ignored), hands the database to
// migrateOldDataToRedis, and finally deletes the "metrictags" key. The decoded
// notifications are not applied: the code that used to restore status and
// notifications is kept below as a disabled block. bosunStartupTime is stamped
// when the function returns, whether or not the migration succeeded.
//
// The only error returned is the one reported by migrateOldDataToRedis.
func (s *Schedule) RestoreState() error {
	// Registered first so that it runs last, after both locks are released.
	defer func() {
		bosunStartupTime = time.Now()
	}()
	slog.Infoln("RestoreState")
	began := time.Now()

	s.Lock("RestoreState")
	defer s.Unlock()
	s.Search.Lock()
	defer s.Search.Unlock()

	s.Notifications = nil
	store := s.db

	pending := map[models.AlertKey]map[string]time.Time{}
	err := decode(store, dbNotifications, &pending)
	if err != nil {
		// Best effort: a missing or unreadable bucket must not block start-up.
		slog.Errorln(dbNotifications, err)
	}

	// Disabled legacy restore logic, retained for reference.
	//status := make(States)
	//	if err := decode(db, dbStatus, &status); err != nil {
	//		slog.Errorln(dbStatus, err)
	//	}
	//	clear := func(r *models.Result) {
	//		if r == nil {
	//			return
	//		}
	//		r.Computations = nil
	//}
	//TODO: ???
	//	for ak, st := range status {
	//		a, present := s.Conf.Alerts[ak.Name()]
	//		if !present {
	//			slog.Errorln("sched: alert no longer present, ignoring:", ak)
	//			continue
	//		} else if s.Conf.Squelched(a, st.Group) {
	//			slog.Infoln("sched: alert now squelched:", ak)
	//			continue
	//		} else {
	//			t := a.Unknown
	//			if t == 0 {
	//				t = s.Conf.CheckFrequency
	//			}
	//			if t == 0 && st.Last().Status == StUnknown {
	//				st.Append(&Event{Status: StNormal, IncidentId: st.Last().IncidentId})
	//			}
	//		}
	//		clear(st.Result)
	//		newHistory := []Event{}
	//		for _, e := range st.History {
	//			clear(e.Warn)
	//			clear(e.Crit)
	//			// Remove error events which no longer are a thing.
	//			if e.Status <= StUnknown {
	//				newHistory = append(newHistory, e)
	//			}
	//		}
	//		st.History = newHistory
	//		s.status[ak] = st
	//		if a.Log && st.Open {
	//			st.Open = false
	//			slog.Infof("sched: alert %s is now log, closing, was %s", ak, st.Status())
	//		}
	//	for name, t := range notifications[ak] {
	//		n, present := s.Conf.Notifications[name]
	//		if !present {
	//			slog.Infoln("sched: notification not present during restore:", name)
	//			continue
	//		}
	//		if a.Log {
	//			slog.Infoln("sched: alert is now log, removing notification:", ak)
	//			continue
	//		}
	//		s.AddNotification(ak, n, t)
	//	}
	//}

	err = migrateOldDataToRedis(store, s.DataAccess)
	if err != nil {
		return err
	}

	// Drop the metrictags key if it is still present.
	deleteKey(s.db, "metrictags")

	slog.Infoln("RestoreState done in", time.Since(began))
	return nil
}