func (s *Schedule) save() {
	if s.db == nil {
		return
	}
	s.Lock("Save")
	store := map[string]interface{}{
		dbMetric:        s.Search.Read.Metric,
		dbTagk:          s.Search.Read.Tagk,
		dbTagv:          s.Search.Read.Tagv,
		dbMetricTags:    s.Search.Read.MetricTags,
		dbNotifications: s.Notifications,
		dbSilence:       s.Silence,
		dbStatus:        s.status,
		dbMetadata:      s.Metadata,
		dbIncidents:     s.Incidents,
	}
	tostore := make(map[string][]byte)
	for name, data := range store {
		f := new(bytes.Buffer)
		gz := gzip.NewWriter(f)
		cw := &counterWriter{w: gz}
		enc := gob.NewEncoder(cw)
		if err := enc.Encode(data); err != nil {
			slog.Errorf("error saving %s: %v", name, err)
			s.Unlock()
			return
		}
		if err := gz.Flush(); err != nil {
			slog.Errorf("gzip flush error saving %s: %v", name, err)
		}
		if err := gz.Close(); err != nil {
			slog.Errorf("gzip close error saving %s: %v", name, err)
		}
		tostore[name] = f.Bytes()
		slog.Infof("wrote %s: %v", name, conf.ByteSize(cw.written))
		collect.Put("statefile.size", opentsdb.TagSet{"object": name}, cw.written)
	}
	s.Unlock()
	err := s.db.Update(func(tx *bolt.Tx) error {
		b, err := tx.CreateBucketIfNotExists([]byte(dbBucket))
		if err != nil {
			return err
		}
		for name, data := range tostore {
			if err := b.Put([]byte(name), data); err != nil {
				return err
			}
		}
		return nil
	})
	if err != nil {
		slog.Errorf("save db update error: %v", err)
		return
	}
	fi, err := os.Stat(s.Conf.StateFile)
	if err == nil {
		collect.Put("statefile.size", opentsdb.TagSet{"object": "total"}, fi.Size())
	}
	slog.Infoln("save to db complete")
}
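save encodes each object through a counterWriter so the compressed size can be logged and reported via collect. That type is not shown in this section; below is a minimal sketch of what it is assumed to look like (a byte-counting io.Writer wrapper with a "written" field, matching its use in save), not necessarily the exact type in the repository.

package main

import (
	"fmt"
	"io"
	"os"
)

// counterWriter wraps an io.Writer and records how many bytes pass through it.
type counterWriter struct {
	w       io.Writer
	written int64
}

func (c *counterWriter) Write(p []byte) (int, error) {
	n, err := c.w.Write(p)
	c.written += int64(n)
	return n, err
}

func main() {
	cw := &counterWriter{w: os.Stdout}
	fmt.Fprintln(cw, "hello") // writes through the counter
	fmt.Println(cw.written)   // 6
}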
func (s *Schedule) CheckAlert(T miniprofiler.Timer, r *RunHistory, a *conf.Alert) {
	slog.Infof("check alert %v start", a.Name)
	start := utcNow()
	for _, ak := range s.findUnknownAlerts(r.Start, a.Name) {
		r.Events[ak] = &models.Event{Status: models.StUnknown}
	}
	var warns, crits models.AlertKeys
	d, err := s.executeExpr(T, r, a, a.Depends)
	var deps expr.ResultSlice
	if err == nil {
		deps = filterDependencyResults(d)
		crits, err = s.CheckExpr(T, r, a, a.Crit, models.StCritical, nil)
		if err == nil {
			warns, err = s.CheckExpr(T, r, a, a.Warn, models.StWarning, crits)
		}
	}
	unevalCount, unknownCount := markDependenciesUnevaluated(r.Events, deps, a.Name)
	if err != nil {
		slog.Errorf("Error checking alert %s: %s", a.Name, err.Error())
		removeUnknownEvents(r.Events, a.Name)
		s.markAlertError(a.Name, err)
	} else {
		s.markAlertSuccessful(a.Name)
	}
	collect.Put("check.duration", opentsdb.TagSet{"name": a.Name}, time.Since(start).Seconds())
	slog.Infof("check alert %v done (%s): %v crits, %v warns, %v unevaluated, %v unknown", a.Name, time.Since(start), len(crits), len(warns), unevalCount, unknownCount)
}
func (s *Schedule) CheckAlert(T miniprofiler.Timer, r *RunHistory, a *conf.Alert) (cancelled bool) {
	slog.Infof("check alert %v start", a.Name)
	start := utcNow()
	for _, ak := range s.findUnknownAlerts(r.Start, a.Name) {
		r.Events[ak] = &models.Event{Status: models.StUnknown}
	}
	var warns, crits models.AlertKeys
	type res struct {
		results *expr.Results
		error   error
	}
	// Buffered channel so the goroutine that runs executeExpr won't leak if the
	// check is cancelled by the closing of the schedule.
	rc := make(chan res, 1)
	var d *expr.Results
	var err error
	go func() {
		d, err := s.executeExpr(T, r, a, a.Depends)
		rc <- res{d, err} // this would hang forever if the channel weren't buffered, since nothing would ever receive from rc
	}()
	select {
	case res := <-rc:
		d = res.results
		err = res.error
	// If the schedule closes before the expression has finished executing, we
	// abandon the execution of the expression.
	case <-s.runnerContext.Done():
		return true
	}
	var deps expr.ResultSlice
	if err == nil {
		deps = filterDependencyResults(d)
		crits, err, cancelled = s.CheckExpr(T, r, a, a.Crit, models.StCritical, nil)
		if err == nil && !cancelled {
			warns, err, cancelled = s.CheckExpr(T, r, a, a.Warn, models.StWarning, crits)
		}
	}
	if cancelled {
		return true
	}
	unevalCount, unknownCount := markDependenciesUnevaluated(r.Events, deps, a.Name)
	if err != nil {
		slog.Errorf("Error checking alert %s: %s", a.Name, err.Error())
		removeUnknownEvents(r.Events, a.Name)
		s.markAlertError(a.Name, err)
	} else {
		s.markAlertSuccessful(a.Name)
	}
	collect.Put("check.duration", opentsdb.TagSet{"name": a.Name}, time.Since(start).Seconds())
	slog.Infof("check alert %v done (%s): %v crits, %v warns, %v unevaluated, %v unknown", a.Name, time.Since(start), len(crits), len(warns), unevalCount, unknownCount)
	return false
}
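The select in the cancellable CheckAlert above works because the result channel is buffered: even if the schedule's runnerContext is cancelled and CheckAlert returns early, the expression goroutine can still complete its send and exit rather than block forever. A self-contained sketch of that pattern follows; the names slowWork and runCancellable are hypothetical and not part of Bosun.

package main

import (
	"context"
	"fmt"
	"time"
)

// slowWork stands in for an expensive expression evaluation.
func slowWork() (int, error) {
	time.Sleep(2 * time.Second)
	return 42, nil
}

// runCancellable returns (result, err, cancelled). The result channel is
// buffered so the worker goroutine can always complete its send and exit,
// even if the caller has already returned because ctx was cancelled.
func runCancellable(ctx context.Context) (int, error, bool) {
	type res struct {
		v   int
		err error
	}
	rc := make(chan res, 1)
	go func() {
		v, err := slowWork()
		rc <- res{v, err}
	}()
	select {
	case r := <-rc:
		return r.v, r.err, false
	case <-ctx.Done():
		return 0, nil, true
	}
}

func main() {
	ctx, cancel := context.WithTimeout(context.Background(), time.Second)
	defer cancel()
	v, err, cancelled := runCancellable(ctx)
	fmt.Println(v, err, cancelled) // 0 <nil> true: the context expired before the work finished
}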
func pingHost(host string) {
	p := fastping.NewPinger()
	tags := opentsdb.TagSet{"dst_host": host}
	resolved := 0
	defer func() {
		collect.Put("ping.resolved", tags, resolved)
	}()
	ra, err := net.ResolveIPAddr("ip4:icmp", host)
	if err != nil {
		return
	}
	resolved = 1
	p.AddIPAddr(ra)
	p.MaxRTT = time.Second * 5
	timeout := 1
	p.OnRecv = func(addr *net.IPAddr, t time.Duration) {
		collect.Put("ping.rtt", tags, float64(t)/float64(time.Millisecond))
		timeout = 0
	}
	if err := p.Run(); err != nil {
		log.Print(err)
	}
	collect.Put("ping.timeout", tags, timeout)
}
func (s *Schedule) CheckAlert(T miniprofiler.Timer, r *RunHistory, a *conf.Alert) {
	log.Printf("check alert %v start", a.Name)
	start := time.Now()
	var warns, crits expr.AlertKeys
	d, err := s.executeExpr(T, r, a, a.Depends)
	var deps expr.ResultSlice
	if err == nil {
		deps = filterDependencyResults(d)
		crits, err = s.CheckExpr(T, r, a, a.Crit, StCritical, nil)
		if err == nil {
			warns, _ = s.CheckExpr(T, r, a, a.Warn, StWarning, crits)
		}
	}
	unevalCount, unknownCount := markDependenciesUnevaluated(r.Events, deps, a.Name)
	if err != nil {
		removeUnknownEvents(r.Events, a.Name)
	}
	collect.Put("check.duration", opentsdb.TagSet{"name": a.Name}, time.Since(start).Seconds())
	log.Printf("check alert %v done (%s): %v crits, %v warns, %v unevaluated, %v unknown", a.Name, time.Since(start), len(crits), len(warns), unevalCount, unknownCount)
}
// CollectStates sends various state information to bosun with collect.
func (s *Schedule) CollectStates() {
	// [AlertName][Severity]Count
	severityCounts := make(map[string]map[string]int64)
	abnormalCounts := make(map[string]map[string]int64)
	ackStatusCounts := make(map[string]map[bool]int64)
	ackByNotificationCounts := make(map[string]map[bool]int64)
	unAckOldestByNotification := make(map[string]time.Time)
	activeStatusCounts := make(map[string]map[bool]int64)
	// Initialize the counts.
	for _, alert := range s.Conf.Alerts {
		severityCounts[alert.Name] = make(map[string]int64)
		abnormalCounts[alert.Name] = make(map[string]int64)
		var i models.Status
		for i = 1; i.String() != "none"; i++ {
			severityCounts[alert.Name][i.String()] = 0
			abnormalCounts[alert.Name][i.String()] = 0
		}
		ackStatusCounts[alert.Name] = make(map[bool]int64)
		activeStatusCounts[alert.Name] = make(map[bool]int64)
		ackStatusCounts[alert.Name][false] = 0
		activeStatusCounts[alert.Name][false] = 0
		ackStatusCounts[alert.Name][true] = 0
		activeStatusCounts[alert.Name][true] = 0
	}
	for notificationName := range s.Conf.Notifications {
		unAckOldestByNotification[notificationName] = time.Unix(1<<63-62135596801, 999999999)
		ackByNotificationCounts[notificationName] = make(map[bool]int64)
		ackByNotificationCounts[notificationName][false] = 0
		ackByNotificationCounts[notificationName][true] = 0
	}
	//TODO:
	//	for _, state := range s.status {
	//		if !state.Open {
	//			continue
	//		}
	//		name := state.AlertKey.Name()
	//		alertDef := s.Conf.Alerts[name]
	//		nots := make(map[string]bool)
	//		for name := range alertDef.WarnNotification.Get(s.Conf, state.Group) {
	//			nots[name] = true
	//		}
	//		for name := range alertDef.CritNotification.Get(s.Conf, state.Group) {
	//			nots[name] = true
	//		}
	//		incident, err := s.GetIncident(state.Last().IncidentId)
	//		if err != nil {
	//			slog.Errorln(err)
	//		}
	//		for notificationName := range nots {
	//			ackByNotificationCounts[notificationName][state.NeedAck]++
	//			if incident != nil && incident.Start.Before(unAckOldestByNotification[notificationName]) && state.NeedAck {
	//				unAckOldestByNotification[notificationName] = incident.Start
	//			}
	//		}
	//		severity := state.CurrentStatus.String()
	//		lastAbnormal := state.LastAbnormalStatus.String()
	//		severityCounts[state.Alert][severity]++
	//		abnormalCounts[state.Alert][lastAbnormal]++
	//		ackStatusCounts[state.Alert][state.NeedAck]++
	//		activeStatusCounts[state.Alert][state.IsActive()]++
	//	}
	for notification := range ackByNotificationCounts {
		ts := opentsdb.TagSet{"notification": notification}
		err := collect.Put("alerts.acknowledgement_status_by_notification",
			ts.Copy().Merge(opentsdb.TagSet{"status": "unacknowledged"}),
			ackByNotificationCounts[notification][true])
		if err != nil {
			slog.Errorln(err)
		}
		err = collect.Put("alerts.acknowledgement_status_by_notification",
			ts.Copy().Merge(opentsdb.TagSet{"status": "acknowledged"}),
			ackByNotificationCounts[notification][false])
		if err != nil {
			slog.Errorln(err)
		}
	}
	for notification, timeStamp := range unAckOldestByNotification {
		ts := opentsdb.TagSet{"notification": notification}
		var ago time.Duration
		if !timeStamp.Equal(time.Unix(1<<63-62135596801, 999999999)) {
			ago = utcNow().Sub(timeStamp)
		}
		err := collect.Put("alerts.oldest_unacked_by_notification", ts, ago.Seconds())
		if err != nil {
			slog.Errorln(err)
		}
	}
	for alertName := range severityCounts {
		ts := opentsdb.TagSet{"alert": alertName}
		// The tagset of the alert is not included because there is no way to
		// store the string of a group in OpenTSDB in a parsable way. This is
		// because any delimiter we chose could also be part of a tag key or tag
		// value.
		for severity := range severityCounts[alertName] {
			err := collect.Put("alerts.current_severity",
				ts.Copy().Merge(opentsdb.TagSet{"severity": severity}),
				severityCounts[alertName][severity])
			if err != nil {
				slog.Errorln(err)
			}
			err = collect.Put("alerts.last_abnormal_severity",
				ts.Copy().Merge(opentsdb.TagSet{"severity": severity}),
				abnormalCounts[alertName][severity])
			if err != nil {
				slog.Errorln(err)
			}
		}
		err := collect.Put("alerts.acknowledgement_status",
			ts.Copy().Merge(opentsdb.TagSet{"status": "unacknowledged"}),
			ackStatusCounts[alertName][true])
		if err != nil {
			slog.Errorln(err)
		}
		err = collect.Put("alerts.acknowledgement_status",
			ts.Copy().Merge(opentsdb.TagSet{"status": "acknowledged"}),
			ackStatusCounts[alertName][false])
		if err != nil {
			slog.Errorln(err)
		}
		err = collect.Put("alerts.active_status",
			ts.Copy().Merge(opentsdb.TagSet{"status": "active"}),
			activeStatusCounts[alertName][true])
		if err != nil {
			slog.Errorln(err)
		}
		err = collect.Put("alerts.active_status",
			ts.Copy().Merge(opentsdb.TagSet{"status": "inactive"}),
			activeStatusCounts[alertName][false])
		if err != nil {
			slog.Errorln(err)
		}
	}
}
func main() {
	flag.Parse()
	if *flagToToml != "" {
		toToml(*flagToToml)
		fmt.Println("toml conversion complete; remove all empty values by hand (empty strings, 0)")
		return
	}
	if *flagPrint || *flagDebug {
		slog.Set(&slog.StdLog{Log: log.New(os.Stdout, "", log.LstdFlags)})
	}
	if *flagVersion {
		fmt.Println(version.GetVersionInfo("scollector"))
		os.Exit(0)
	}
	for _, m := range mains {
		m()
	}
	conf := readConf()
	if *flagHost != "" {
		conf.Host = *flagHost
	}
	if *flagFilter != "" {
		conf.Filter = strings.Split(*flagFilter, ",")
	}
	if !conf.Tags.Valid() {
		slog.Fatalf("invalid tags: %v", conf.Tags)
	} else if conf.Tags["host"] != "" {
		slog.Fatalf("host not supported in custom tags, use Hostname instead")
	}
	if conf.PProf != "" {
		go func() {
			slog.Infof("Starting pprof at http://%s/debug/pprof/", conf.PProf)
			slog.Fatal(http.ListenAndServe(conf.PProf, nil))
		}()
	}
	collectors.AddTags = conf.Tags
	util.FullHostname = conf.FullHost
	util.Set()
	if conf.Hostname != "" {
		util.Hostname = conf.Hostname
	}
	if err := collect.SetHostname(util.Hostname); err != nil {
		slog.Fatal(err)
	}
	if conf.ColDir != "" {
		collectors.InitPrograms(conf.ColDir)
	}
	var err error
	check := func(e error) {
		if e != nil {
			err = e
		}
	}
	collectors.Init(conf)
	for _, r := range conf.MetricFilters {
		check(collectors.AddMetricFilters(r))
	}
	for _, rmq := range conf.RabbitMQ {
		check(collectors.RabbitMQ(rmq.URL))
	}
	for _, cfg := range conf.SNMP {
		check(collectors.SNMP(cfg, conf.MIBS))
	}
	for _, i := range conf.ICMP {
		check(collectors.ICMP(i.Host))
	}
	for _, a := range conf.AWS {
		check(collectors.AWS(a.AccessKey, a.SecretKey, a.Region))
	}
	for _, v := range conf.Vsphere {
		check(collectors.Vsphere(v.User, v.Password, v.Host))
	}
	for _, p := range conf.Process {
		check(collectors.AddProcessConfig(p))
	}
	for _, p := range conf.ProcessDotNet {
		check(collectors.AddProcessDotNetConfig(p))
	}
	for _, h := range conf.HTTPUnit {
		if h.TOML != "" {
			check(collectors.HTTPUnitTOML(h.TOML))
		}
		if h.Hiera != "" {
			check(collectors.HTTPUnitHiera(h.Hiera))
		}
	}
	for _, r := range conf.Riak {
		check(collectors.Riak(r.URL))
	}
	for _, x := range conf.ExtraHop {
		check(collectors.ExtraHop(x.Host, x.APIKey, x.FilterBy, x.FilterPercent))
	}
	if err != nil {
		slog.Fatal(err)
	}
	collectors.KeepalivedCommunity = conf.KeepalivedCommunity
	// Add all process collectors. This is platform specific.
	collectors.WatchProcesses()
	collectors.WatchProcessesDotNet()
	if *flagFake > 0 {
		collectors.InitFake(*flagFake)
	}
	collect.Debug = *flagDebug
	util.Debug = *flagDebug
	collect.DisableDefaultCollectors = conf.DisableSelf
	c := collectors.Search(conf.Filter)
	if len(c) == 0 {
		slog.Fatalf("Filter %v matches no collectors.", conf.Filter)
	}
	for _, col := range c {
		col.Init()
	}
	u, err := parseHost(conf.Host)
	if *flagList {
		list(c)
		return
	} else if *flagPrint {
		u = &url.URL{Scheme: "http", Host: "localhost:0"}
	} else if err != nil {
		slog.Fatalf("invalid host %v: %v", conf.Host, err)
	}
	freq := time.Second * time.Duration(conf.Freq)
	if freq <= 0 {
		slog.Fatal("freq must be > 0")
	}
	collectors.DefaultFreq = freq
	collect.Freq = freq
	if conf.BatchSize < 0 {
		slog.Fatal("BatchSize must be > 0")
	}
	if conf.BatchSize != 0 {
		collect.BatchSize = conf.BatchSize
	}
	collect.Tags = conf.Tags.Copy().Merge(opentsdb.TagSet{"os": runtime.GOOS})
	if *flagPrint {
		collect.Print = true
	}
	if !*flagDisableMetadata {
		if err := metadata.Init(u, *flagDebug); err != nil {
			slog.Fatal(err)
		}
	}
	cdp, cquit := collectors.Run(c)
	if u != nil {
		slog.Infoln("OpenTSDB host:", u)
	}
	if err := collect.InitChan(u, "scollector", cdp); err != nil {
		slog.Fatal(err)
	}
	if version.VersionDate != "" {
		v, err := strconv.ParseInt(version.VersionDate, 10, 64)
		if err == nil {
			go func() {
				metadata.AddMetricMeta("scollector.version", metadata.Gauge, metadata.None,
					"Scollector version number, which indicates when scollector was built.")
				for {
					if err := collect.Put("version", collect.Tags, v); err != nil {
						slog.Error(err)
					}
					time.Sleep(time.Hour)
				}
			}()
		}
	}
	if *flagBatchSize > 0 {
		collect.BatchSize = *flagBatchSize
	}
	go func() {
		const maxMem = 500 * 1024 * 1024 // 500MB
		var m runtime.MemStats
		for range time.Tick(time.Minute) {
			runtime.ReadMemStats(&m)
			if m.Alloc > maxMem {
				panic("memory max reached")
			}
		}
	}()
	sChan := make(chan os.Signal)
	signal.Notify(sChan, os.Interrupt)
	<-sChan
	close(cquit)
	// Try to flush all datapoints on interrupt, but quit after 5 seconds no matter what.
	time.AfterFunc(5*time.Second, func() {
		os.Exit(0)
	})
	collect.Flush()
}
func main() {
	flag.Parse()
	if *flagToToml != "" {
		toToml(*flagToToml)
		fmt.Println("toml conversion complete; remove all empty values by hand (empty strings, 0)")
		return
	}
	if *flagPrint || *flagDebug {
		slog.Set(&slog.StdLog{Log: log.New(os.Stdout, "", log.LstdFlags)})
	}
	if *flagVersion {
		fmt.Println(version.GetVersionInfo("scollector"))
		os.Exit(0)
	}
	for _, m := range mains {
		m()
	}
	conf := readConf()
	ua := "Scollector/" + version.ShortVersion()
	if conf.UserAgentMessage != "" {
		ua += fmt.Sprintf(" (%s)", conf.UserAgentMessage)
	}
	client := &http.Client{
		Transport: &scollectorHTTPTransport{
			ua,
			&httpcontrol.Transport{
				RequestTimeout: time.Minute,
			},
		},
	}
	http.DefaultClient = client
	collect.DefaultClient = client
	if *flagHost != "" {
		conf.Host = *flagHost
	}
	if *flagNtlm {
		conf.UseNtlm = *flagNtlm
	}
	if *flagFilter != "" {
		conf.Filter = strings.Split(*flagFilter, ",")
	}
	if !conf.Tags.Valid() {
		slog.Fatalf("invalid tags: %v", conf.Tags)
	} else if conf.Tags["host"] != "" {
		slog.Fatalf("host not supported in custom tags, use Hostname instead")
	}
	if conf.PProf != "" {
		go func() {
			slog.Infof("Starting pprof at http://%s/debug/pprof/", conf.PProf)
			slog.Fatal(http.ListenAndServe(conf.PProf, nil))
		}()
	}
	collectors.AddTags = conf.Tags
	util.FullHostname = conf.FullHost
	util.Set()
	if conf.Hostname != "" {
		util.Hostname = conf.Hostname
	}
	if err := collect.SetHostname(util.Hostname); err != nil {
		slog.Fatal(err)
	}
	if conf.ColDir != "" {
		collectors.InitPrograms(conf.ColDir)
	}
	if conf.SNMPTimeout > 0 {
		snmp.Timeout = conf.SNMPTimeout
	}
	var err error
	check := func(e error) {
		if e != nil {
			err = e
		}
	}
	collectors.Init(conf)
	for _, r := range conf.MetricFilters {
		slog.Infof("Adding MetricFilter: %v\n", r)
		check(collectors.AddMetricFilters(r))
	}
	for _, rmq := range conf.RabbitMQ {
		check(collectors.RabbitMQ(rmq.URL))
	}
	for _, cfg := range conf.SNMP {
		check(collectors.SNMP(cfg, conf.MIBS))
	}
	for _, i := range conf.ICMP {
		check(collectors.ICMP(i.Host))
	}
	for _, a := range conf.AWS {
		check(collectors.AWS(a.AccessKey, a.SecretKey, a.Region, a.BillingProductCodesRegex, a.BillingBucketName, a.BillingBucketPath, a.BillingPurgeDays))
	}
	for _, ea := range conf.AzureEA {
		check(collectors.AzureEABilling(ea.EANumber, ea.APIKey, ea.LogBillingDetails))
	}
	for _, v := range conf.Vsphere {
		check(collectors.Vsphere(v.User, v.Password, v.Host))
	}
	for _, p := range conf.Process {
		check(collectors.AddProcessConfig(p))
	}
	for _, p := range conf.ProcessDotNet {
		check(collectors.AddProcessDotNetConfig(p))
	}
	for _, h := range conf.HTTPUnit {
		var freq time.Duration
		var parseerr error
		if h.Freq == "" {
			freq = time.Minute * 5
		} else {
			freq, parseerr = time.ParseDuration(h.Freq)
			if parseerr != nil {
				slog.Fatal(parseerr)
			}
			if freq < time.Second {
				slog.Fatalf("Invalid HTTPUnit frequency %s, cannot be less than 1 second.", h.Freq)
			}
		}
		if h.TOML != "" {
			check(collectors.HTTPUnitTOML(h.TOML, freq))
		}
		if h.Hiera != "" {
			check(collectors.HTTPUnitHiera(h.Hiera, freq))
		}
	}
	for _, r := range conf.Riak {
		check(collectors.Riak(r.URL))
	}
	for _, x := range conf.ExtraHop {
		check(collectors.ExtraHop(x.Host, x.APIKey, x.FilterBy, x.FilterPercent, x.AdditionalMetrics, x.CertificateSubjectMatch, x.CertificateActivityGroup))
	}
	if err != nil {
		slog.Fatal(err)
	}
	collectors.KeepalivedCommunity = conf.KeepalivedCommunity
	// Add all process collectors. This is platform specific.
	collectors.WatchProcesses()
	collectors.WatchProcessesDotNet()
	if *flagFake > 0 {
		collectors.InitFake(*flagFake)
	}
	collect.Debug = *flagDebug
	util.Debug = *flagDebug
	collect.DisableDefaultCollectors = conf.DisableSelf
	c := collectors.Search(conf.Filter)
	if len(c) == 0 {
		slog.Fatalf("Filter %v matches no collectors.", conf.Filter)
	}
	for _, col := range c {
		col.Init()
	}
	err = collectors.AddTagOverrides(c, conf.TagOverride)
	if err != nil {
		slog.Fatalf("Error adding tag overrides: %s", err)
	}
	u, err := parseHost(conf.Host)
	if *flagList {
		list(c)
		return
	} else if *flagPrint {
		u = &url.URL{Scheme: "http", Host: "localhost:0"}
	} else if err != nil {
		slog.Fatalf("invalid host %v: %v", conf.Host, err)
	}
	freq := time.Second * time.Duration(conf.Freq)
	if freq <= 0 {
		slog.Fatal("freq must be > 0")
	}
	collectors.DefaultFreq = freq
	collect.Freq = freq
	if conf.BatchSize < 0 {
		slog.Fatal("BatchSize must be > 0")
	}
	if conf.BatchSize != 0 {
		collect.BatchSize = conf.BatchSize
	}
	collect.Tags = conf.Tags.Copy().Merge(opentsdb.TagSet{"os": runtime.GOOS})
	if *flagPrint {
		collect.Print = true
	}
	if !*flagDisableMetadata {
		if err := metadata.Init(u, *flagDebug); err != nil {
			slog.Fatal(err)
		}
	}
	cdp, cquit := collectors.Run(c)
	if u != nil {
		slog.Infoln("OpenTSDB host:", u)
	}
	collect.UseNtlm = conf.UseNtlm
	if err := collect.InitChan(u, "scollector", cdp); err != nil {
		slog.Fatal(err)
	}
	if !collect.DisableDefaultCollectors && version.VersionDate != "" {
		v, err := strconv.ParseInt(version.VersionDate, 10, 64)
		if err == nil {
			go func() {
				metadata.AddMetricMeta("scollector.version", metadata.Gauge, metadata.None,
					"Scollector version number, which indicates when scollector was built.")
				for {
					if err := collect.Put("version", collect.Tags, v); err != nil {
						slog.Error(err)
					}
					time.Sleep(time.Hour)
				}
			}()
		}
	}
	if *flagBatchSize > 0 {
		collect.BatchSize = *flagBatchSize
	}
	if conf.MaxQueueLen != 0 {
		if conf.MaxQueueLen < collect.BatchSize {
			slog.Fatalf("MaxQueueLen must be >= %d (BatchSize)", collect.BatchSize)
		}
		collect.MaxQueueLen = conf.MaxQueueLen
	}
	maxMemMB := uint64(500)
	if conf.MaxMem != 0 {
		maxMemMB = conf.MaxMem
	}
	go func() {
		var m runtime.MemStats
		for range time.Tick(time.Second * 30) {
			runtime.ReadMemStats(&m)
			allocMB := m.Alloc / 1024 / 1024
			if allocMB > maxMemMB {
				slog.Fatalf("memory max runtime reached: (current alloc: %v megabytes, max: %v megabytes)", allocMB, maxMemMB)
			}
			// See process_windows.go and process_linux.go for total process memory usage.
			// Note that on Linux the rss metric includes shared pages, whereas on
			// Windows the private working set does not include shared memory.
			// Total memory used seems to scale linearly with m.Alloc, but we want
			// this to catch a memory leak outside the runtime (WMI/CGO), so for now
			// just add any runtime allocations to the allowed total limit.
			maxMemTotalMB := maxMemMB + allocMB
			if collectors.TotalScollectorMemoryMB > maxMemTotalMB {
				slog.Fatalf("memory max total reached: (current total: %v megabytes, current runtime alloc: %v megabytes, max: %v megabytes)", collectors.TotalScollectorMemoryMB, allocMB, maxMemTotalMB)
			}
		}
	}()
	sChan := make(chan os.Signal)
	signal.Notify(sChan, os.Interrupt)
	<-sChan
	close(cquit)
	// Try to flush all datapoints on interrupt, but quit after 5 seconds no matter what.
	time.AfterFunc(5*time.Second, func() {
		os.Exit(0)
	})
	collect.Flush()
}
// CollectStates sends various state information to bosun with collect.
func (s *Schedule) CollectStates() {
	// [AlertName][Severity]Count
	severityCounts := make(map[string]map[string]int64)
	abnormalCounts := make(map[string]map[string]int64)
	ackStatusCounts := make(map[string]map[bool]int64)
	activeStatusCounts := make(map[string]map[bool]int64)
	// Initialize the counts.
	for _, alert := range s.Conf.Alerts {
		severityCounts[alert.Name] = make(map[string]int64)
		abnormalCounts[alert.Name] = make(map[string]int64)
		var i Status
		for i = 1; i.String() != "none"; i++ {
			severityCounts[alert.Name][i.String()] = 0
			abnormalCounts[alert.Name][i.String()] = 0
		}
		ackStatusCounts[alert.Name] = make(map[bool]int64)
		activeStatusCounts[alert.Name] = make(map[bool]int64)
		ackStatusCounts[alert.Name][false] = 0
		activeStatusCounts[alert.Name][false] = 0
		ackStatusCounts[alert.Name][true] = 0
		activeStatusCounts[alert.Name][true] = 0
	}
	for _, state := range s.status {
		if !state.Open {
			continue
		}
		severity := state.Status().String()
		lastAbnormal := state.AbnormalStatus().String()
		severityCounts[state.Alert][severity]++
		abnormalCounts[state.Alert][lastAbnormal]++
		ackStatusCounts[state.Alert][state.NeedAck]++
		activeStatusCounts[state.Alert][state.IsActive()]++
	}
	for alertName := range severityCounts {
		ts := opentsdb.TagSet{"alert": alertName}
		// The tagset of the alert is not included because there is no way to
		// store the string of a group in OpenTSDB in a parsable way. This is
		// because any delimiter we chose could also be part of a tag key or tag
		// value.
		for severity := range severityCounts[alertName] {
			err := collect.Put("alerts.current_severity",
				ts.Copy().Merge(opentsdb.TagSet{"severity": severity}),
				severityCounts[alertName][severity])
			if err != nil {
				slog.Errorln(err)
			}
			err = collect.Put("alerts.last_abnormal_severity",
				ts.Copy().Merge(opentsdb.TagSet{"severity": severity}),
				abnormalCounts[alertName][severity])
			if err != nil {
				slog.Errorln(err)
			}
		}
		err := collect.Put("alerts.acknowledgement_status",
			ts.Copy().Merge(opentsdb.TagSet{"status": "unacknowledged"}),
			ackStatusCounts[alertName][true])
		if err != nil {
			slog.Errorln(err)
		}
		err = collect.Put("alerts.acknowledgement_status",
			ts.Copy().Merge(opentsdb.TagSet{"status": "acknowledged"}),
			ackStatusCounts[alertName][false])
		if err != nil {
			slog.Errorln(err)
		}
		err = collect.Put("alerts.active_status",
			ts.Copy().Merge(opentsdb.TagSet{"status": "active"}),
			activeStatusCounts[alertName][true])
		if err != nil {
			slog.Errorln(err)
		}
		err = collect.Put("alerts.active_status",
			ts.Copy().Merge(opentsdb.TagSet{"status": "inactive"}),
			activeStatusCounts[alertName][false])
		if err != nil {
			slog.Errorln(err)
		}
	}
}
func main() {
	flag.Parse()
	if *flagToToml != "" {
		toToml(*flagToToml)
		fmt.Println("toml conversion complete; remove all empty values by hand (empty strings, 0)")
		return
	}
	if *flagPrint || *flagDebug {
		slog.Set(&slog.StdLog{Log: log.New(os.Stdout, "", log.LstdFlags)})
	}
	if *flagVersion {
		fmt.Println(version.GetVersionInfo("scollector"))
		os.Exit(0)
	}
	for _, m := range mains {
		m()
	}
	conf := readConf()
	if *flagHost != "" {
		conf.Host = *flagHost
	}
	if *flagFilter != "" {
		conf.Filter = strings.Split(*flagFilter, ",")
	}
	if !conf.Tags.Valid() {
		slog.Fatalf("invalid tags: %v", conf.Tags)
	} else if conf.Tags["host"] != "" {
		slog.Fatalf("host not supported in custom tags, use Hostname instead")
	}
	collectors.AddTags = conf.Tags
	util.FullHostname = conf.FullHost
	util.Set()
	if conf.Hostname != "" {
		util.Hostname = conf.Hostname
		if err := collect.SetHostname(conf.Hostname); err != nil {
			slog.Fatal(err)
		}
	}
	if conf.ColDir != "" {
		collectors.InitPrograms(conf.ColDir)
	}
	var err error
	check := func(e error) {
		if e != nil {
			err = e
		}
	}
	for _, h := range conf.HAProxy {
		for _, i := range h.Instances {
			collectors.HAProxy(h.User, h.Password, i.Tier, i.URL)
		}
	}
	for _, s := range conf.SNMP {
		check(collectors.SNMP(s.Community, s.Host))
	}
	for _, i := range conf.ICMP {
		check(collectors.ICMP(i.Host))
	}
	for _, a := range conf.AWS {
		check(collectors.AWS(a.AccessKey, a.SecretKey, a.Region))
	}
	for _, v := range conf.Vsphere {
		check(collectors.Vsphere(v.User, v.Password, v.Host))
	}
	for _, p := range conf.Process {
		check(collectors.AddProcessConfig(p))
	}
	for _, h := range conf.HTTPUnit {
		if h.TOML != "" {
			check(collectors.HTTPUnitTOML(h.TOML))
		}
		if h.Hiera != "" {
			check(collectors.HTTPUnitHiera(h.Hiera))
		}
	}
	if err != nil {
		slog.Fatal(err)
	}
	collectors.KeepalivedCommunity = conf.KeepalivedCommunity
	// Add all process collectors. This is platform specific.
	collectors.WatchProcesses()
	collectors.WatchProcessesDotNet()
	if *flagFake > 0 {
		collectors.InitFake(*flagFake)
	}
	collect.Debug = *flagDebug
	util.Debug = *flagDebug
	collect.DisableDefaultCollectors = conf.DisableSelf
	c := collectors.Search(conf.Filter)
	if len(c) == 0 {
		slog.Fatalf("Filter %v matches no collectors.", conf.Filter)
	}
	for _, col := range c {
		col.Init()
	}
	u, err := parseHost(conf.Host)
	if *flagList {
		list(c)
		return
	} else if err != nil {
		slog.Fatalf("invalid host %v: %v", conf.Host, err)
	}
	freq := time.Second * time.Duration(conf.Freq)
	if freq <= 0 {
		slog.Fatal("freq must be > 0")
	}
	collectors.DefaultFreq = freq
	collect.Freq = freq
	collect.Tags = opentsdb.TagSet{"os": runtime.GOOS}
	if *flagPrint {
		collect.Print = true
	}
	if !*flagDisableMetadata {
		if err := metadata.Init(u, *flagDebug); err != nil {
			slog.Fatal(err)
		}
	}
	cdp := collectors.Run(c)
	if u != nil {
		slog.Infoln("OpenTSDB host:", u)
	}
	if err := collect.InitChan(u, "scollector", cdp); err != nil {
		slog.Fatal(err)
	}
	if version.VersionDate != "" {
		v, err := strconv.ParseInt(version.VersionDate, 10, 64)
		if err == nil {
			go func() {
				metadata.AddMetricMeta("scollector.version", metadata.Gauge, metadata.None,
					"Scollector version number, which indicates when scollector was built.")
				for {
					if err := collect.Put("version", collect.Tags, v); err != nil {
						slog.Error(err)
					}
					time.Sleep(time.Hour)
				}
			}()
		}
	}
	if *flagBatchSize > 0 {
		collect.BatchSize = *flagBatchSize
	}
	go func() {
		const maxMem = 500 * 1024 * 1024 // 500MB
		var m runtime.MemStats
		for range time.Tick(time.Minute) {
			runtime.ReadMemStats(&m)
			if m.Alloc > maxMem {
				panic("memory max reached")
			}
		}
	}()
	select {}
}