Example #1
func (s *Schedule) save() {
	if s.db == nil {
		return
	}
	s.Lock("Save")
	store := map[string]interface{}{
		dbMetric:        s.Search.Read.Metric,
		dbTagk:          s.Search.Read.Tagk,
		dbTagv:          s.Search.Read.Tagv,
		dbMetricTags:    s.Search.Read.MetricTags,
		dbNotifications: s.Notifications,
		dbSilence:       s.Silence,
		dbStatus:        s.status,
		dbMetadata:      s.Metadata,
		dbIncidents:     s.Incidents,
	}
	tostore := make(map[string][]byte)
	for name, data := range store {
		f := new(bytes.Buffer)
		gz := gzip.NewWriter(f)
		cw := &counterWriter{w: gz}
		enc := gob.NewEncoder(cw)
		if err := enc.Encode(data); err != nil {
			slog.Errorf("error saving %s: %v", name, err)
			s.Unlock()
			return
		}
		if err := gz.Flush(); err != nil {
			slog.Errorf("gzip flush error saving %s: %v", name, err)
		}
		if err := gz.Close(); err != nil {
			slog.Errorf("gzip close error saving %s: %v", name, err)
		}
		tostore[name] = f.Bytes()
		slog.Infof("wrote %s: %v", name, conf.ByteSize(cw.written))
		collect.Put("statefile.size", opentsdb.TagSet{"object": name}, cw.written)
	}
	s.Unlock()
	err := s.db.Update(func(tx *bolt.Tx) error {
		b, err := tx.CreateBucketIfNotExists([]byte(dbBucket))
		if err != nil {
			return err
		}
		for name, data := range tostore {
			if err := b.Put([]byte(name), data); err != nil {
				return err
			}
		}
		return nil
	})
	if err != nil {
		slog.Errorf("save db update error: %v", err)
		return
	}
	fi, err := os.Stat(s.Conf.StateFile)
	if err == nil {
		collect.Put("statefile.size", opentsdb.TagSet{"object": "total"}, fi.Size())
	}
	slog.Infoln("save to db complete")
}
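
Example #1 writes the gob stream through a counterWriter that is not shown. A minimal sketch of that helper, assuming it does nothing beyond counting the bytes passed through to the wrapped writer (the type is reconstructed from its usage above and may differ from the real one in the Bosun repository):

type counterWriter struct {
	w       io.Writer // the wrapped writer (the gzip stream above)
	written int64     // total bytes accepted by Write
}

func (c *counterWriter) Write(p []byte) (int, error) {
	n, err := c.w.Write(p)
	c.written += int64(n)
	return n, err
}

Because the encoder writes into the counterWriter before gzip, written counts the uncompressed payload, which is what the statefile.size metric above reports.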
Example #2
func (s *Schedule) CheckAlert(T miniprofiler.Timer, r *RunHistory, a *conf.Alert) {
	slog.Infof("check alert %v start", a.Name)
	start := utcNow()
	for _, ak := range s.findUnknownAlerts(r.Start, a.Name) {
		r.Events[ak] = &models.Event{Status: models.StUnknown}
	}
	var warns, crits models.AlertKeys
	d, err := s.executeExpr(T, r, a, a.Depends)
	var deps expr.ResultSlice
	if err == nil {
		deps = filterDependencyResults(d)
		crits, err = s.CheckExpr(T, r, a, a.Crit, models.StCritical, nil)
		if err == nil {
			warns, err = s.CheckExpr(T, r, a, a.Warn, models.StWarning, crits)
		}
	}
	unevalCount, unknownCount := markDependenciesUnevaluated(r.Events, deps, a.Name)
	if err != nil {
		slog.Errorf("Error checking alert %s: %s", a.Name, err.Error())
		removeUnknownEvents(r.Events, a.Name)
		s.markAlertError(a.Name, err)
	} else {
		s.markAlertSuccessful(a.Name)
	}
	collect.Put("check.duration", opentsdb.TagSet{"name": a.Name}, time.Since(start).Seconds())
	slog.Infof("check alert %v done (%s): %v crits, %v warns, %v unevaluated, %v unknown", a.Name, time.Since(start), len(crits), len(warns), unevalCount, unknownCount)
}
Example #3
func (s *Schedule) CheckAlert(T miniprofiler.Timer, r *RunHistory, a *conf.Alert) (cancelled bool) {
	slog.Infof("check alert %v start", a.Name)
	start := utcNow()
	for _, ak := range s.findUnknownAlerts(r.Start, a.Name) {
		r.Events[ak] = &models.Event{Status: models.StUnknown}
	}
	var warns, crits models.AlertKeys
	type res struct {
		results *expr.Results
		error   error
	}
	// buffered channel so the goroutine that runs executeExpr won't leak if the check is
	// cancelled by the closing of the schedule
	rc := make(chan res, 1)
	var d *expr.Results
	var err error
	go func() {
		d, err := s.executeExpr(T, r, a, a.Depends)
		rc <- res{d, err} // this send would hang forever if the channel weren't buffered, since nothing will ever receive from rc
	}()
	select {
	case res := <-rc:
		d = res.results
		err = res.error
	// If the schedule closes before the expression has finished executing, we abandon
	// the execution of the expression
	case <-s.runnerContext.Done():
		return true
	}
	var deps expr.ResultSlice
	if err == nil {
		deps = filterDependencyResults(d)
		crits, err, cancelled = s.CheckExpr(T, r, a, a.Crit, models.StCritical, nil)
		if err == nil && !cancelled {
			warns, err, cancelled = s.CheckExpr(T, r, a, a.Warn, models.StWarning, crits)
		}
	}
	if cancelled {
		return true
	}
	unevalCount, unknownCount := markDependenciesUnevaluated(r.Events, deps, a.Name)
	if err != nil {
		slog.Errorf("Error checking alert %s: %s", a.Name, err.Error())
		removeUnknownEvents(r.Events, a.Name)
		s.markAlertError(a.Name, err)
	} else {
		s.markAlertSuccessful(a.Name)
	}
	collect.Put("check.duration", opentsdb.TagSet{"name": a.Name}, time.Since(start).Seconds())
	slog.Infof("check alert %v done (%s): %v crits, %v warns, %v unevaluated, %v unknown", a.Name, time.Since(start), len(crits), len(warns), unevalCount, unknownCount)
	return false
}
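
The channel handling in Example #3 is the standard Go pattern for making a blocking call cancellable without leaking the goroutine: because rc has capacity 1, the worker's send always succeeds even when the select has already taken the Done branch. A self-contained sketch of the same idea, using a plain context.Context in place of the schedule's runnerContext:

package main

import (
	"context"
	"errors"
	"fmt"
	"time"
)

// runWithCancel runs work in a goroutine and abandons it if ctx is done
// first. The result channel is buffered so the goroutine can always
// complete its send and exit, even when nobody is left to receive.
func runWithCancel(ctx context.Context, work func() (int, error)) (int, error) {
	type res struct {
		v   int
		err error
	}
	rc := make(chan res, 1) // capacity 1: the send below can never block
	go func() {
		v, err := work()
		rc <- res{v, err}
	}()
	select {
	case r := <-rc:
		return r.v, r.err
	case <-ctx.Done():
		return 0, ctx.Err() // abandon the result; the goroutine still exits
	}
}

func main() {
	ctx, cancel := context.WithTimeout(context.Background(), 50*time.Millisecond)
	defer cancel()
	_, err := runWithCancel(ctx, func() (int, error) {
		time.Sleep(time.Second) // slower than the deadline
		return 42, nil
	})
	fmt.Println(errors.Is(err, context.DeadlineExceeded)) // true
}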
Example #4
func pingHost(host string) {
	p := fastping.NewPinger()
	tags := opentsdb.TagSet{"dst_host": host}
	resolved := 0
	defer func() {
		collect.Put("ping.resolved", tags, resolved)
	}()
	ra, err := net.ResolveIPAddr("ip4:icmp", host)
	if err != nil {
		return
	}
	resolved = 1
	p.AddIPAddr(ra)
	p.MaxRTT = time.Second * 5
	timeout := 1
	p.OnRecv = func(addr *net.IPAddr, t time.Duration) {
		collect.Put("ping.rtt", tags, float64(t)/float64(time.Millisecond))
		timeout = 0
	}
	if err := p.Run(); err != nil {
		log.Print(err)
	}
	collect.Put("ping.timeout", tags, timeout)
}
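
pingHost performs a single resolve-and-probe cycle, always reporting ping.resolved via the deferred Put and ping.timeout when it gets that far. A hypothetical driver, assuming one goroutine per host on a fixed interval (the real scollector framework schedules collectors differently):

func pingHosts(hosts []string, every time.Duration) {
	for _, host := range hosts {
		go func(h string) {
			for range time.Tick(every) { // one probe per tick, forever
				pingHost(h)
			}
		}(host)
	}
}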
Example #5
func (s *Schedule) CheckAlert(T miniprofiler.Timer, r *RunHistory, a *conf.Alert) {
	log.Printf("check alert %v start", a.Name)
	start := time.Now()
	var warns, crits expr.AlertKeys
	d, err := s.executeExpr(T, r, a, a.Depends)
	var deps expr.ResultSlice
	if err == nil {
		deps = filterDependencyResults(d)
		crits, err = s.CheckExpr(T, r, a, a.Crit, StCritical, nil)
		if err == nil {
			warns, err = s.CheckExpr(T, r, a, a.Warn, StWarning, crits)
		}
	}
	unevalCount, unknownCount := markDependenciesUnevaluated(r.Events, deps, a.Name)
	if err != nil {
		removeUnknownEvents(r.Events, a.Name)
	}

	collect.Put("check.duration", opentsdb.TagSet{"name": a.Name}, time.Since(start).Seconds())
	log.Printf("check alert %v done (%s): %v crits, %v warns, %v unevaluated, %v unknown", a.Name, time.Since(start), len(crits), len(warns), unevalCount, unknownCount)
}
Example #6
// CollectStates sends various state information to bosun with collect.
func (s *Schedule) CollectStates() {
	// [AlertName][Severity]Count
	severityCounts := make(map[string]map[string]int64)
	abnormalCounts := make(map[string]map[string]int64)
	ackStatusCounts := make(map[string]map[bool]int64)
	ackByNotificationCounts := make(map[string]map[bool]int64)
	unAckOldestByNotification := make(map[string]time.Time)
	activeStatusCounts := make(map[string]map[bool]int64)
	// Initialize the counts
	for _, alert := range s.Conf.Alerts {
		severityCounts[alert.Name] = make(map[string]int64)
		abnormalCounts[alert.Name] = make(map[string]int64)
		var i models.Status
		for i = 1; i.String() != "none"; i++ {
			severityCounts[alert.Name][i.String()] = 0
			abnormalCounts[alert.Name][i.String()] = 0
		}
		ackStatusCounts[alert.Name] = make(map[bool]int64)
		activeStatusCounts[alert.Name] = make(map[bool]int64)
		ackStatusCounts[alert.Name][false] = 0
		activeStatusCounts[alert.Name][false] = 0
		ackStatusCounts[alert.Name][true] = 0
		activeStatusCounts[alert.Name][true] = 0
	}
	for notificationName := range s.Conf.Notifications {
		unAckOldestByNotification[notificationName] = time.Unix(1<<63-62135596801, 999999999) // sentinel: the maximum representable time.Time
		ackByNotificationCounts[notificationName] = make(map[bool]int64)
		ackByNotificationCounts[notificationName][false] = 0
		ackByNotificationCounts[notificationName][true] = 0
	}
	//TODO:
	//	for _, state := range s.status {
	//		if !state.Open {
	//			continue
	//		}
	//		name := state.AlertKey.Name()
	//		alertDef := s.Conf.Alerts[name]
	//		nots := make(map[string]bool)
	//		for name := range alertDef.WarnNotification.Get(s.Conf, state.Group) {
	//			nots[name] = true
	//		}
	//		for name := range alertDef.CritNotification.Get(s.Conf, state.Group) {
	//			nots[name] = true
	//		}
	//		incident, err := s.GetIncident(state.Last().IncidentId)
	//		if err != nil {
	//			slog.Errorln(err)
	//		}
	//		for notificationName := range nots {
	//			ackByNotificationCounts[notificationName][state.NeedAck]++
	//			if incident != nil && incident.Start.Before(unAckOldestByNotification[notificationName]) && state.NeedAck {
	//				unAckOldestByNotification[notificationName] = incident.Start
	//			}
	//		}
	//		severity := state.CurrentStatus.String()
	//		lastAbnormal := state.LastAbnormalStatus.String()
	//		severityCounts[state.Alert][severity]++
	//		abnormalCounts[state.Alert][lastAbnormal]++
	//		ackStatusCounts[state.Alert][state.NeedAck]++
	//		activeStatusCounts[state.Alert][state.IsActive()]++
	//	}
	for notification := range ackByNotificationCounts {
		ts := opentsdb.TagSet{"notification": notification}
		err := collect.Put("alerts.acknowledgement_status_by_notification",
			ts.Copy().Merge(opentsdb.TagSet{"status": "unacknowledged"}),
			ackByNotificationCounts[notification][true])
		if err != nil {
			slog.Errorln(err)
		}
		err = collect.Put("alerts.acknowledgement_status_by_notification",
			ts.Copy().Merge(opentsdb.TagSet{"status": "acknowledged"}),
			ackByNotificationCounts[notification][false])
		if err != nil {
			slog.Errorln(err)
		}
	}
	for notification, timeStamp := range unAckOldestByNotification {
		ts := opentsdb.TagSet{"notification": notification}
		var ago time.Duration
		if !timeStamp.Equal(time.Unix(1<<63-62135596801, 999999999)) {
			ago = utcNow().Sub(timeStamp)
		}
		err := collect.Put("alerts.oldest_unacked_by_notification",
			ts,
			ago.Seconds())
		if err != nil {
			slog.Errorln(err)
		}
	}
	for alertName := range severityCounts {
		ts := opentsdb.TagSet{"alert": alertName}
		// The alert's tagset is not included because there is no way to
		// store the string of a group in OpenTSDB in a parsable way: any
		// delimiter we chose could also be part of a tag key or tag value.
		for severity := range severityCounts[alertName] {
			err := collect.Put("alerts.current_severity",
				ts.Copy().Merge(opentsdb.TagSet{"severity": severity}),
				severityCounts[alertName][severity])
			if err != nil {
				slog.Errorln(err)
			}
			err = collect.Put("alerts.last_abnormal_severity",
				ts.Copy().Merge(opentsdb.TagSet{"severity": severity}),
				abnormalCounts[alertName][severity])
			if err != nil {
				slog.Errorln(err)
			}
		}
		err := collect.Put("alerts.acknowledgement_status",
			ts.Copy().Merge(opentsdb.TagSet{"status": "unacknowledged"}),
			ackStatusCounts[alertName][true])
		err = collect.Put("alerts.acknowledgement_status",
			ts.Copy().Merge(opentsdb.TagSet{"status": "acknowledged"}),
			ackStatusCounts[alertName][false])
		if err != nil {
			slog.Errorln(err)
		}
		err = collect.Put("alerts.active_status",
			ts.Copy().Merge(opentsdb.TagSet{"status": "active"}),
			activeStatusCounts[alertName][true])
		if err != nil {
			slog.Errorln(err)
		}
		err = collect.Put("alerts.active_status",
			ts.Copy().Merge(opentsdb.TagSet{"status": "inactive"}),
			activeStatusCounts[alertName][false])
		if err != nil {
			slog.Errorln(err)
		}
	}
}
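
The initialization loop above (for i = 1; i.String() != "none"; i++) enumerates every severity without a hard-coded list; it works only because the Status enum's String method falls back to "none" once i passes the last defined value. A minimal sketch of that idiom (the constant names and order are illustrative; the real models.Status may differ):

type Status int

const (
	StNone Status = iota // zero value, also the loop's stop marker
	StNormal
	StWarning
	StCritical
	StUnknown
)

func (s Status) String() string {
	switch s {
	case StNormal:
		return "normal"
	case StWarning:
		return "warning"
	case StCritical:
		return "critical"
	case StUnknown:
		return "unknown"
	}
	return "none" // any undefined value reads as "none", ending the walk
}

// enumerate severities exactly as CollectStates does:
// for i := Status(1); i.String() != "none"; i++ { ... }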
Example #7
func main() {
	flag.Parse()
	if *flagToToml != "" {
		toToml(*flagToToml)
		fmt.Println("toml conversion complete; remove all empty values by hand (empty strings, 0)")
		return
	}
	if *flagPrint || *flagDebug {
		slog.Set(&slog.StdLog{Log: log.New(os.Stdout, "", log.LstdFlags)})
	}
	if *flagVersion {
		fmt.Println(version.GetVersionInfo("scollector"))
		os.Exit(0)
	}
	for _, m := range mains {
		m()
	}
	conf := readConf()
	if *flagHost != "" {
		conf.Host = *flagHost
	}
	if *flagFilter != "" {
		conf.Filter = strings.Split(*flagFilter, ",")
	}
	if !conf.Tags.Valid() {
		slog.Fatalf("invalid tags: %v", conf.Tags)
	} else if conf.Tags["host"] != "" {
		slog.Fatalf("host not supported in custom tags, use Hostname instead")
	}
	if conf.PProf != "" {
		go func() {
			slog.Infof("Starting pprof at http://%s/debug/pprof/", conf.PProf)
			slog.Fatal(http.ListenAndServe(conf.PProf, nil))
		}()
	}
	collectors.AddTags = conf.Tags
	util.FullHostname = conf.FullHost
	util.Set()
	if conf.Hostname != "" {
		util.Hostname = conf.Hostname
	}
	if err := collect.SetHostname(util.Hostname); err != nil {
		slog.Fatal(err)
	}
	if conf.ColDir != "" {
		collectors.InitPrograms(conf.ColDir)
	}
	var err error
	check := func(e error) {
		if e != nil {
			err = e
		}
	}
	collectors.Init(conf)
	for _, r := range conf.MetricFilters {
		check(collectors.AddMetricFilters(r))
	}
	for _, rmq := range conf.RabbitMQ {
		check(collectors.RabbitMQ(rmq.URL))
	}
	for _, cfg := range conf.SNMP {
		check(collectors.SNMP(cfg, conf.MIBS))
	}
	for _, i := range conf.ICMP {
		check(collectors.ICMP(i.Host))
	}
	for _, a := range conf.AWS {
		check(collectors.AWS(a.AccessKey, a.SecretKey, a.Region))
	}
	for _, v := range conf.Vsphere {
		check(collectors.Vsphere(v.User, v.Password, v.Host))
	}
	for _, p := range conf.Process {
		check(collectors.AddProcessConfig(p))
	}
	for _, p := range conf.ProcessDotNet {
		check(collectors.AddProcessDotNetConfig(p))
	}
	for _, h := range conf.HTTPUnit {
		if h.TOML != "" {
			check(collectors.HTTPUnitTOML(h.TOML))
		}
		if h.Hiera != "" {
			check(collectors.HTTPUnitHiera(h.Hiera))
		}
	}
	for _, r := range conf.Riak {
		check(collectors.Riak(r.URL))
	}

	for _, x := range conf.ExtraHop {
		check(collectors.ExtraHop(x.Host, x.APIKey, x.FilterBy, x.FilterPercent))
	}

	if err != nil {
		slog.Fatal(err)
	}
	collectors.KeepalivedCommunity = conf.KeepalivedCommunity
	// Add all process collectors. This is platform specific.
	collectors.WatchProcesses()
	collectors.WatchProcessesDotNet()

	if *flagFake > 0 {
		collectors.InitFake(*flagFake)
	}
	collect.Debug = *flagDebug
	util.Debug = *flagDebug
	collect.DisableDefaultCollectors = conf.DisableSelf
	c := collectors.Search(conf.Filter)
	if len(c) == 0 {
		slog.Fatalf("Filter %v matches no collectors.", conf.Filter)
	}
	for _, col := range c {
		col.Init()
	}
	u, err := parseHost(conf.Host)
	if *flagList {
		list(c)
		return
	} else if *flagPrint {
		u = &url.URL{Scheme: "http", Host: "localhost:0"}
	} else if err != nil {
		slog.Fatalf("invalid host %v: %v", conf.Host, err)
	}
	freq := time.Second * time.Duration(conf.Freq)
	if freq <= 0 {
		slog.Fatal("freq must be > 0")
	}
	collectors.DefaultFreq = freq
	collect.Freq = freq
	if conf.BatchSize < 0 {
		slog.Fatal("BatchSize must be > 0")
	}
	if conf.BatchSize != 0 {
		collect.BatchSize = conf.BatchSize
	}
	collect.Tags = conf.Tags.Copy().Merge(opentsdb.TagSet{"os": runtime.GOOS})
	if *flagPrint {
		collect.Print = true
	}
	if !*flagDisableMetadata {
		if err := metadata.Init(u, *flagDebug); err != nil {
			slog.Fatal(err)
		}
	}
	cdp, cquit := collectors.Run(c)
	if u != nil {
		slog.Infoln("OpenTSDB host:", u)
	}
	if err := collect.InitChan(u, "scollector", cdp); err != nil {
		slog.Fatal(err)
	}
	if version.VersionDate != "" {
		v, err := strconv.ParseInt(version.VersionDate, 10, 64)
		if err == nil {
			go func() {
				metadata.AddMetricMeta("scollector.version", metadata.Gauge, metadata.None,
					"Scollector version number, which indicates when scollector was built.")
				for {
					if err := collect.Put("version", collect.Tags, v); err != nil {
						slog.Error(err)
					}
					time.Sleep(time.Hour)
				}
			}()
		}
	}
	if *flagBatchSize > 0 {
		collect.BatchSize = *flagBatchSize
	}
	go func() {
		const maxMem = 500 * 1024 * 1024 // 500MB
		var m runtime.MemStats
		for range time.Tick(time.Minute) {
			runtime.ReadMemStats(&m)
			if m.Alloc > maxMem {
				panic("memory max reached")
			}
		}
	}()
	sChan := make(chan os.Signal, 1) // buffered, as signal.Notify requires, so the signal isn't dropped
	signal.Notify(sChan, os.Interrupt)
	<-sChan
	close(cquit)
	// try to flush all datapoints on interrupt, but quit after 5 seconds no matter what.
	time.AfterFunc(5*time.Second, func() {
		os.Exit(0)
	})
	collect.Flush()
}
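
The last few statements of main form a compact shutdown idiom: block until an interrupt arrives, close cquit so the collector goroutines stop producing, then flush with a hard five-second escape hatch so a wedged flush cannot keep the process alive. Isolated as a sketch (waitAndFlush is a hypothetical helper; flush stands in for collect.Flush):

func waitAndFlush(cquit chan<- struct{}, flush func()) {
	sig := make(chan os.Signal, 1) // buffered, as signal.Notify requires
	signal.Notify(sig, os.Interrupt)
	<-sig        // block until Ctrl-C / SIGINT
	close(cquit) // tell all collectors to wind down
	time.AfterFunc(5*time.Second, func() {
		os.Exit(0) // hard deadline: exit even if flush hangs
	})
	flush() // best-effort final send of queued datapoints
}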
Example #8
func main() {
	flag.Parse()
	if *flagToToml != "" {
		toToml(*flagToToml)
		fmt.Println("toml conversion complete; remove all empty values by hand (empty strings, 0)")
		return
	}
	if *flagPrint || *flagDebug {
		slog.Set(&slog.StdLog{Log: log.New(os.Stdout, "", log.LstdFlags)})
	}
	if *flagVersion {
		fmt.Println(version.GetVersionInfo("scollector"))
		os.Exit(0)
	}
	for _, m := range mains {
		m()
	}
	conf := readConf()
	ua := "Scollector/" + version.ShortVersion()
	if conf.UserAgentMessage != "" {
		ua += fmt.Sprintf(" (%s)", conf.UserAgentMessage)
	}
	client := &http.Client{
		Transport: &scollectorHTTPTransport{
			ua,
			&httpcontrol.Transport{
				RequestTimeout: time.Minute,
			},
		},
	}
	http.DefaultClient = client
	collect.DefaultClient = client
	if *flagHost != "" {
		conf.Host = *flagHost
	}
	if *flagNtlm {
		conf.UseNtlm = *flagNtlm
	}
	if *flagFilter != "" {
		conf.Filter = strings.Split(*flagFilter, ",")
	}
	if !conf.Tags.Valid() {
		slog.Fatalf("invalid tags: %v", conf.Tags)
	} else if conf.Tags["host"] != "" {
		slog.Fatalf("host not supported in custom tags, use Hostname instead")
	}
	if conf.PProf != "" {
		go func() {
			slog.Infof("Starting pprof at http://%s/debug/pprof/", conf.PProf)
			slog.Fatal(http.ListenAndServe(conf.PProf, nil))
		}()
	}
	collectors.AddTags = conf.Tags
	util.FullHostname = conf.FullHost
	util.Set()
	if conf.Hostname != "" {
		util.Hostname = conf.Hostname
	}
	if err := collect.SetHostname(util.Hostname); err != nil {
		slog.Fatal(err)
	}
	if conf.ColDir != "" {
		collectors.InitPrograms(conf.ColDir)
	}
	if conf.SNMPTimeout > 0 {
		snmp.Timeout = conf.SNMPTimeout
	}
	var err error
	check := func(e error) {
		if e != nil {
			err = e
		}
	}
	collectors.Init(conf)
	for _, r := range conf.MetricFilters {
		slog.Infof("Adding MetricFilter: %v\n", r)
		check(collectors.AddMetricFilters(r))
	}
	for _, rmq := range conf.RabbitMQ {
		check(collectors.RabbitMQ(rmq.URL))
	}
	for _, cfg := range conf.SNMP {
		check(collectors.SNMP(cfg, conf.MIBS))
	}
	for _, i := range conf.ICMP {
		check(collectors.ICMP(i.Host))
	}
	for _, a := range conf.AWS {
		check(collectors.AWS(a.AccessKey, a.SecretKey, a.Region, a.BillingProductCodesRegex, a.BillingBucketName, a.BillingBucketPath, a.BillingPurgeDays))
	}
	for _, ea := range conf.AzureEA {
		check(collectors.AzureEABilling(ea.EANumber, ea.APIKey, ea.LogBillingDetails))
	}
	for _, v := range conf.Vsphere {
		check(collectors.Vsphere(v.User, v.Password, v.Host))
	}
	for _, p := range conf.Process {
		check(collectors.AddProcessConfig(p))
	}
	for _, p := range conf.ProcessDotNet {
		check(collectors.AddProcessDotNetConfig(p))
	}
	for _, h := range conf.HTTPUnit {
		var freq time.Duration
		var parseerr error
		if h.Freq == "" {
			freq = time.Minute * 5
		} else {
			freq, parseerr = time.ParseDuration(h.Freq)
			if parseerr != nil {
				slog.Fatal(parseerr)
			}
			if freq < time.Second {
				slog.Fatalf("Invalid HTTPUnit frequency %s, cannot be less than 1 second.", h.Freq)
			}
		}
		if h.TOML != "" {
			check(collectors.HTTPUnitTOML(h.TOML, freq))
		}
		if h.Hiera != "" {
			check(collectors.HTTPUnitHiera(h.Hiera, freq))
		}
	}
	for _, r := range conf.Riak {
		check(collectors.Riak(r.URL))
	}

	for _, x := range conf.ExtraHop {
		check(collectors.ExtraHop(x.Host, x.APIKey, x.FilterBy, x.FilterPercent, x.AdditionalMetrics, x.CertificateSubjectMatch, x.CertificateActivityGroup))
	}

	if err != nil {
		slog.Fatal(err)
	}
	collectors.KeepalivedCommunity = conf.KeepalivedCommunity
	// Add all process collectors. This is platform specific.
	collectors.WatchProcesses()
	collectors.WatchProcessesDotNet()

	if *flagFake > 0 {
		collectors.InitFake(*flagFake)
	}
	collect.Debug = *flagDebug
	util.Debug = *flagDebug
	collect.DisableDefaultCollectors = conf.DisableSelf
	c := collectors.Search(conf.Filter)
	if len(c) == 0 {
		slog.Fatalf("Filter %v matches no collectors.", conf.Filter)
	}
	for _, col := range c {
		col.Init()
	}
	err = collectors.AddTagOverrides(c, conf.TagOverride)
	if err != nil {
		slog.Fatalf("Error adding tag overrides: %s", err)
	}
	u, err := parseHost(conf.Host)
	if *flagList {
		list(c)
		return
	} else if *flagPrint {
		u = &url.URL{Scheme: "http", Host: "localhost:0"}
	} else if err != nil {
		slog.Fatalf("invalid host %v: %v", conf.Host, err)
	}
	freq := time.Second * time.Duration(conf.Freq)
	if freq <= 0 {
		slog.Fatal("freq must be > 0")
	}
	collectors.DefaultFreq = freq
	collect.Freq = freq
	if conf.BatchSize < 0 {
		slog.Fatal("BatchSize must be > 0")
	}
	if conf.BatchSize != 0 {
		collect.BatchSize = conf.BatchSize
	}
	collect.Tags = conf.Tags.Copy().Merge(opentsdb.TagSet{"os": runtime.GOOS})
	if *flagPrint {
		collect.Print = true
	}
	if !*flagDisableMetadata {
		if err := metadata.Init(u, *flagDebug); err != nil {
			slog.Fatal(err)
		}
	}
	cdp, cquit := collectors.Run(c)
	if u != nil {
		slog.Infoln("OpenTSDB host:", u)
	}
	collect.UseNtlm = conf.UseNtlm
	if err := collect.InitChan(u, "scollector", cdp); err != nil {
		slog.Fatal(err)
	}
	if !collect.DisableDefaultCollectors && version.VersionDate != "" {
		v, err := strconv.ParseInt(version.VersionDate, 10, 64)
		if err == nil {
			go func() {
				metadata.AddMetricMeta("scollector.version", metadata.Gauge, metadata.None,
					"Scollector version number, which indicates when scollector was built.")
				for {
					if err := collect.Put("version", collect.Tags, v); err != nil {
						slog.Error(err)
					}
					time.Sleep(time.Hour)
				}
			}()
		}
	}
	if *flagBatchSize > 0 {
		collect.BatchSize = *flagBatchSize
	}

	if conf.MaxQueueLen != 0 {
		if conf.MaxQueueLen < collect.BatchSize {
			slog.Fatalf("MaxQueueLen must be >= %d (BatchSize)", collect.BatchSize)
		}
		collect.MaxQueueLen = conf.MaxQueueLen
	}
	maxMemMB := uint64(500)
	if conf.MaxMem != 0 {
		maxMemMB = conf.MaxMem
	}
	go func() {
		var m runtime.MemStats
		for range time.Tick(time.Second * 30) {
			runtime.ReadMemStats(&m)
			allocMB := m.Alloc / 1024 / 1024
			if allocMB > maxMemMB {
				slog.Fatalf("memory max runtime reached: (current alloc: %v megabytes, max: %v megabytes)", allocMB, maxMemMB)
			}
			// See process_windows.go and process_linux.go for total process memory usage.
			// Note that on Linux the rss metric includes shared pages, whereas on
			// Windows the private working set does not include shared memory.
			// Total memory used seems to scale linearly with m.Alloc, but we want
			// this to catch a memory leak outside the runtime (WMI/CGO), so for now
			// just add any runtime allocations to the allowed total limit.
			maxMemTotalMB := maxMemMB + allocMB
			if collectors.TotalScollectorMemoryMB > maxMemTotalMB {
				slog.Fatalf("memory max total reached: (current total: %v megabytes, current runtime alloc: %v megabytes, max: %v megabytes)", collectors.TotalScollectorMemoryMB, allocMB, maxMemTotalMB)
			}
		}
	}()
	sChan := make(chan os.Signal, 1) // buffered, as signal.Notify requires, so the signal isn't dropped
	signal.Notify(sChan, os.Interrupt)
	<-sChan
	close(cquit)
	// try to flush all datapoints on interrupt, but quit after 5 seconds no matter what.
	time.AfterFunc(5*time.Second, func() {
		os.Exit(0)
	})
	collect.Flush()
}
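
Example #8 replaces the fixed 500 MB panic of Example #7 with a two-tier watchdog: a limit on Go-runtime allocations, plus a total-process limit that grows with whatever the runtime has allocated, so it can catch leaks outside the runtime (WMI/CGO) without false positives from normal Go usage. The shape of that check as a standalone sketch (totalProcessMB is a hypothetical stand-in for collectors.TotalScollectorMemoryMB):

func watchMemory(maxMemMB uint64, totalProcessMB func() uint64) {
	var m runtime.MemStats
	for range time.Tick(30 * time.Second) {
		runtime.ReadMemStats(&m)
		allocMB := m.Alloc / 1024 / 1024
		if allocMB > maxMemMB {
			slog.Fatalf("runtime alloc limit reached: %d MB > %d MB", allocMB, maxMemMB)
		}
		// Grant the non-runtime footprint the same budget on top of the
		// runtime's own allocations, so external leaks still trip the limit.
		if total := totalProcessMB(); total > maxMemMB+allocMB {
			slog.Fatalf("total process limit reached: %d MB > %d MB", total, maxMemMB+allocMB)
		}
	}
}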
Example #9
// CollectStates sends various state information to bosun with collect.
func (s *Schedule) CollectStates() {
	// [AlertName][Severity]Count
	severityCounts := make(map[string]map[string]int64)
	abnormalCounts := make(map[string]map[string]int64)
	ackStatusCounts := make(map[string]map[bool]int64)
	activeStatusCounts := make(map[string]map[bool]int64)
	// Initialize the counts
	for _, alert := range s.Conf.Alerts {
		severityCounts[alert.Name] = make(map[string]int64)
		abnormalCounts[alert.Name] = make(map[string]int64)
		var i Status
		for i = 1; i.String() != "none"; i++ {
			severityCounts[alert.Name][i.String()] = 0
			abnormalCounts[alert.Name][i.String()] = 0
		}
		ackStatusCounts[alert.Name] = make(map[bool]int64)
		activeStatusCounts[alert.Name] = make(map[bool]int64)
		ackStatusCounts[alert.Name][false] = 0
		activeStatusCounts[alert.Name][false] = 0
		ackStatusCounts[alert.Name][true] = 0
		activeStatusCounts[alert.Name][true] = 0
	}
	for _, state := range s.status {
		if !state.Open {
			continue
		}
		severity := state.Status().String()
		lastAbnormal := state.AbnormalStatus().String()
		severityCounts[state.Alert][severity]++
		abnormalCounts[state.Alert][lastAbnormal]++
		ackStatusCounts[state.Alert][state.NeedAck]++
		activeStatusCounts[state.Alert][state.IsActive()]++
	}
	for alertName := range severityCounts {
		ts := opentsdb.TagSet{"alert": alertName}
		// The alert's tagset is not included because there is no way to
		// store the string of a group in OpenTSDB in a parsable way: any
		// delimiter we chose could also be part of a tag key or tag value.
		for severity := range severityCounts[alertName] {
			err := collect.Put("alerts.current_severity",
				ts.Copy().Merge(opentsdb.TagSet{"severity": severity}),
				severityCounts[alertName][severity])
			if err != nil {
				slog.Errorln(err)
			}
			err = collect.Put("alerts.last_abnormal_severity",
				ts.Copy().Merge(opentsdb.TagSet{"severity": severity}),
				abnormalCounts[alertName][severity])
			if err != nil {
				slog.Errorln(err)
			}
		}
		err := collect.Put("alerts.acknowledgement_status",
			ts.Copy().Merge(opentsdb.TagSet{"status": "unacknowledged"}),
			ackStatusCounts[alertName][true])
		err = collect.Put("alerts.acknowledgement_status",
			ts.Copy().Merge(opentsdb.TagSet{"status": "acknowledged"}),
			ackStatusCounts[alertName][false])
		if err != nil {
			slog.Errorln(err)
		}
		err = collect.Put("alerts.active_status",
			ts.Copy().Merge(opentsdb.TagSet{"status": "active"}),
			activeStatusCounts[alertName][true])
		if err != nil {
			slog.Errorln(err)
		}
		err = collect.Put("alerts.active_status",
			ts.Copy().Merge(opentsdb.TagSet{"status": "inactive"}),
			activeStatusCounts[alertName][false])
		if err != nil {
			slog.Errorln(err)
		}
	}
}
Example #10
func main() {
	flag.Parse()
	if *flagToToml != "" {
		toToml(*flagToToml)
		fmt.Println("toml conversion complete; remove all empty values by hand (empty strings, 0)")
		return
	}
	if *flagPrint || *flagDebug {
		slog.Set(&slog.StdLog{Log: log.New(os.Stdout, "", log.LstdFlags)})
	}
	if *flagVersion {
		fmt.Println(version.GetVersionInfo("scollector"))
		os.Exit(0)
	}
	for _, m := range mains {
		m()
	}
	conf := readConf()
	if *flagHost != "" {
		conf.Host = *flagHost
	}
	if *flagFilter != "" {
		conf.Filter = strings.Split(*flagFilter, ",")
	}
	if !conf.Tags.Valid() {
		slog.Fatalf("invalid tags: %v", conf.Tags)
	} else if conf.Tags["host"] != "" {
		slog.Fatalf("host not supported in custom tags, use Hostname instead")
	}
	collectors.AddTags = conf.Tags
	util.FullHostname = conf.FullHost
	util.Set()
	if conf.Hostname != "" {
		util.Hostname = conf.Hostname
		if err := collect.SetHostname(conf.Hostname); err != nil {
			slog.Fatal(err)
		}
	}
	if conf.ColDir != "" {
		collectors.InitPrograms(conf.ColDir)
	}
	var err error
	check := func(e error) {
		if e != nil {
			err = e
		}
	}
	for _, h := range conf.HAProxy {
		for _, i := range h.Instances {
			collectors.HAProxy(h.User, h.Password, i.Tier, i.URL)
		}
	}
	for _, s := range conf.SNMP {
		check(collectors.SNMP(s.Community, s.Host))
	}
	for _, i := range conf.ICMP {
		check(collectors.ICMP(i.Host))
	}
	for _, a := range conf.AWS {
		check(collectors.AWS(a.AccessKey, a.SecretKey, a.Region))
	}
	for _, v := range conf.Vsphere {
		check(collectors.Vsphere(v.User, v.Password, v.Host))
	}
	for _, p := range conf.Process {
		check(collectors.AddProcessConfig(p))
	}
	for _, h := range conf.HTTPUnit {
		if h.TOML != "" {
			check(collectors.HTTPUnitTOML(h.TOML))
		}
		if h.Hiera != "" {
			check(collectors.HTTPUnitHiera(h.Hiera))
		}
	}
	if err != nil {
		slog.Fatal(err)
	}
	collectors.KeepalivedCommunity = conf.KeepalivedCommunity
	// Add all process collectors. This is platform specific.
	collectors.WatchProcesses()
	collectors.WatchProcessesDotNet()

	if *flagFake > 0 {
		collectors.InitFake(*flagFake)
	}
	collect.Debug = *flagDebug
	util.Debug = *flagDebug
	collect.DisableDefaultCollectors = conf.DisableSelf
	c := collectors.Search(conf.Filter)
	if len(c) == 0 {
		slog.Fatalf("Filter %v matches no collectors.", conf.Filter)
	}
	for _, col := range c {
		col.Init()
	}
	u, err := parseHost(conf.Host)
	if *flagList {
		list(c)
		return
	} else if err != nil {
		slog.Fatalf("invalid host %v: %v", conf.Host, err)
	}
	freq := time.Second * time.Duration(conf.Freq)
	if freq <= 0 {
		slog.Fatal("freq must be > 0")
	}
	collectors.DefaultFreq = freq
	collect.Freq = freq
	collect.Tags = opentsdb.TagSet{"os": runtime.GOOS}
	if *flagPrint {
		collect.Print = true
	}
	if !*flagDisableMetadata {
		if err := metadata.Init(u, *flagDebug); err != nil {
			slog.Fatal(err)
		}
	}
	cdp := collectors.Run(c)
	if u != nil {
		slog.Infoln("OpenTSDB host:", u)
	}
	if err := collect.InitChan(u, "scollector", cdp); err != nil {
		slog.Fatal(err)
	}

	if version.VersionDate != "" {
		v, err := strconv.ParseInt(version.VersionDate, 10, 64)
		if err == nil {
			go func() {
				metadata.AddMetricMeta("scollector.version", metadata.Gauge, metadata.None,
					"Scollector version number, which indicates when scollector was built.")
				for {
					if err := collect.Put("version", collect.Tags, v); err != nil {
						slog.Error(err)
					}
					time.Sleep(time.Hour)
				}
			}()
		}
	}
	if *flagBatchSize > 0 {
		collect.BatchSize = *flagBatchSize
	}
	go func() {
		const maxMem = 500 * 1024 * 1024 // 500MB
		var m runtime.MemStats
		for range time.Tick(time.Minute) {
			runtime.ReadMemStats(&m)
			if m.Alloc > maxMem {
				panic("memory max reached")
			}
		}
	}()
	select {}
}