func watchConfig(fileName string, mapper *metricMapper) {
	watcher, err := fsnotify.NewWatcher()
	if err != nil {
		log.Fatal(err)
	}

	err = watcher.WatchFlags(fileName, fsnotify.FSN_MODIFY)
	if err != nil {
		log.Fatal(err)
	}

	for {
		select {
		case ev := <-watcher.Event:
			log.Infof("Config file changed (%s), attempting reload", ev)
			err = mapper.initFromFile(fileName)
			if err != nil {
				log.Errorln("Error reloading config:", err)
				configLoads.WithLabelValues("failure").Inc()
			} else {
				log.Infoln("Config reloaded successfully")
				configLoads.WithLabelValues("success").Inc()
			}
			// Re-add the file watcher since it can get lost on some changes. E.g.
			// saving a file with vim results in a RENAME-MODIFY-DELETE event
			// sequence, after which the newly written file is no longer watched.
			err = watcher.WatchFlags(fileName, fsnotify.FSN_MODIFY)
		case err := <-watcher.Error:
			log.Errorln("Error watching config:", err)
		}
	}
}
func (s *memorySeriesStorage) loop() {
	checkpointTimer := time.NewTimer(s.checkpointInterval)

	dirtySeriesCount := 0

	defer func() {
		checkpointTimer.Stop()
		log.Info("Maintenance loop stopped.")
		close(s.loopStopped)
	}()

	memoryFingerprints := s.cycleThroughMemoryFingerprints()
	archivedFingerprints := s.cycleThroughArchivedFingerprints()

loop:
	for {
		select {
		case <-s.loopStopping:
			break loop
		case <-checkpointTimer.C:
			err := s.persistence.checkpointSeriesMapAndHeads(s.fpToSeries, s.fpLocker)
			if err != nil {
				log.Errorln("Error while checkpointing:", err)
			} else {
				dirtySeriesCount = 0
			}
			// If a checkpoint takes longer than checkpointInterval, unluckily timed
			// combination with the Reset(0) call below can lead to a case where a
			// time is lurking in C leading to repeated checkpointing without break.
			select {
			case <-checkpointTimer.C: // Get rid of the lurking time.
			default:
			}
			checkpointTimer.Reset(s.checkpointInterval)
		case fp := <-memoryFingerprints:
			if s.maintainMemorySeries(fp, model.Now().Add(-s.dropAfter)) {
				dirtySeriesCount++
				// Check if we have enough "dirty" series so that we need an early checkpoint.
				// However, if we are already behind persisting chunks, creating a checkpoint
				// would be counterproductive, as it would slow down chunk persisting even more,
				// while in a situation like that, where we are clearly lacking speed of disk
				// maintenance, the best we can do for crash recovery is to persist chunks as
				// quickly as possible. So only checkpoint if the urgency score is < 1.
				if dirtySeriesCount >= s.checkpointDirtySeriesLimit &&
					s.calculatePersistenceUrgencyScore() < 1 {
					checkpointTimer.Reset(0)
				}
			}
		case fp := <-archivedFingerprints:
			s.maintainArchivedSeries(fp, model.Now().Add(-s.dropAfter))
		}
	}
	// Wait until both channels are closed.
	for range memoryFingerprints {
	}
	for range archivedFingerprints {
	}
}
func (dms *DiskMetricStore) loop(persistenceInterval time.Duration) {
	lastPersist := time.Now()
	persistScheduled := false
	lastWrite := time.Time{}
	persistDone := make(chan time.Time)
	var persistTimer *time.Timer

	checkPersist := func() {
		if !persistScheduled && lastWrite.After(lastPersist) {
			persistTimer = time.AfterFunc(
				persistenceInterval-lastWrite.Sub(lastPersist),
				func() {
					persistStarted := time.Now()
					if err := dms.persist(); err != nil {
						log.Errorln("Error persisting metrics:", err)
					} else {
						log.Infof(
							"Metrics persisted to '%s'.",
							dms.persistenceFile,
						)
					}
					persistDone <- persistStarted
				},
			)
			persistScheduled = true
		}
	}

	for {
		select {
		case wr := <-dms.writeQueue:
			dms.processWriteRequest(wr)
			lastWrite = time.Now()
			checkPersist()
		case lastPersist = <-persistDone:
			persistScheduled = false
			checkPersist() // In case something has been written in the meantime.
		case <-dms.drain:
			// Prevent a scheduled persist from firing later.
			if persistTimer != nil {
				persistTimer.Stop()
			}
			// Now draining...
			for {
				select {
				case wr := <-dms.writeQueue:
					dms.processWriteRequest(wr)
				default:
					dms.done <- dms.persist()
					return
				}
			}
		}
	}
}
// Start implements Storage.
func (s *memorySeriesStorage) Start() (err error) {
	var syncStrategy syncStrategy
	switch s.options.SyncStrategy {
	case Never:
		syncStrategy = func() bool { return false }
	case Always:
		syncStrategy = func() bool { return true }
	case Adaptive:
		syncStrategy = func() bool { return s.calculatePersistenceUrgencyScore() < 1 }
	default:
		panic("unknown sync strategy")
	}

	var p *persistence
	p, err = newPersistence(
		s.options.PersistenceStoragePath,
		s.options.Dirty, s.options.PedanticChecks,
		syncStrategy,
		s.options.MinShrinkRatio,
	)
	if err != nil {
		return err
	}
	s.persistence = p
	// Persistence must start running before loadSeriesMapAndHeads() is called.
	go s.persistence.run()

	defer func() {
		if err != nil {
			if e := p.close(); e != nil {
				log.Errorln("Error closing persistence:", e)
			}
		}
	}()

	log.Info("Loading series map and head chunks...")
	s.fpToSeries, s.numChunksToPersist, err = p.loadSeriesMapAndHeads()
	if err != nil {
		return err
	}
	log.Infof("%d series loaded.", s.fpToSeries.length())
	s.numSeries.Set(float64(s.fpToSeries.length()))

	s.mapper, err = newFPMapper(s.fpToSeries, p)
	if err != nil {
		return err
	}

	go s.handleEvictList()
	go s.handleQuarantine()
	go s.logThrottling()
	go s.loop()

	return nil
}
// NewDiskMetricStore returns a DiskMetricStore ready to use. To cleanly shut it
// down and free resources, the Shutdown() method has to be called. If
// persistenceFile is the empty string, no persisting to disk will
// happen. Otherwise, a file of that name is used for persisting metrics to
// disk. If the file already exists, metrics are read from it as part of the
// start-up. Persisting happens upon shutdown and after every write action,
// but the latter only if at least persistenceInterval has passed since the
// previous persisting.
func NewDiskMetricStore(
	persistenceFile string,
	persistenceInterval time.Duration,
) *DiskMetricStore {
	dms := &DiskMetricStore{
		writeQueue:      make(chan WriteRequest, writeQueueCapacity),
		drain:           make(chan struct{}),
		done:            make(chan error),
		metricGroups:    GroupingKeyToMetricGroup{},
		persistenceFile: persistenceFile,
	}
	if err := dms.restore(); err != nil {
		log.Errorln("Could not load persisted metrics:", err)
		log.Info("Retrying assuming legacy format for persisted metrics...")
		if err := dms.legacyRestore(); err != nil {
			log.Errorln("Could not load persisted metrics in legacy format:", err)
		}
	}
	go dms.loop(persistenceInterval)
	return dms
}
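A minimal lifecycle sketch for the store, assuming the pushgateway storage and client_golang packages used elsewhere in this section; the file path and interval are illustrative values, not defaults taken from this code:

	// Hypothetical wiring: create the store, expose its metric families to the
	// Prometheus client library, and persist outstanding state on shutdown.
	ms := storage.NewDiskMetricStore("/tmp/pushgateway.persist", 5*time.Minute)
	prometheus.SetMetricFamilyInjectionHook(ms.GetMetricFamilies)
	defer func() {
		if err := ms.Shutdown(); err != nil {
			log.Errorln("Problem shutting down metric storage:", err)
		}
	}()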
func (s *memorySeriesStorage) loop() {
	checkpointTimer := time.NewTimer(s.checkpointInterval)

	dirtySeriesCount := 0

	defer func() {
		checkpointTimer.Stop()
		log.Info("Maintenance loop stopped.")
		close(s.loopStopped)
	}()

	memoryFingerprints := s.cycleThroughMemoryFingerprints()
	archivedFingerprints := s.cycleThroughArchivedFingerprints()

loop:
	for {
		select {
		case <-s.loopStopping:
			break loop
		case <-checkpointTimer.C:
			err := s.persistence.checkpointSeriesMapAndHeads(s.fpToSeries, s.fpLocker)
			if err != nil {
				log.Errorln("Error while checkpointing:", err)
			} else {
				dirtySeriesCount = 0
			}
			checkpointTimer.Reset(s.checkpointInterval)
		case fp := <-memoryFingerprints:
			if s.maintainMemorySeries(fp, model.Now().Add(-s.dropAfter)) {
				dirtySeriesCount++
				// Check if we have enough "dirty" series so that we need an early checkpoint.
				// However, if we are already behind persisting chunks, creating a checkpoint
				// would be counterproductive, as it would slow down chunk persisting even more,
				// while in a situation like that, where we are clearly lacking speed of disk
				// maintenance, the best we can do for crash recovery is to persist chunks as
				// quickly as possible. So only checkpoint if the storage is not in "graceful
				// degradation mode".
				if dirtySeriesCount >= s.checkpointDirtySeriesLimit && !s.isDegraded() {
					checkpointTimer.Reset(0)
				}
			}
		case fp := <-archivedFingerprints:
			s.maintainArchivedSeries(fp, model.Now().Add(-s.dropAfter))
		}
	}
	// Wait until both channels are closed.
	for range memoryFingerprints {
	}
	for range archivedFingerprints {
	}
}
func (d *Device) recvResponse() {
	var n int
	buf := make([]byte, 4)
	for resp := range d.respChan {
		err := d.completeCommand(resp)
		if err != nil {
			log.Errorf("error completing command: %s", err)
			return
		}
		/* Tell the fd there's something new */
		n, err = unix.Write(d.uioFd, buf)
		if n == -1 && err != nil {
			log.Errorln("poll write")
			return
		}
	}
}
// Check and update the exporters query maps if the version has changed.
func (e *Exporter) checkMapVersions(ch chan<- prometheus.Metric, db *sql.DB) error {
	log.Debugln("Querying Postgres Version")
	versionRow := db.QueryRow("SELECT version();")
	var versionString string
	err := versionRow.Scan(&versionString)
	if err != nil {
		return errors.New(fmt.Sprintln("Error scanning version string:", err))
	}
	semanticVersion, err := parseVersion(versionString)
	if err != nil {
		return errors.New(fmt.Sprintln("Error parsing version string:", err))
	}

	// Check if semantic version changed and recalculate maps if needed.
	if semanticVersion.NE(e.lastMapVersion) || e.variableMap == nil || e.metricMap == nil {
		log.Infoln("Semantic Version Changed:", e.lastMapVersion.String(), "->", semanticVersion.String())
		e.mappingMtx.Lock()

		e.variableMap = makeDescMap(semanticVersion, variableMaps)
		e.metricMap = makeDescMap(semanticVersion, metricMaps)
		e.queryOverrides = makeQueryOverrideMap(semanticVersion, queryOverrides)
		e.lastMapVersion = semanticVersion

		if e.userQueriesPath != "" {
			if err := addQueries(e.userQueriesPath, semanticVersion, e.metricMap, e.queryOverrides); err != nil {
				log.Errorln("Failed to reload user queries:", e.userQueriesPath, err)
			}
		}

		e.mappingMtx.Unlock()
	}

	// Output the version as a special metric
	versionDesc := prometheus.NewDesc(fmt.Sprintf("%s_%s", namespace, staticLabelName),
		"Version string as reported by postgres", []string{"version", "short_version"}, nil)

	ch <- prometheus.MustNewConstMetric(versionDesc,
		prometheus.UntypedValue, 1, versionString, semanticVersion.String())
	return nil
}
// Main manages the startup and shutdown lifecycle of the entire Prometheus server.
func Main() int {
	if err := parse(os.Args[1:]); err != nil {
		return 2
	}

	printVersion()
	if cfg.printVersion {
		return 0
	}

	var reloadables []Reloadable

	var (
		memStorage     = local.NewMemorySeriesStorage(&cfg.storage)
		remoteStorage  = remote.New(&cfg.remote)
		sampleAppender = storage.Fanout{memStorage}
	)
	if remoteStorage != nil {
		sampleAppender = append(sampleAppender, remoteStorage)
		reloadables = append(reloadables, remoteStorage)
	}

	var (
		notificationHandler = notification.New(&cfg.notification)
		targetManager       = retrieval.NewTargetManager(sampleAppender)
		queryEngine         = promql.NewEngine(memStorage, &cfg.queryEngine)
	)

	ruleManager := rules.NewManager(&rules.ManagerOptions{
		SampleAppender:      sampleAppender,
		NotificationHandler: notificationHandler,
		QueryEngine:         queryEngine,
		ExternalURL:         cfg.web.ExternalURL,
	})

	flags := map[string]string{}
	cfg.fs.VisitAll(func(f *flag.Flag) {
		flags[f.Name] = f.Value.String()
	})

	status := &web.PrometheusStatus{
		TargetPools: targetManager.Pools,
		Rules:       ruleManager.Rules,
		Flags:       flags,
		Birth:       time.Now(),
	}

	webHandler := web.New(memStorage, queryEngine, ruleManager, status, &cfg.web)

	reloadables = append(reloadables, status, targetManager, ruleManager, webHandler, notificationHandler)

	if !reloadConfig(cfg.configFile, reloadables...) {
		return 1
	}

	// Wait for reload or termination signals. Start the handler for SIGHUP as
	// early as possible, but ignore it until we are ready to handle reloading
	// our config.
	hup := make(chan os.Signal)
	hupReady := make(chan bool)
	signal.Notify(hup, syscall.SIGHUP)
	go func() {
		<-hupReady
		for {
			select {
			case <-hup:
			case <-webHandler.Reload():
			}
			reloadConfig(cfg.configFile, reloadables...)
		}
	}()

	// Start all components.
	if err := memStorage.Start(); err != nil {
		log.Errorln("Error opening memory series storage:", err)
		return 1
	}
	defer func() {
		if err := memStorage.Stop(); err != nil {
			log.Errorln("Error stopping storage:", err)
		}
	}()

	if remoteStorage != nil {
		prometheus.MustRegister(remoteStorage)

		go remoteStorage.Run()
		defer remoteStorage.Stop()
	}

	// The storage has to be fully initialized before registering.
	prometheus.MustRegister(memStorage)
	prometheus.MustRegister(notificationHandler)
	prometheus.MustRegister(configSuccess)
	prometheus.MustRegister(configSuccessTime)

	defer ruleManager.Stop()

	go notificationHandler.Run()
	defer notificationHandler.Stop()

	go targetManager.Run()
	defer targetManager.Stop()

	defer queryEngine.Stop()

	go webHandler.Run()

	// Wait for reload or termination signals.
	close(hupReady) // Unblock SIGHUP handler.

	term := make(chan os.Signal)
	signal.Notify(term, os.Interrupt, syscall.SIGTERM)
	select {
	case <-term:
		log.Warn("Received SIGTERM, exiting gracefully...")
	case <-webHandler.Quit():
		log.Warn("Received termination request via web service, exiting gracefully...")
	case err := <-webHandler.ListenError():
		log.Errorln("Error starting web server, exiting gracefully:", err)
	}

	log.Info("See you next time!")
	return 0
}
// Main manages the startup and shutdown lifecycle of the entire Prometheus server.
func Main() int {
	if err := parse(os.Args[1:]); err != nil {
		log.Error(err)
		return 2
	}

	if cfg.printVersion {
		fmt.Fprintln(os.Stdout, version.Print("prometheus"))
		return 0
	}

	log.Infoln("Starting prometheus", version.Info())
	log.Infoln("Build context", version.BuildContext())

	var reloadables []Reloadable

	var (
		memStorage     = local.NewMemorySeriesStorage(&cfg.storage)
		remoteStorage  = remote.New(&cfg.remote)
		sampleAppender = storage.Fanout{memStorage}
	)
	if remoteStorage != nil {
		sampleAppender = append(sampleAppender, remoteStorage)
		reloadables = append(reloadables, remoteStorage)
	}

	var (
		notifier      = notifier.New(&cfg.notifier)
		targetManager = retrieval.NewTargetManager(sampleAppender)
		queryEngine   = promql.NewEngine(memStorage, &cfg.queryEngine)
	)

	ruleManager := rules.NewManager(&rules.ManagerOptions{
		SampleAppender: sampleAppender,
		Notifier:       notifier,
		QueryEngine:    queryEngine,
		ExternalURL:    cfg.web.ExternalURL,
	})

	flags := map[string]string{}
	cfg.fs.VisitAll(func(f *flag.Flag) {
		flags[f.Name] = f.Value.String()
	})

	version := &web.PrometheusVersion{
		Version:   version.Version,
		Revision:  version.Revision,
		Branch:    version.Branch,
		BuildUser: version.BuildUser,
		BuildDate: version.BuildDate,
		GoVersion: version.GoVersion,
	}

	webHandler := web.New(memStorage, queryEngine, targetManager, ruleManager, version, flags, &cfg.web)

	reloadables = append(reloadables, targetManager, ruleManager, webHandler, notifier)

	if !reloadConfig(cfg.configFile, reloadables...) {
		return 1
	}

	// Wait for reload or termination signals. Start the handler for SIGHUP as
	// early as possible, but ignore it until we are ready to handle reloading
	// our config.
	hup := make(chan os.Signal)
	hupReady := make(chan bool)
	signal.Notify(hup, syscall.SIGHUP)
	go func() {
		<-hupReady
		for {
			select {
			case <-hup:
			case <-webHandler.Reload():
			}
			reloadConfig(cfg.configFile, reloadables...)
		}
	}()

	// Start all components. The order is NOT arbitrary.
	if err := memStorage.Start(); err != nil {
		log.Errorln("Error opening memory series storage:", err)
		return 1
	}
	defer func() {
		if err := memStorage.Stop(); err != nil {
			log.Errorln("Error stopping storage:", err)
		}
	}()

	if remoteStorage != nil {
		prometheus.MustRegister(remoteStorage)

		go remoteStorage.Run()
		defer remoteStorage.Stop()
	}

	// The storage has to be fully initialized before registering.
	prometheus.MustRegister(memStorage)
	prometheus.MustRegister(notifier)
	prometheus.MustRegister(configSuccess)
	prometheus.MustRegister(configSuccessTime)

	// The notifier is a dependency of the rule manager. It has to be
	// started before and torn down afterwards.
	go notifier.Run()
	defer notifier.Stop()

	go ruleManager.Run()
	defer ruleManager.Stop()

	go targetManager.Run()
	defer targetManager.Stop()

	// Shutting down the query engine before the rule manager will cause pending queries
	// to be canceled and ensures a quick shutdown of the rule manager.
	defer queryEngine.Stop()

	go webHandler.Run()

	// Wait for reload or termination signals.
	close(hupReady) // Unblock SIGHUP handler.

	term := make(chan os.Signal)
	signal.Notify(term, os.Interrupt, syscall.SIGTERM)
	select {
	case <-term:
		log.Warn("Received SIGTERM, exiting gracefully...")
	case <-webHandler.Quit():
		log.Warn("Received termination request via web service, exiting gracefully...")
	case err := <-webHandler.ListenError():
		log.Errorln("Error starting web server, exiting gracefully:", err)
	}

	log.Info("See you next time!")
	return 0
}
func (l *StatsDListener) handlePacket(packet []byte, e chan<- Events) {
	lines := strings.Split(string(packet), "\n")
	events := Events{}
	for _, line := range lines {
		if line == "" {
			continue
		}

		elements := strings.SplitN(line, ":", 2)
		if len(elements) < 2 || len(elements[0]) == 0 || !utf8.ValidString(line) {
			networkStats.WithLabelValues("malformed_line").Inc()
			log.Errorln("Bad line from StatsD:", line)
			continue
		}
		metric := elements[0]
		var samples []string
		if strings.Contains(elements[1], "|#") {
			// using datadog extensions, disable multi-metrics
			samples = elements[1:]
		} else {
			samples = strings.Split(elements[1], ":")
		}
	samples:
		for _, sample := range samples {
			components := strings.Split(sample, "|")
			samplingFactor := 1.0
			if len(components) < 2 || len(components) > 4 {
				networkStats.WithLabelValues("malformed_component").Inc()
				log.Errorln("Bad component on line:", line)
				continue
			}
			valueStr, statType := components[0], components[1]

			value, err := strconv.ParseFloat(valueStr, 64)
			if err != nil {
				log.Errorf("Bad value %s on line: %s", valueStr, line)
				networkStats.WithLabelValues("malformed_value").Inc()
				continue
			}

			labels := map[string]string{}
			if len(components) >= 3 {
				for _, component := range components[2:] {
					if len(component) == 0 {
						log.Errorln("Empty component on line: ", line)
						networkStats.WithLabelValues("malformed_component").Inc()
						continue samples
					}
				}

				for _, component := range components[2:] {
					switch component[0] {
					case '@':
						if statType != "c" {
							log.Errorln("Illegal sampling factor for non-counter metric on line", line)
							networkStats.WithLabelValues("illegal_sample_factor").Inc()
						}
						samplingFactor, err = strconv.ParseFloat(component[1:], 64)
						if err != nil {
							log.Errorf("Invalid sampling factor %s on line %s", component[1:], line)
							networkStats.WithLabelValues("invalid_sample_factor").Inc()
						}
						if samplingFactor == 0 {
							samplingFactor = 1
						}
						value /= samplingFactor
					case '#':
						labels = parseDogStatsDTagsToLabels(component)
					default:
						log.Errorf("Invalid sampling factor or tag section %s on line %s", components[2], line)
						networkStats.WithLabelValues("invalid_sample_factor").Inc()
						continue
					}
				}
			}

			event, err := buildEvent(statType, metric, value, labels)
			if err != nil {
				log.Errorf("Error building event on line %s: %s", line, err)
				networkStats.WithLabelValues("illegal_event").Inc()
				continue
			}
			events = append(events, event)
			networkStats.WithLabelValues("legal").Inc()
		}
	}
	e <- events
}
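For reference, the line format accepted by handlePacket above is metric:value|type, optionally followed by |@rate (counters only) and a DogStatsD |#tag:value,... section. A hedged illustration; l is assumed to be an initialized *StatsDListener, and the metric names are made up:

	// Feed one packet containing a sampled counter with DogStatsD tags and a
	// plain timer; the resulting Events batch is sent on the channel.
	events := make(chan Events, 1)
	l.handlePacket([]byte("api.requests:1|c|@0.5|#handler:status\napi.latency:320|ms"), events)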
// Main manages the startup and shutdown lifecycle of the entire Prometheus server.
func Main() int {
	if err := parse(os.Args[1:]); err != nil {
		log.Error(err)
		return 2
	}

	if cfg.printVersion {
		fmt.Fprintln(os.Stdout, version.Print("prometheus"))
		return 0
	}

	log.Infoln("Starting prometheus", version.Info())
	log.Infoln("Build context", version.BuildContext())

	var (
		sampleAppender = storage.Fanout{}
		reloadables    []Reloadable
	)

	var localStorage local.Storage
	switch cfg.localStorageEngine {
	case "persisted":
		localStorage = local.NewMemorySeriesStorage(&cfg.storage)
		sampleAppender = storage.Fanout{localStorage}
	case "none":
		localStorage = &local.NoopStorage{}
	default:
		log.Errorf("Invalid local storage engine %q", cfg.localStorageEngine)
		return 1
	}

	remoteStorage, err := remote.New(&cfg.remote)
	if err != nil {
		log.Errorf("Error initializing remote storage: %s", err)
		return 1
	}
	if remoteStorage != nil {
		sampleAppender = append(sampleAppender, remoteStorage)
		reloadables = append(reloadables, remoteStorage)
	}

	reloadableRemoteStorage := remote.NewConfigurable()
	sampleAppender = append(sampleAppender, reloadableRemoteStorage)
	reloadables = append(reloadables, reloadableRemoteStorage)

	var (
		notifier       = notifier.New(&cfg.notifier)
		targetManager  = retrieval.NewTargetManager(sampleAppender)
		queryEngine    = promql.NewEngine(localStorage, &cfg.queryEngine)
		ctx, cancelCtx = context.WithCancel(context.Background())
	)

	ruleManager := rules.NewManager(&rules.ManagerOptions{
		SampleAppender: sampleAppender,
		Notifier:       notifier,
		QueryEngine:    queryEngine,
		Context:        ctx,
		ExternalURL:    cfg.web.ExternalURL,
	})

	cfg.web.Context = ctx
	cfg.web.Storage = localStorage
	cfg.web.QueryEngine = queryEngine
	cfg.web.TargetManager = targetManager
	cfg.web.RuleManager = ruleManager

	cfg.web.Version = &web.PrometheusVersion{
		Version:   version.Version,
		Revision:  version.Revision,
		Branch:    version.Branch,
		BuildUser: version.BuildUser,
		BuildDate: version.BuildDate,
		GoVersion: version.GoVersion,
	}

	cfg.web.Flags = map[string]string{}
	cfg.fs.VisitAll(func(f *flag.Flag) {
		cfg.web.Flags[f.Name] = f.Value.String()
	})

	webHandler := web.New(&cfg.web)

	reloadables = append(reloadables, targetManager, ruleManager, webHandler, notifier)

	if err := reloadConfig(cfg.configFile, reloadables...); err != nil {
		log.Errorf("Error loading config: %s", err)
		return 1
	}

	// Wait for reload or termination signals. Start the handler for SIGHUP as
	// early as possible, but ignore it until we are ready to handle reloading
	// our config.
	hup := make(chan os.Signal)
	hupReady := make(chan bool)
	signal.Notify(hup, syscall.SIGHUP)
	go func() {
		<-hupReady
		for {
			select {
			case <-hup:
				if err := reloadConfig(cfg.configFile, reloadables...); err != nil {
					log.Errorf("Error reloading config: %s", err)
				}
			case rc := <-webHandler.Reload():
				if err := reloadConfig(cfg.configFile, reloadables...); err != nil {
					log.Errorf("Error reloading config: %s", err)
					rc <- err
				} else {
					rc <- nil
				}
			}
		}
	}()

	// Start all components. The order is NOT arbitrary.
	if err := localStorage.Start(); err != nil {
		log.Errorln("Error opening memory series storage:", err)
		return 1
	}
	defer func() {
		if err := localStorage.Stop(); err != nil {
			log.Errorln("Error stopping storage:", err)
		}
	}()

	if remoteStorage != nil {
		remoteStorage.Start()
		defer remoteStorage.Stop()
	}

	defer reloadableRemoteStorage.Stop()

	// The storage has to be fully initialized before registering.
	if instrumentedStorage, ok := localStorage.(prometheus.Collector); ok {
		prometheus.MustRegister(instrumentedStorage)
	}
	prometheus.MustRegister(notifier)
	prometheus.MustRegister(configSuccess)
	prometheus.MustRegister(configSuccessTime)

	// The notifier is a dependency of the rule manager. It has to be
	// started before and torn down afterwards.
	go notifier.Run()
	defer notifier.Stop()

	go ruleManager.Run()
	defer ruleManager.Stop()

	go targetManager.Run()
	defer targetManager.Stop()

	// Shutting down the query engine before the rule manager will cause pending queries
	// to be canceled and ensures a quick shutdown of the rule manager.
	defer cancelCtx()

	go webHandler.Run()

	// Wait for reload or termination signals.
	close(hupReady) // Unblock SIGHUP handler.

	term := make(chan os.Signal)
	signal.Notify(term, os.Interrupt, syscall.SIGTERM)
	select {
	case <-term:
		log.Warn("Received SIGTERM, exiting gracefully...")
	case <-webHandler.Quit():
		log.Warn("Received termination request via web service, exiting gracefully...")
	case err := <-webHandler.ListenError():
		log.Errorln("Error starting web server, exiting gracefully:", err)
	}

	log.Info("See you next time!")
	return 0
}
// Turn the MetricMap column mapping into a prometheus descriptor mapping.
func makeDescMap(pgVersion semver.Version, metricMaps map[string]map[string]ColumnMapping) map[string]MetricMapNamespace {
	var metricMap = make(map[string]MetricMapNamespace)

	for namespace, mappings := range metricMaps {
		thisMap := make(map[string]MetricMap)

		// Get the constant labels
		var constLabels []string
		for columnName, columnMapping := range mappings {
			if columnMapping.usage == LABEL {
				constLabels = append(constLabels, columnName)
			}
		}

		for columnName, columnMapping := range mappings {
			// Check column version compatibility for the current map
			// Force to discard if not compatible.
			if columnMapping.supportedVersions != nil {
				if !columnMapping.supportedVersions(pgVersion) {
					// It's very useful to be able to see what columns are being
					// rejected.
					log.Debugln(columnName, "is being forced to discard due to version incompatibility.")
					thisMap[columnName] = MetricMap{
						discard: true,
						conversion: func(in interface{}) (float64, bool) {
							return math.NaN(), true
						},
					}
					continue
				}
			}

			// Determine how to convert the column based on its usage.
			switch columnMapping.usage {
			case DISCARD, LABEL:
				thisMap[columnName] = MetricMap{
					discard: true,
					conversion: func(in interface{}) (float64, bool) {
						return math.NaN(), true
					},
				}
			case COUNTER:
				thisMap[columnName] = MetricMap{
					vtype: prometheus.CounterValue,
					desc:  prometheus.NewDesc(fmt.Sprintf("%s_%s", namespace, columnName), columnMapping.description, constLabels, nil),
					conversion: func(in interface{}) (float64, bool) {
						return dbToFloat64(in)
					},
				}
			case GAUGE:
				thisMap[columnName] = MetricMap{
					vtype: prometheus.GaugeValue,
					desc:  prometheus.NewDesc(fmt.Sprintf("%s_%s", namespace, columnName), columnMapping.description, constLabels, nil),
					conversion: func(in interface{}) (float64, bool) {
						return dbToFloat64(in)
					},
				}
			case MAPPEDMETRIC:
				thisMap[columnName] = MetricMap{
					vtype: prometheus.GaugeValue,
					desc:  prometheus.NewDesc(fmt.Sprintf("%s_%s", namespace, columnName), columnMapping.description, constLabels, nil),
					conversion: func(in interface{}) (float64, bool) {
						text, ok := in.(string)
						if !ok {
							return math.NaN(), false
						}

						val, ok := columnMapping.mapping[text]
						if !ok {
							return math.NaN(), false
						}
						return val, true
					},
				}
			case DURATION:
				thisMap[columnName] = MetricMap{
					vtype: prometheus.GaugeValue,
					desc:  prometheus.NewDesc(fmt.Sprintf("%s_%s_milliseconds", namespace, columnName), columnMapping.description, constLabels, nil),
					conversion: func(in interface{}) (float64, bool) {
						var durationString string
						switch t := in.(type) {
						case []byte:
							durationString = string(t)
						case string:
							durationString = t
						default:
							log.Errorln("DURATION conversion metric was not a string")
							return math.NaN(), false
						}

						if durationString == "-1" {
							return math.NaN(), false
						}

						d, err := time.ParseDuration(durationString)
						if err != nil {
							log.Errorln("Failed converting result to metric:", columnName, in, err)
							return math.NaN(), false
						}
						return float64(d / time.Millisecond), true
					},
				}
			}
		}

		metricMap[namespace] = MetricMapNamespace{constLabels, thisMap}
	}

	return metricMap
}
// Sends a single ICMP echo to an IP and returns success and latency information.
// Borrowed from Brian Brazil's blackbox exporter
func Ping(ip net.IP, maxRTT time.Duration) (success bool, latency time.Duration) {
	deadline := time.Now().Add(maxRTT)

	var socket *icmp.PacketConn
	var err error
	if isIPv4(ip) {
		socket, err = icmp.ListenPacket("ip4:icmp", "0.0.0.0")
	} else if isIPv6(ip) {
		socket, err = icmp.ListenPacket("ip6:ipv6-icmp", "::")
	} else {
		log.Errorln("IP did not match any known types?")
		return
	}
	if err != nil {
		log.Errorf("Error listening to socket: %s", err)
		return
	}
	defer socket.Close()

	seq := getICMPSequence()
	pid := os.Getpid() & 0xffff

	// Build the packet
	var wm icmp.Message
	if isIPv4(ip) {
		wm = icmp.Message{
			Type: ipv4.ICMPTypeEcho,
			Code: 0,
			Body: &icmp.Echo{
				ID:   pid,
				Seq:  int(seq),
				Data: []byte("poller_exporter"),
			},
		}
	} else if isIPv6(ip) {
		wm = icmp.Message{
			Type: ipv6.ICMPTypeEchoRequest,
			Code: 0,
			Body: &icmp.Echo{
				ID:   pid,
				Seq:  int(seq),
				Data: []byte("poller_exporter"),
			},
		}
	} else {
		log.Errorln("IP did not match any known types?")
		return
	}

	wb, err := wm.Marshal(nil)
	if err != nil {
		log.Errorf("Error marshalling packet for %s: %s", ip.String(), err)
		return
	}

	sendTime := time.Now()
	dst := &net.IPAddr{IP: ip}

	if _, err := socket.WriteTo(wb, dst); err != nil {
		log.Errorf("Error writing to socket for %s: %s", ip.String(), err)
		return
	}

	// Reply should be the same except for the message type.
	if isIPv4(ip) {
		wm.Type = ipv4.ICMPTypeEchoReply
	} else if isIPv6(ip) {
		wm.Type = ipv6.ICMPTypeEchoReply
	} else {
		log.Errorln("IP did not match any known types?")
		return
	}

	wb, err = wm.Marshal(nil)
	if err != nil {
		log.Errorf("Error marshalling packet for %s: %s", ip.String(), err)
		return
	}

	rb := make([]byte, 1500)
	if err := socket.SetReadDeadline(deadline); err != nil {
		log.Errorf("Error setting socket deadline for %s: %s", ip.String(), err)
		return
	}
	for {
		n, peer, err := socket.ReadFrom(rb)
		if err != nil {
			if nerr, ok := err.(net.Error); ok && nerr.Timeout() {
				log.Infof("Timeout reading from socket for %s: %s", ip.String(), err)
				return
			}
			log.Errorf("Error reading from socket for %s: %s", ip.String(), err)
			continue
		}
		if peer.String() != ip.String() {
			continue
		}
		if bytes.Compare(rb[:n], wb) == 0 {
			success = true
			latency = time.Now().Sub(sendTime)
			return
		}
	}
}
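A call-site sketch for Ping; the target is a documentation address and the one-second budget is an arbitrary example, and opening the ICMP listener typically requires raw-socket privileges:

	// Hypothetical check of a single host with a 1s round-trip budget.
	if up, rtt := Ping(net.ParseIP("192.0.2.1"), time.Second); up {
		log.Infof("Host is up, RTT %s", rtt)
	} else {
		log.Infoln("Host did not respond within the deadline")
	}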
func main() {
	rand.Seed(time.Now().Unix())
	flag.Parse()

	// This is only used when we're running in -dev mode with bindata
	rootDir, _ = osext.ExecutableFolder()
	rootDir = path.Join(rootDir, "web")

	// Parse configuration
	cfg, err := config.LoadFromFile(*configFile)
	if err != nil {
		log.Fatalln("Error loading config", err)
	}

	// Templates
	amberTmpl, err := Asset("templates/index.amber")
	if err != nil {
		log.Fatalln("Could not load index template:", err)
	}
	tmpl := amber.MustCompile(string(amberTmpl), amber.Options{})

	// Setup the web UI
	router := httprouter.New()
	router.Handler("GET", *metricsPath, prometheus.Handler()) // Prometheus

	// Static asset handling
	router.GET("/static/*filepath", func(w http.ResponseWriter, r *http.Request, ps httprouter.Params) {
		reqpath := ps.ByName("filepath")
		realpath := path.Join("static", reqpath)
		b, err := Asset(realpath)
		if err != nil {
			log.Debugln("Could not find asset: ", err)
			return
		}
		w.Write(b)
	})

	var monitoredHosts []*pollers.Host

	router.GET("/", func(w http.ResponseWriter, r *http.Request, _ httprouter.Params) {
		data := struct {
			Cfg   *config.Config
			Hosts *[]*pollers.Host
		}{
			Cfg:   cfg,
			Hosts: &monitoredHosts,
		}
		err := tmpl.Execute(w, &data)
		if err != nil {
			log.Errorln("Error rendering template", err)
		}
	})

	// Initialize the host pollers
	monitoredHosts = make([]*pollers.Host, len(cfg.Hosts))

	// We don't allow duplicate hosts, but also don't want to panic just due
	// to a typo, so keep track and skip duplicates here.
	seenHosts := make(map[string]bool)

	realidx := 0
	for _, hostCfg := range cfg.Hosts {
		log.Debugln("Setting up poller for: ", hostCfg.Hostname)
		if *skipPing {
			hostCfg.PingDisable = true
		}
		if _, ok := seenHosts[hostCfg.Hostname]; ok {
			log.Warnln("Discarding repeat configuration of same hostname", hostCfg.Hostname)
			continue
		}
		host := pollers.NewHost(hostCfg)
		monitoredHosts[realidx] = host
		prometheus.MustRegister(host)

		seenHosts[hostCfg.Hostname] = true
		realidx++
	}

	// Trim monitoredHosts to the number we actually used
	monitoredHosts = monitoredHosts[0:realidx]

	// This is the dispatcher. It is responsible for invoking the doPoll method
	// of hosts.
	connectionLimiter := pollers.NewLimiter(*maxConnections)
	hostQueue := make(chan *pollers.Host)

	// Start the host dispatcher
	go func() {
		for host := range hostQueue {
			go host.Poll(connectionLimiter, hostQueue)
		}
	}()

	// Do the initial host dispatch
	go func() {
		for _, host := range monitoredHosts {
			log.Debugln("Starting polling for hosts")
			hostQueue <- host
		}
	}()

	var handler http.Handler

	// If basic auth is requested, enable it for the interface.
	if cfg.BasicAuthUsername != "" && cfg.BasicAuthPassword != "" {
		basicauth := httpauth.SimpleBasicAuth(cfg.BasicAuthUsername, cfg.BasicAuthPassword)
		handler = basicauth(router)
	} else {
		handler = router
	}

	// If TLS certificates are specified, use TLS
	if cfg.TLSCertificatePath != "" && cfg.TLSKeyPath != "" {
		log.Infof("Listening on (TLS-enabled) %s", *listenAddress)
		err = http.ListenAndServeTLS(*listenAddress, cfg.TLSCertificatePath, cfg.TLSKeyPath, handler)
	} else {
		log.Infof("Listening on %s", *listenAddress)
		err = http.ListenAndServe(*listenAddress, handler)
	}
	if err != nil {
		log.Fatal(err)
	}
}
func main() {
	flag.Parse()

	if *showVersion {
		fmt.Fprintln(os.Stdout, version.Print("pushgateway"))
		os.Exit(0)
	}

	log.Infoln("Starting pushgateway", version.Info())
	log.Infoln("Build context", version.BuildContext())

	flags := map[string]string{}
	flag.VisitAll(func(f *flag.Flag) {
		flags[f.Name] = f.Value.String()
	})

	ms := storage.NewDiskMetricStore(*persistenceFile, *persistenceInterval)
	prometheus.SetMetricFamilyInjectionHook(ms.GetMetricFamilies)
	// Enable collect checks for debugging.
	// prometheus.EnableCollectChecks(true)

	r := httprouter.New()
	r.Handler("GET", *metricsPath, prometheus.Handler())

	// Handlers for pushing and deleting metrics.
	r.PUT("/metrics/job/:job/*labels", handler.Push(ms, true))
	r.POST("/metrics/job/:job/*labels", handler.Push(ms, false))
	r.DELETE("/metrics/job/:job/*labels", handler.Delete(ms))
	r.PUT("/metrics/job/:job", handler.Push(ms, true))
	r.POST("/metrics/job/:job", handler.Push(ms, false))
	r.DELETE("/metrics/job/:job", handler.Delete(ms))

	// Handlers for the deprecated API.
	r.PUT("/metrics/jobs/:job/instances/:instance", handler.LegacyPush(ms, true))
	r.POST("/metrics/jobs/:job/instances/:instance", handler.LegacyPush(ms, false))
	r.DELETE("/metrics/jobs/:job/instances/:instance", handler.LegacyDelete(ms))
	r.PUT("/metrics/jobs/:job", handler.LegacyPush(ms, true))
	r.POST("/metrics/jobs/:job", handler.LegacyPush(ms, false))
	r.DELETE("/metrics/jobs/:job", handler.LegacyDelete(ms))

	r.Handler("GET", "/static/*filepath", prometheus.InstrumentHandler(
		"static",
		http.FileServer(
			&assetfs.AssetFS{Asset: Asset, AssetDir: AssetDir, AssetInfo: AssetInfo},
		),
	))
	statusHandler := prometheus.InstrumentHandlerFunc("status", handler.Status(ms, Asset, flags))
	r.Handler("GET", "/status", statusHandler)
	r.Handler("GET", "/", statusHandler)

	// Re-enable pprof.
	r.GET("/debug/pprof/*pprof", handlePprof)

	log.Infof("Listening on %s.", *listenAddress)
	l, err := net.Listen("tcp", *listenAddress)
	if err != nil {
		log.Fatal(err)
	}
	go interruptHandler(l)
	err = (&http.Server{Addr: *listenAddress, Handler: r}).Serve(l)
	log.Errorln("HTTP server stopped:", err)
	// To give running connections a chance to submit their payload, we wait
	// for 1sec, but we don't want to wait long (e.g. until all connections
	// are done) to not delay the shutdown.
	time.Sleep(time.Second)
	if err := ms.Shutdown(); err != nil {
		log.Errorln("Problem shutting down metric storage:", err)
	}
}
func (b *Exporter) Listen(e <-chan Events) {
	for {
		events, ok := <-e
		if !ok {
			log.Debug("Channel is closed. Break out of Exporter.Listener.")
			return
		}
		for _, event := range events {
			metricName := ""
			prometheusLabels := event.Labels()

			labels, present := b.mapper.getMapping(event.MetricName())
			if present {
				metricName = labels["name"]
				for label, value := range labels {
					if label != "name" {
						prometheusLabels[label] = value
					}
				}
			} else {
				eventsUnmapped.Inc()
				metricName = escapeMetricName(event.MetricName())
			}

			switch event.(type) {
			case *CounterEvent:
				counter := b.Counters.Get(
					b.suffix(metricName, "counter"),
					prometheusLabels,
				)
				// We don't accept negative values for counters. Incrementing the counter with a negative number
				// will cause the exporter to panic. Instead we will warn and continue to the next event.
				if event.Value() < 0.0 {
					log.Errorf("Counter %q is: '%f' (counter must be non-negative value)", metricName, event.Value())
					continue
				}

				counter.Add(event.Value())
				eventStats.WithLabelValues("counter").Inc()

			case *GaugeEvent:
				gauge := b.Gauges.Get(
					b.suffix(metricName, "gauge"),
					prometheusLabels,
				)
				gauge.Set(event.Value())
				eventStats.WithLabelValues("gauge").Inc()

			case *TimerEvent:
				summary := b.Summaries.Get(
					b.suffix(metricName, "timer"),
					prometheusLabels,
				)
				summary.Observe(event.Value())
				eventStats.WithLabelValues("timer").Inc()

			default:
				log.Errorln("Unsupported event type")
				eventStats.WithLabelValues("illegal").Inc()
			}
		}
	}
}
func (e *Exporter) scrape(ch chan<- prometheus.Metric) {
	e.totalScrapes.Inc()
	var err error
	defer func(begun time.Time) {
		e.duration.Set(time.Since(begun).Seconds())
		if err == nil {
			e.error.Set(0)
		} else {
			e.error.Set(1)
		}
	}(time.Now())

	db, err := sql.Open("mysql", e.dsn)
	if err != nil {
		log.Errorln("Error opening connection to database:", err)
		return
	}
	defer db.Close()

	isUpRows, err := db.Query(upQuery)
	if err != nil {
		log.Errorln("Error pinging mysqld:", err)
		e.mysqldUp.Set(0)
		return
	}
	isUpRows.Close()
	e.mysqldUp.Set(1)

	if *slowLogFilter {
		sessionSettingsRows, err := db.Query(sessionSettingsQuery)
		if err != nil {
			log.Errorln("Error setting log_slow_filter:", err)
			return
		}
		sessionSettingsRows.Close()
	}

	if *collectGlobalStatus {
		if err = collector.ScrapeGlobalStatus(db, ch); err != nil {
			log.Errorln("Error scraping for collect.global_status:", err)
			e.scrapeErrors.WithLabelValues("collect.global_status").Inc()
		}
	}
	if *collectGlobalVariables {
		if err = collector.ScrapeGlobalVariables(db, ch); err != nil {
			log.Errorln("Error scraping for collect.global_variables:", err)
			e.scrapeErrors.WithLabelValues("collect.global_variables").Inc()
		}
	}
	if *collectSlaveStatus {
		if err = collector.ScrapeSlaveStatus(db, ch); err != nil {
			log.Errorln("Error scraping for collect.slave_status:", err)
			e.scrapeErrors.WithLabelValues("collect.slave_status").Inc()
		}
	}
	if *collectProcesslist {
		if err = collector.ScrapeProcesslist(db, ch); err != nil {
			log.Errorln("Error scraping for collect.info_schema.processlist:", err)
			e.scrapeErrors.WithLabelValues("collect.info_schema.processlist").Inc()
		}
	}
	if *collectTableSchema {
		if err = collector.ScrapeTableSchema(db, ch); err != nil {
			log.Errorln("Error scraping for collect.info_schema.tables:", err)
			e.scrapeErrors.WithLabelValues("collect.info_schema.tables").Inc()
		}
	}
	if *collectInnodbTablespaces {
		if err = collector.ScrapeInfoSchemaInnodbTablespaces(db, ch); err != nil {
			log.Errorln("Error scraping for collect.info_schema.innodb_sys_tablespaces:", err)
			e.scrapeErrors.WithLabelValues("collect.info_schema.innodb_sys_tablespaces").Inc()
		}
	}
	if *innodbMetrics {
		if err = collector.ScrapeInnodbMetrics(db, ch); err != nil {
			log.Errorln("Error scraping for collect.info_schema.innodb_metrics:", err)
			e.scrapeErrors.WithLabelValues("collect.info_schema.innodb_metrics").Inc()
		}
	}
	if *collectAutoIncrementColumns {
		if err = collector.ScrapeAutoIncrementColumns(db, ch); err != nil {
			log.Errorln("Error scraping for collect.auto_increment.columns:", err)
			e.scrapeErrors.WithLabelValues("collect.auto_increment.columns").Inc()
		}
	}
	if *collectBinlogSize {
		if err = collector.ScrapeBinlogSize(db, ch); err != nil {
			log.Errorln("Error scraping for collect.binlog_size:", err)
			e.scrapeErrors.WithLabelValues("collect.binlog_size").Inc()
		}
	}
	if *collectPerfTableIOWaits {
		if err = collector.ScrapePerfTableIOWaits(db, ch); err != nil {
			log.Errorln("Error scraping for collect.perf_schema.tableiowaits:", err)
			e.scrapeErrors.WithLabelValues("collect.perf_schema.tableiowaits").Inc()
		}
	}
	if *collectPerfIndexIOWaits {
		if err = collector.ScrapePerfIndexIOWaits(db, ch); err != nil {
			log.Errorln("Error scraping for collect.perf_schema.indexiowaits:", err)
			e.scrapeErrors.WithLabelValues("collect.perf_schema.indexiowaits").Inc()
		}
	}
	if *collectPerfTableLockWaits {
		if err = collector.ScrapePerfTableLockWaits(db, ch); err != nil {
			log.Errorln("Error scraping for collect.perf_schema.tablelocks:", err)
			e.scrapeErrors.WithLabelValues("collect.perf_schema.tablelocks").Inc()
		}
	}
	if *collectPerfEventsStatements {
		if err = collector.ScrapePerfEventsStatements(db, ch); err != nil {
			log.Errorln("Error scraping for collect.perf_schema.eventsstatements:", err)
			e.scrapeErrors.WithLabelValues("collect.perf_schema.eventsstatements").Inc()
		}
	}
	if *collectPerfEventsWaits {
		if err = collector.ScrapePerfEventsWaits(db, ch); err != nil {
			log.Errorln("Error scraping for collect.perf_schema.eventswaits:", err)
			e.scrapeErrors.WithLabelValues("collect.perf_schema.eventswaits").Inc()
		}
	}
	if *collectPerfFileEvents {
		if err = collector.ScrapePerfFileEvents(db, ch); err != nil {
			log.Errorln("Error scraping for collect.perf_schema.file_events:", err)
			e.scrapeErrors.WithLabelValues("collect.perf_schema.file_events").Inc()
		}
	}
	if *collectUserStat {
		if err = collector.ScrapeUserStat(db, ch); err != nil {
			log.Errorln("Error scraping for collect.info_schema.userstats:", err)
			e.scrapeErrors.WithLabelValues("collect.info_schema.userstats").Inc()
		}
	}
	if *collectClientStat {
		if err = collector.ScrapeClientStat(db, ch); err != nil {
			log.Errorln("Error scraping for collect.info_schema.clientstats:", err)
			e.scrapeErrors.WithLabelValues("collect.info_schema.clientstats").Inc()
		}
	}
	if *collectTableStat {
		if err = collector.ScrapeTableStat(db, ch); err != nil {
			log.Errorln("Error scraping for collect.info_schema.tablestats:", err)
			e.scrapeErrors.WithLabelValues("collect.info_schema.tablestats").Inc()
		}
	}
	if *collectQueryResponseTime {
		if err = collector.ScrapeQueryResponseTime(db, ch); err != nil {
			log.Errorln("Error scraping for collect.info_schema.query_response_time:", err)
			e.scrapeErrors.WithLabelValues("collect.info_schema.query_response_time").Inc()
		}
	}
	if *collectEngineTokudbStatus {
		if err = collector.ScrapeEngineTokudbStatus(db, ch); err != nil {
			log.Errorln("Error scraping for collect.engine_tokudb_status:", err)
			e.scrapeErrors.WithLabelValues("collect.engine_tokudb_status").Inc()
		}
	}
	if *collectEngineInnodbStatus {
		if err = collector.ScrapeEngineInnodbStatus(db, ch); err != nil {
			log.Errorln("Error scraping for collect.engine_innodb_status:", err)
			e.scrapeErrors.WithLabelValues("collect.engine_innodb_status").Inc()
		}
	}
}