Example #1
func watchConfig(fileName string, mapper *metricMapper) {
	watcher, err := fsnotify.NewWatcher()
	if err != nil {
		log.Fatal(err)
	}

	err = watcher.WatchFlags(fileName, fsnotify.FSN_MODIFY)
	if err != nil {
		log.Fatal(err)
	}

	for {
		select {
		case ev := <-watcher.Event:
			log.Infof("Config file changed (%s), attempting reload", ev)
			err = mapper.initFromFile(fileName)
			if err != nil {
				log.Errorln("Error reloading config:", err)
				configLoads.WithLabelValues("failure").Inc()
			} else {
				log.Infoln("Config reloaded successfully")
				configLoads.WithLabelValues("success").Inc()
			}
			// Re-add the file watcher since it can get lost on some changes. E.g.
			// saving a file with vim results in a RENAME-MODIFY-DELETE event
			// sequence, after which the newly written file is no longer watched.
			err = watcher.WatchFlags(fileName, fsnotify.FSN_MODIFY)
			if err != nil {
				log.Errorln("Error re-adding config watch:", err)
			}
		case err := <-watcher.Error:
			log.Errorln("Error watching config:", err)
		}
	}
}
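The WatchFlags/Event API used above comes from an old fsnotify release. As a rough sketch only, the same reload pattern against the current fsnotify API (watcher.Add plus the Events/Errors channels) might look like this; reload is a hypothetical stand-in for mapper.initFromFile:

package main

import (
	"log"

	"github.com/fsnotify/fsnotify"
)

// watchAndReload re-runs reload whenever fileName changes. Re-adding the
// watch after each event mirrors the workaround above: editors that
// rename-and-replace the file can otherwise drop the watch.
func watchAndReload(fileName string, reload func(string) error) error {
	watcher, err := fsnotify.NewWatcher()
	if err != nil {
		return err
	}
	defer watcher.Close()

	if err := watcher.Add(fileName); err != nil {
		return err
	}
	for {
		select {
		case ev := <-watcher.Events:
			log.Printf("config file changed (%s), attempting reload", ev)
			if err := reload(fileName); err != nil {
				log.Println("error reloading config:", err)
			}
			watcher.Add(fileName) // best effort; the file may have been replaced
		case err := <-watcher.Errors:
			log.Println("error watching config:", err)
		}
	}
}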
Example #2
func (s *memorySeriesStorage) loop() {
	checkpointTimer := time.NewTimer(s.checkpointInterval)

	dirtySeriesCount := 0

	defer func() {
		checkpointTimer.Stop()
		log.Info("Maintenance loop stopped.")
		close(s.loopStopped)
	}()

	memoryFingerprints := s.cycleThroughMemoryFingerprints()
	archivedFingerprints := s.cycleThroughArchivedFingerprints()

loop:
	for {
		select {
		case <-s.loopStopping:
			break loop
		case <-checkpointTimer.C:
			err := s.persistence.checkpointSeriesMapAndHeads(s.fpToSeries, s.fpLocker)
			if err != nil {
				log.Errorln("Error while checkpointing:", err)
			} else {
				dirtySeriesCount = 0
			}
			// If a checkpoint takes longer than checkpointInterval, an
			// unluckily timed combination with the Reset(0) call below can
			// leave a stale time lurking in C, leading to repeated
			// checkpointing without pause.
			select {
			case <-checkpointTimer.C: // Get rid of the lurking time.
			default:
			}
			checkpointTimer.Reset(s.checkpointInterval)
		case fp := <-memoryFingerprints:
			if s.maintainMemorySeries(fp, model.Now().Add(-s.dropAfter)) {
				dirtySeriesCount++
				// Check if we have enough "dirty" series so that we need an early checkpoint.
				// However, if we are already behind persisting chunks, creating a checkpoint
				// would be counterproductive, as it would slow down chunk persisting even more,
				// while in a situation like that, where we are clearly lacking speed of disk
				// maintenance, the best we can do for crash recovery is to persist chunks as
				// quickly as possible. So only checkpoint if the urgency score is < 1.
				if dirtySeriesCount >= s.checkpointDirtySeriesLimit &&
					s.calculatePersistenceUrgencyScore() < 1 {
					checkpointTimer.Reset(0)
				}
			}
		case fp := <-archivedFingerprints:
			s.maintainArchivedSeries(fp, model.Now().Add(-s.dropAfter))
		}
	}
	// Wait until both channels are closed.
	for range memoryFingerprints {
	}
	for range archivedFingerprints {
	}
}
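The inner select with a default case is the standard remedy for a time.Timer quirk: Reset does not clear a value already sitting in the channel, so a fired-but-unread timer must be drained first. A minimal standalone sketch of the idiom, assuming nothing beyond the standard library:

package main

import (
	"fmt"
	"time"
)

func main() {
	t := time.NewTimer(10 * time.Millisecond)
	time.Sleep(20 * time.Millisecond) // the timer has fired; a value is sitting in t.C

	// Drain any lurking value so the Reset below arms a clean timer.
	select {
	case <-t.C:
	default:
	}
	t.Reset(50 * time.Millisecond)

	fmt.Println(<-t.C) // fires once, roughly 50ms after the Reset
}

Example #3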
func (dms *DiskMetricStore) loop(persistenceInterval time.Duration) {
	lastPersist := time.Now()
	persistScheduled := false
	lastWrite := time.Time{}
	persistDone := make(chan time.Time)
	var persistTimer *time.Timer

	checkPersist := func() {
		if !persistScheduled && lastWrite.After(lastPersist) {
			persistTimer = time.AfterFunc(
				persistenceInterval-lastWrite.Sub(lastPersist),
				func() {
					persistStarted := time.Now()
					if err := dms.persist(); err != nil {
						log.Errorln("Error persisting metrics:", err)
					} else {
						log.Infof(
							"Metrics persisted to '%s'.",
							dms.persistenceFile,
						)
					}
					persistDone <- persistStarted
				},
			)
			persistScheduled = true
		}
	}

	for {
		select {
		case wr := <-dms.writeQueue:
			dms.processWriteRequest(wr)
			lastWrite = time.Now()
			checkPersist()
		case lastPersist = <-persistDone:
			persistScheduled = false
			checkPersist() // In case something has been written in the meantime.
		case <-dms.drain:
			// Prevent a scheduled persist from firing later.
			if persistTimer != nil {
				persistTimer.Stop()
			}
			// Now draining...
			for {
				select {
				case wr := <-dms.writeQueue:
					dms.processWriteRequest(wr)
				default:
					dms.done <- dms.persist()
					return
				}
			}
		}
	}
}
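checkPersist leans on time.AfterFunc, which runs its callback on a fresh goroutine after the delay; funneling completion back through persistDone keeps all state mutation (persistScheduled, lastPersist) on the loop goroutine. A minimal sketch of that hand-off pattern, standard library only:

package main

import (
	"fmt"
	"time"
)

func main() {
	done := make(chan time.Time)

	// The callback runs on its own goroutine; it reports back through the
	// channel instead of touching shared state directly.
	time.AfterFunc(10*time.Millisecond, func() {
		done <- time.Now()
	})

	started := <-done // the owning goroutine updates its state here
	fmt.Println("work finished; it started at", started)
}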
Example #4
// Start implements Storage.
func (s *memorySeriesStorage) Start() (err error) {
	var syncStrategy syncStrategy
	switch s.options.SyncStrategy {
	case Never:
		syncStrategy = func() bool { return false }
	case Always:
		syncStrategy = func() bool { return true }
	case Adaptive:
		syncStrategy = func() bool { return s.calculatePersistenceUrgencyScore() < 1 }
	default:
		panic("unknown sync strategy")
	}

	var p *persistence
	p, err = newPersistence(
		s.options.PersistenceStoragePath,
		s.options.Dirty, s.options.PedanticChecks,
		syncStrategy,
		s.options.MinShrinkRatio,
	)
	if err != nil {
		return err
	}
	s.persistence = p
	// Persistence must start running before loadSeriesMapAndHeads() is called.
	go s.persistence.run()

	defer func() {
		if err != nil {
			if e := p.close(); e != nil {
				log.Errorln("Error closing persistence:", e)
			}
		}
	}()

	log.Info("Loading series map and head chunks...")
	s.fpToSeries, s.numChunksToPersist, err = p.loadSeriesMapAndHeads()
	if err != nil {
		return err
	}
	log.Infof("%d series loaded.", s.fpToSeries.length())
	s.numSeries.Set(float64(s.fpToSeries.length()))

	s.mapper, err = newFPMapper(s.fpToSeries, p)
	if err != nil {
		return err
	}

	go s.handleEvictList()
	go s.handleQuarantine()
	go s.logThrottling()
	go s.loop()

	return nil
}
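Start uses a named return value so the deferred closure can inspect err and close the half-initialized persistence on any failure path. The same pattern in isolation; openResource and resource are hypothetical names, not part of the code above:

package main

import "log"

type resource struct{}

func (r *resource) Close() error { return nil }

func openResource() (*resource, error) { return &resource{}, nil }

func start() (err error) {
	r, err := openResource()
	if err != nil {
		return err
	}
	// If any later step fails, release the resource before returning.
	defer func() {
		if err != nil {
			if e := r.Close(); e != nil {
				log.Println("error closing resource:", e)
			}
		}
	}()

	// ... further initialization steps that may set err ...
	return nil
}

func main() { _ = start() }

Example #5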
// NewDiskMetricStore returns a DiskMetricStore ready to use. To cleanly shut it
// down and free resources, the Shutdown() method has to be called.  If
// persistenceFile is the empty string, no persisting to disk will
// happen. Otherwise, a file of that name is used for persisting metrics to
// disk. If the file already exists, metrics are read from it as part of the
// start-up. Persisting happens upon shutdown and after every write action,
// but the latter only happens persistenceInterval after the previous
// persisting.
func NewDiskMetricStore(
	persistenceFile string,
	persistenceInterval time.Duration,
) *DiskMetricStore {
	dms := &DiskMetricStore{
		writeQueue:      make(chan WriteRequest, writeQueueCapacity),
		drain:           make(chan struct{}),
		done:            make(chan error),
		metricGroups:    GroupingKeyToMetricGroup{},
		persistenceFile: persistenceFile,
	}
	if err := dms.restore(); err != nil {
		log.Errorln("Could not load persisted metrics:", err)
		log.Info("Retrying assuming legacy format for persisted metrics...")
		if err := dms.legacyRestore(); err != nil {
			log.Errorln("Could not load persisted metrics in legacy format: ", err)
		}
	}

	go dms.loop(persistenceInterval)
	return dms
}
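A hypothetical call site, following the doc comment (an empty persistenceFile disables persistence entirely):

	// Persist to ./metrics.store at most every five minutes and on shutdown.
	ms := NewDiskMetricStore("metrics.store", 5*time.Minute)
	defer func() {
		if err := ms.Shutdown(); err != nil {
			log.Errorln("Error shutting down metric store:", err)
		}
	}()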
Example #6
func (s *memorySeriesStorage) loop() {
	checkpointTimer := time.NewTimer(s.checkpointInterval)

	dirtySeriesCount := 0

	defer func() {
		checkpointTimer.Stop()
		log.Info("Maintenance loop stopped.")
		close(s.loopStopped)
	}()

	memoryFingerprints := s.cycleThroughMemoryFingerprints()
	archivedFingerprints := s.cycleThroughArchivedFingerprints()

loop:
	for {
		select {
		case <-s.loopStopping:
			break loop
		case <-checkpointTimer.C:
			err := s.persistence.checkpointSeriesMapAndHeads(s.fpToSeries, s.fpLocker)
			if err != nil {
				log.Errorln("Error while checkpointing:", err)
			} else {
				dirtySeriesCount = 0
			}
			checkpointTimer.Reset(s.checkpointInterval)
		case fp := <-memoryFingerprints:
			if s.maintainMemorySeries(fp, model.Now().Add(-s.dropAfter)) {
				dirtySeriesCount++
				// Check if we have enough "dirty" series so that we need an early checkpoint.
				// However, if we are already behind persisting chunks, creating a checkpoint
				// would be counterproductive, as it would slow down chunk persisting even more,
				// while in a situation like that, where we are clearly lacking speed of disk
				// maintenance, the best we can do for crash recovery is to persist chunks as
				// quickly as possible. So only checkpoint if the storage is not in "graceful
				// degradation mode".
				if dirtySeriesCount >= s.checkpointDirtySeriesLimit && !s.isDegraded() {
					checkpointTimer.Reset(0)
				}
			}
		case fp := <-archivedFingerprints:
			s.maintainArchivedSeries(fp, model.Now().Add(-s.dropAfter))
		}
	}
	// Wait until both channels are closed.
	for range memoryFingerprints {
	}
	for range archivedFingerprints {
	}
}
Example #7
func (d *Device) recvResponse() {
	buf := make([]byte, 4)
	for resp := range d.respChan {
		err := d.completeCommand(resp)
		if err != nil {
			log.Errorf("error completing command: %s", err)
			return
		}
		// Tell the fd there's something new.
		if _, err := unix.Write(d.uioFd, buf); err != nil {
			log.Errorln("poll write:", err)
			return
		}
	}
}
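Example #8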
// Check and update the exporter's query maps if the version has changed.
func (e *Exporter) checkMapVersions(ch chan<- prometheus.Metric, db *sql.DB) error {
	log.Debugln("Querying Postgres Version")
	versionRow := db.QueryRow("SELECT version();")
	var versionString string
	err := versionRow.Scan(&versionString)
	if err != nil {
		return fmt.Errorf("error scanning version string: %v", err)
	}
	semanticVersion, err := parseVersion(versionString)
	if err != nil {
		return fmt.Errorf("error parsing version string: %v", err)
	}

	// Check if semantic version changed and recalculate maps if needed.
	if semanticVersion.NE(e.lastMapVersion) || e.variableMap == nil || e.metricMap == nil {
		log.Infoln("Semantic Version Changed:", e.lastMapVersion.String(), "->", semanticVersion.String())
		e.mappingMtx.Lock()

		e.variableMap = makeDescMap(semanticVersion, variableMaps)
		e.metricMap = makeDescMap(semanticVersion, metricMaps)
		e.queryOverrides = makeQueryOverrideMap(semanticVersion, queryOverrides)
		e.lastMapVersion = semanticVersion

		if e.userQueriesPath != "" {
			if err := addQueries(e.userQueriesPath, semanticVersion, e.metricMap, e.queryOverrides); err != nil {
				log.Errorln("Failed to reload user queries:", e.userQueriesPath, err)
			}
		}

		e.mappingMtx.Unlock()
	}

	// Output the version as a special metric
	versionDesc := prometheus.NewDesc(fmt.Sprintf("%s_%s", namespace, staticLabelName),
		"Version string as reported by postgres", []string{"version", "short_version"}, nil)

	ch <- prometheus.MustNewConstMetric(versionDesc,
		prometheus.UntypedValue, 1, versionString, semanticVersion.String())
	return nil
}
Example #9
// Main manages the startup and shutdown lifecycle of the entire Prometheus server.
func Main() int {
	if err := parse(os.Args[1:]); err != nil {
		return 2
	}

	printVersion()
	if cfg.printVersion {
		return 0
	}

	var reloadables []Reloadable

	var (
		memStorage     = local.NewMemorySeriesStorage(&cfg.storage)
		remoteStorage  = remote.New(&cfg.remote)
		sampleAppender = storage.Fanout{memStorage}
	)
	if remoteStorage != nil {
		sampleAppender = append(sampleAppender, remoteStorage)
		reloadables = append(reloadables, remoteStorage)
	}

	var (
		notificationHandler = notification.New(&cfg.notification)
		targetManager       = retrieval.NewTargetManager(sampleAppender)
		queryEngine         = promql.NewEngine(memStorage, &cfg.queryEngine)
	)

	ruleManager := rules.NewManager(&rules.ManagerOptions{
		SampleAppender:      sampleAppender,
		NotificationHandler: notificationHandler,
		QueryEngine:         queryEngine,
		ExternalURL:         cfg.web.ExternalURL,
	})

	flags := map[string]string{}
	cfg.fs.VisitAll(func(f *flag.Flag) {
		flags[f.Name] = f.Value.String()
	})

	status := &web.PrometheusStatus{
		TargetPools: targetManager.Pools,
		Rules:       ruleManager.Rules,
		Flags:       flags,
		Birth:       time.Now(),
	}

	webHandler := web.New(memStorage, queryEngine, ruleManager, status, &cfg.web)

	reloadables = append(reloadables, status, targetManager, ruleManager, webHandler, notificationHandler)

	if !reloadConfig(cfg.configFile, reloadables...) {
		return 1
	}

	// Wait for reload or termination signals. Start the handler for SIGHUP as
	// early as possible, but ignore it until we are ready to handle reloading
	// our config.
	hup := make(chan os.Signal, 1) // Buffered: signal.Notify must not block sending.
	hupReady := make(chan bool)
	signal.Notify(hup, syscall.SIGHUP)
	go func() {
		<-hupReady
		for {
			select {
			case <-hup:
			case <-webHandler.Reload():
			}
			reloadConfig(cfg.configFile, reloadables...)
		}
	}()

	// Start all components.
	if err := memStorage.Start(); err != nil {
		log.Errorln("Error opening memory series storage:", err)
		return 1
	}
	defer func() {
		if err := memStorage.Stop(); err != nil {
			log.Errorln("Error stopping storage:", err)
		}
	}()

	if remoteStorage != nil {
		prometheus.MustRegister(remoteStorage)

		go remoteStorage.Run()
		defer remoteStorage.Stop()
	}
	// The storage has to be fully initialized before registering.
	prometheus.MustRegister(memStorage)
	prometheus.MustRegister(notificationHandler)
	prometheus.MustRegister(configSuccess)
	prometheus.MustRegister(configSuccessTime)

	defer ruleManager.Stop()

	go notificationHandler.Run()
	defer notificationHandler.Stop()

	go targetManager.Run()
	defer targetManager.Stop()

	defer queryEngine.Stop()

	go webHandler.Run()

	// Wait for reload or termination signals.
	close(hupReady) // Unblock SIGHUP handler.

	term := make(chan os.Signal, 1) // Buffered: signal.Notify must not block sending.
	signal.Notify(term, os.Interrupt, syscall.SIGTERM)
	select {
	case <-term:
		log.Warn("Received SIGTERM, exiting gracefully...")
	case <-webHandler.Quit():
		log.Warn("Received termination request via web service, exiting gracefully...")
	case err := <-webHandler.ListenError():
		log.Errorln("Error starting web server, exiting gracefully:", err)
	}

	log.Info("See you next time!")
	return 0
}
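The SIGHUP handler pattern above (register early, gate on a ready channel) keeps reload requests from racing ahead of initialization. A pared-down sketch, standard library only, using a buffered channel as signal.Notify expects:

package main

import (
	"log"
	"os"
	"os/signal"
	"syscall"
)

func main() {
	hup := make(chan os.Signal, 1) // buffered so signal delivery never blocks
	hupReady := make(chan bool)
	signal.Notify(hup, syscall.SIGHUP)

	go func() {
		<-hupReady // ignore SIGHUP until initialization is done
		for range hup {
			log.Println("reloading configuration")
		}
	}()

	// ... start components ...
	close(hupReady) // from here on, SIGHUP triggers reloads
	select {}       // block forever; a real server would wait for termination
}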
Example #10
// Main manages the startup and shutdown lifecycle of the entire Prometheus server.
func Main() int {
	if err := parse(os.Args[1:]); err != nil {
		log.Error(err)
		return 2
	}

	if cfg.printVersion {
		fmt.Fprintln(os.Stdout, version.Print("prometheus"))
		return 0
	}

	log.Infoln("Starting prometheus", version.Info())
	log.Infoln("Build context", version.BuildContext())

	var reloadables []Reloadable

	var (
		memStorage     = local.NewMemorySeriesStorage(&cfg.storage)
		remoteStorage  = remote.New(&cfg.remote)
		sampleAppender = storage.Fanout{memStorage}
	)
	if remoteStorage != nil {
		sampleAppender = append(sampleAppender, remoteStorage)
		reloadables = append(reloadables, remoteStorage)
	}

	var (
		notifier      = notifier.New(&cfg.notifier)
		targetManager = retrieval.NewTargetManager(sampleAppender)
		queryEngine   = promql.NewEngine(memStorage, &cfg.queryEngine)
	)

	ruleManager := rules.NewManager(&rules.ManagerOptions{
		SampleAppender: sampleAppender,
		Notifier:       notifier,
		QueryEngine:    queryEngine,
		ExternalURL:    cfg.web.ExternalURL,
	})

	flags := map[string]string{}
	cfg.fs.VisitAll(func(f *flag.Flag) {
		flags[f.Name] = f.Value.String()
	})

	version := &web.PrometheusVersion{
		Version:   version.Version,
		Revision:  version.Revision,
		Branch:    version.Branch,
		BuildUser: version.BuildUser,
		BuildDate: version.BuildDate,
		GoVersion: version.GoVersion,
	}

	webHandler := web.New(memStorage, queryEngine, targetManager, ruleManager, version, flags, &cfg.web)

	reloadables = append(reloadables, targetManager, ruleManager, webHandler, notifier)

	if !reloadConfig(cfg.configFile, reloadables...) {
		return 1
	}

	// Wait for reload or termination signals. Start the handler for SIGHUP as
	// early as possible, but ignore it until we are ready to handle reloading
	// our config.
	hup := make(chan os.Signal, 1) // Buffered: signal.Notify must not block sending.
	hupReady := make(chan bool)
	signal.Notify(hup, syscall.SIGHUP)
	go func() {
		<-hupReady
		for {
			select {
			case <-hup:
			case <-webHandler.Reload():
			}
			reloadConfig(cfg.configFile, reloadables...)
		}
	}()

	// Start all components. The order is NOT arbitrary.

	if err := memStorage.Start(); err != nil {
		log.Errorln("Error opening memory series storage:", err)
		return 1
	}
	defer func() {
		if err := memStorage.Stop(); err != nil {
			log.Errorln("Error stopping storage:", err)
		}
	}()

	if remoteStorage != nil {
		prometheus.MustRegister(remoteStorage)

		go remoteStorage.Run()
		defer remoteStorage.Stop()
	}
	// The storage has to be fully initialized before registering.
	prometheus.MustRegister(memStorage)
	prometheus.MustRegister(notifier)
	prometheus.MustRegister(configSuccess)
	prometheus.MustRegister(configSuccessTime)

	// The notifier is a dependency of the rule manager. It has to be
	// started before and torn down afterwards.
	go notifier.Run()
	defer notifier.Stop()

	go ruleManager.Run()
	defer ruleManager.Stop()

	go targetManager.Run()
	defer targetManager.Stop()

	// Shutting down the query engine before the rule manager will cause pending queries
	// to be canceled and ensures a quick shutdown of the rule manager.
	defer queryEngine.Stop()

	go webHandler.Run()

	// Wait for reload or termination signals.
	close(hupReady) // Unblock SIGHUP handler.

	term := make(chan os.Signal, 1) // Buffered: signal.Notify must not block sending.
	signal.Notify(term, os.Interrupt, syscall.SIGTERM)
	select {
	case <-term:
		log.Warn("Received SIGTERM, exiting gracefully...")
	case <-webHandler.Quit():
		log.Warn("Received termination request via web service, exiting gracefully...")
	case err := <-webHandler.ListenError():
		log.Errorln("Error starting web server, exiting gracefully:", err)
	}

	log.Info("See you next time!")
	return 0
}
Example #11
func (l *StatsDListener) handlePacket(packet []byte, e chan<- Events) {
	lines := strings.Split(string(packet), "\n")
	events := Events{}
	for _, line := range lines {
		if line == "" {
			continue
		}

		elements := strings.SplitN(line, ":", 2)
		if len(elements) < 2 || len(elements[0]) == 0 || !utf8.ValidString(line) {
			networkStats.WithLabelValues("malformed_line").Inc()
			log.Errorln("Bad line from StatsD:", line)
			continue
		}
		metric := elements[0]
		var samples []string
		if strings.Contains(elements[1], "|#") {
			// Using DogStatsD tag extensions; disable multi-metric packing.
			samples = elements[1:]
		} else {
			samples = strings.Split(elements[1], ":")
		}
	samples:
		for _, sample := range samples {
			components := strings.Split(sample, "|")
			samplingFactor := 1.0
			if len(components) < 2 || len(components) > 4 {
				networkStats.WithLabelValues("malformed_component").Inc()
				log.Errorln("Bad component on line:", line)
				continue
			}
			valueStr, statType := components[0], components[1]
			value, err := strconv.ParseFloat(valueStr, 64)
			if err != nil {
				log.Errorf("Bad value %s on line: %s", valueStr, line)
				networkStats.WithLabelValues("malformed_value").Inc()
				continue
			}

			labels := map[string]string{}
			if len(components) >= 3 {
				for _, component := range components[2:] {
					if len(component) == 0 {
						log.Errorln("Empty component on line: ", line)
						networkStats.WithLabelValues("malformed_component").Inc()
						continue samples
					}
				}

				for _, component := range components[2:] {
					switch component[0] {
					case '@':
						if statType != "c" {
							log.Errorln("Illegal sampling factor for non-counter metric on line", line)
							networkStats.WithLabelValues("illegal_sample_factor").Inc()
						}
						samplingFactor, err = strconv.ParseFloat(component[1:], 64)
						if err != nil {
							log.Errorf("Invalid sampling factor %s on line %s", component[1:], line)
							networkStats.WithLabelValues("invalid_sample_factor").Inc()
						}
						if samplingFactor == 0 {
							samplingFactor = 1
						}
						value /= samplingFactor
					case '#':
						labels = parseDogStatsDTagsToLabels(component)
					default:
						log.Errorf("Invalid sampling factor or tag section %s on line %s", components[2], line)
						networkStats.WithLabelValues("invalid_sample_factor").Inc()
						continue
					}
				}
			}

			event, err := buildEvent(statType, metric, value, labels)
			if err != nil {
				log.Errorf("Error building event on line %s: %s", line, err)
				networkStats.WithLabelValues("illegal_event").Inc()
				continue
			}
			events = append(events, event)
			networkStats.WithLabelValues("legal").Inc()
		}
	}
	e <- events
}
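For reference, each line parsed above has the shape <metric>:<value>|<type>[|@<rate>][|#<tags>]; several <value>|<type> samples may share one metric name, joined by ':', unless DogStatsD tags are in play. A tiny standalone illustration of the splitting steps:

package main

import (
	"fmt"
	"strings"
)

func main() {
	line := "api.latency:320|ms|@0.1"
	elements := strings.SplitN(line, ":", 2)      // ["api.latency", "320|ms|@0.1"]
	components := strings.Split(elements[1], "|") // ["320", "ms", "@0.1"]
	fmt.Println(elements[0], components)          // api.latency [320 ms @0.1]
}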
Example #12
// Main manages the startup and shutdown lifecycle of the entire Prometheus server.
func Main() int {
	if err := parse(os.Args[1:]); err != nil {
		log.Error(err)
		return 2
	}

	if cfg.printVersion {
		fmt.Fprintln(os.Stdout, version.Print("prometheus"))
		return 0
	}

	log.Infoln("Starting prometheus", version.Info())
	log.Infoln("Build context", version.BuildContext())

	var (
		sampleAppender = storage.Fanout{}
		reloadables    []Reloadable
	)

	var localStorage local.Storage
	switch cfg.localStorageEngine {
	case "persisted":
		localStorage = local.NewMemorySeriesStorage(&cfg.storage)
		sampleAppender = storage.Fanout{localStorage}
	case "none":
		localStorage = &local.NoopStorage{}
	default:
		log.Errorf("Invalid local storage engine %q", cfg.localStorageEngine)
		return 1
	}

	remoteStorage, err := remote.New(&cfg.remote)
	if err != nil {
		log.Errorf("Error initializing remote storage: %s", err)
		return 1
	}
	if remoteStorage != nil {
		sampleAppender = append(sampleAppender, remoteStorage)
		reloadables = append(reloadables, remoteStorage)
	}

	reloadableRemoteStorage := remote.NewConfigurable()
	sampleAppender = append(sampleAppender, reloadableRemoteStorage)
	reloadables = append(reloadables, reloadableRemoteStorage)

	var (
		notifier       = notifier.New(&cfg.notifier)
		targetManager  = retrieval.NewTargetManager(sampleAppender)
		queryEngine    = promql.NewEngine(localStorage, &cfg.queryEngine)
		ctx, cancelCtx = context.WithCancel(context.Background())
	)

	ruleManager := rules.NewManager(&rules.ManagerOptions{
		SampleAppender: sampleAppender,
		Notifier:       notifier,
		QueryEngine:    queryEngine,
		Context:        ctx,
		ExternalURL:    cfg.web.ExternalURL,
	})

	cfg.web.Context = ctx
	cfg.web.Storage = localStorage
	cfg.web.QueryEngine = queryEngine
	cfg.web.TargetManager = targetManager
	cfg.web.RuleManager = ruleManager

	cfg.web.Version = &web.PrometheusVersion{
		Version:   version.Version,
		Revision:  version.Revision,
		Branch:    version.Branch,
		BuildUser: version.BuildUser,
		BuildDate: version.BuildDate,
		GoVersion: version.GoVersion,
	}

	cfg.web.Flags = map[string]string{}
	cfg.fs.VisitAll(func(f *flag.Flag) {
		cfg.web.Flags[f.Name] = f.Value.String()
	})

	webHandler := web.New(&cfg.web)

	reloadables = append(reloadables, targetManager, ruleManager, webHandler, notifier)

	if err := reloadConfig(cfg.configFile, reloadables...); err != nil {
		log.Errorf("Error loading config: %s", err)
		return 1
	}

	// Wait for reload or termination signals. Start the handler for SIGHUP as
	// early as possible, but ignore it until we are ready to handle reloading
	// our config.
	hup := make(chan os.Signal, 1) // Buffered: signal.Notify must not block sending.
	hupReady := make(chan bool)
	signal.Notify(hup, syscall.SIGHUP)
	go func() {
		<-hupReady
		for {
			select {
			case <-hup:
				if err := reloadConfig(cfg.configFile, reloadables...); err != nil {
					log.Errorf("Error reloading config: %s", err)
				}
			case rc := <-webHandler.Reload():
				if err := reloadConfig(cfg.configFile, reloadables...); err != nil {
					log.Errorf("Error reloading config: %s", err)
					rc <- err
				} else {
					rc <- nil
				}
			}
		}
	}()

	// Start all components. The order is NOT arbitrary.

	if err := localStorage.Start(); err != nil {
		log.Errorln("Error opening memory series storage:", err)
		return 1
	}
	defer func() {
		if err := localStorage.Stop(); err != nil {
			log.Errorln("Error stopping storage:", err)
		}
	}()

	if remoteStorage != nil {
		remoteStorage.Start()
		defer remoteStorage.Stop()
	}

	defer reloadableRemoteStorage.Stop()

	// The storage has to be fully initialized before registering.
	if instrumentedStorage, ok := localStorage.(prometheus.Collector); ok {
		prometheus.MustRegister(instrumentedStorage)
	}
	prometheus.MustRegister(notifier)
	prometheus.MustRegister(configSuccess)
	prometheus.MustRegister(configSuccessTime)

	// The notifier is a dependency of the rule manager. It has to be
	// started before and torn down afterwards.
	go notifier.Run()
	defer notifier.Stop()

	go ruleManager.Run()
	defer ruleManager.Stop()

	go targetManager.Run()
	defer targetManager.Stop()

	// Shutting down the query engine before the rule manager will cause pending queries
	// to be canceled and ensures a quick shutdown of the rule manager.
	defer cancelCtx()

	go webHandler.Run()

	// Wait for reload or termination signals.
	close(hupReady) // Unblock SIGHUP handler.

	term := make(chan os.Signal, 1) // Buffered: signal.Notify must not block sending.
	signal.Notify(term, os.Interrupt, syscall.SIGTERM)
	select {
	case <-term:
		log.Warn("Received SIGTERM, exiting gracefully...")
	case <-webHandler.Quit():
		log.Warn("Received termination request via web service, exiting gracefully...")
	case err := <-webHandler.ListenError():
		log.Errorln("Error starting web server, exiting gracefully:", err)
	}

	log.Info("See you next time!")
	return 0
}
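Example #13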
// Turn the MetricMap column mapping into a prometheus descriptor mapping.
func makeDescMap(pgVersion semver.Version, metricMaps map[string]map[string]ColumnMapping) map[string]MetricMapNamespace {
	var metricMap = make(map[string]MetricMapNamespace)

	for namespace, mappings := range metricMaps {
		thisMap := make(map[string]MetricMap)

		// Get the constant labels
		var constLabels []string
		for columnName, columnMapping := range mappings {
			if columnMapping.usage == LABEL {
				constLabels = append(constLabels, columnName)
			}
		}

		for columnName, columnMapping := range mappings {
			columnMapping := columnMapping // Copy: the conversion closures below capture this variable, which is shared across iterations before Go 1.22.
			// Check column version compatibility for the current map.
			// Force to discard if not compatible.
			if columnMapping.supportedVersions != nil {
				if !columnMapping.supportedVersions(pgVersion) {
					// It's very useful to be able to see what columns are being
					// rejected.
					log.Debugln(columnName, "is being forced to discard due to version incompatibility.")
					thisMap[columnName] = MetricMap{
						discard: true,
						conversion: func(in interface{}) (float64, bool) {
							return math.NaN(), true
						},
					}
					continue
				}
			}

			// Determine how to convert the column based on its usage.
			switch columnMapping.usage {
			case DISCARD, LABEL:
				thisMap[columnName] = MetricMap{
					discard: true,
					conversion: func(in interface{}) (float64, bool) {
						return math.NaN(), true
					},
				}
			case COUNTER:
				thisMap[columnName] = MetricMap{
					vtype: prometheus.CounterValue,
					desc:  prometheus.NewDesc(fmt.Sprintf("%s_%s", namespace, columnName), columnMapping.description, constLabels, nil),
					conversion: func(in interface{}) (float64, bool) {
						return dbToFloat64(in)
					},
				}
			case GAUGE:
				thisMap[columnName] = MetricMap{
					vtype: prometheus.GaugeValue,
					desc:  prometheus.NewDesc(fmt.Sprintf("%s_%s", namespace, columnName), columnMapping.description, constLabels, nil),
					conversion: func(in interface{}) (float64, bool) {
						return dbToFloat64(in)
					},
				}
			case MAPPEDMETRIC:
				thisMap[columnName] = MetricMap{
					vtype: prometheus.GaugeValue,
					desc:  prometheus.NewDesc(fmt.Sprintf("%s_%s", namespace, columnName), columnMapping.description, constLabels, nil),
					conversion: func(in interface{}) (float64, bool) {
						text, ok := in.(string)
						if !ok {
							return math.NaN(), false
						}

						val, ok := columnMapping.mapping[text]
						if !ok {
							return math.NaN(), false
						}
						return val, true
					},
				}
			case DURATION:
				thisMap[columnName] = MetricMap{
					vtype: prometheus.GaugeValue,
					desc:  prometheus.NewDesc(fmt.Sprintf("%s_%s_milliseconds", namespace, columnName), columnMapping.description, constLabels, nil),
					conversion: func(in interface{}) (float64, bool) {
						var durationString string
						switch t := in.(type) {
						case []byte:
							durationString = string(t)
						case string:
							durationString = t
						default:
							log.Errorln("DURATION conversion metric was not a string")
							return math.NaN(), false
						}

						if durationString == "-1" {
							return math.NaN(), false
						}

						d, err := time.ParseDuration(durationString)
						if err != nil {
							log.Errorln("Failed converting result to metric:", columnName, in, err)
							return math.NaN(), false
						}
						return float64(d / time.Millisecond), true
					},
				}
			}
		}

		metricMap[namespace] = MetricMapNamespace{constLabels, thisMap}
	}

	return metricMap
}
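The shadow copy added inside the loop above matters because, before Go 1.22, a range variable is a single variable reused across iterations; closures created in the loop body all observe its final value. A minimal demonstration:

package main

import "fmt"

func main() {
	funcs := make([]func() int, 0, 3)
	for _, v := range []int{1, 2, 3} {
		v := v // without this line, pre-Go 1.22, every closure would print 3
		funcs = append(funcs, func() int { return v })
	}
	for _, f := range funcs {
		fmt.Println(f()) // 1, 2, 3
	}
}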
Example #14
// Sends a single ICMP echo to an IP and returns success and latency information.
// Borrowed from Brian Brazil's blackbox exporter
func Ping(ip net.IP, maxRTT time.Duration) (success bool, latency time.Duration) {
	deadline := time.Now().Add(maxRTT)

	var socket *icmp.PacketConn
	var err error
	if isIPv4(ip) {
		socket, err = icmp.ListenPacket("ip4:icmp", "0.0.0.0")
	} else if isIPv6(ip) {
		socket, err = icmp.ListenPacket("ip6:ipv6-icmp", "::")
	} else {
		log.Errorln("IP did not match any known types?")
		return
	}

	if err != nil {
		log.Errorf("Error listening to socket: %s", err)
		return
	}
	defer socket.Close()

	seq := getICMPSequence()
	pid := os.Getpid() & 0xffff

	// Build the packet
	var wm icmp.Message
	if isIPv4(ip) {
		wm = icmp.Message{
			Type: ipv4.ICMPTypeEcho, Code: 0,
			Body: &icmp.Echo{
				ID: pid, Seq: int(seq),
				Data: []byte("poller_exporter"),
			},
		}
	} else if isIPv6(ip) {
		wm = icmp.Message{
			Type: ipv6.ICMPTypeEchoRequest, Code: 0,
			Body: &icmp.Echo{
				ID: pid, Seq: int(seq),
				Data: []byte("poller_exporter"),
			},
		}
	} else {
		log.Errorln("IP did not match any known types?")
		return
	}

	wb, err := wm.Marshal(nil)
	if err != nil {
		log.Errorf("Error marshalling packet for %s: %s", ip.String(), err)
		return
	}

	sendTime := time.Now()

	dst := &net.IPAddr{IP: ip}

	if _, err := socket.WriteTo(wb, dst); err != nil {
		log.Errorf("Error writing to socket for %s: %s", ip.String(), err)
		return
	}

	// Reply should be the same except for the message type.
	if isIPv4(ip) {
		wm.Type = ipv4.ICMPTypeEchoReply
	} else if isIPv6(ip) {
		wm.Type = ipv6.ICMPTypeEchoReply
	} else {
		log.Errorln("IP did not match any known types?")
		return
	}

	wb, err = wm.Marshal(nil)
	if err != nil {
		log.Errorf("Error marshalling packet for %s: %s", ip.String(), err)
		return
	}

	rb := make([]byte, 1500)
	if err := socket.SetReadDeadline(deadline); err != nil {
		log.Errorf("Error setting socket deadline for %s: %s", ip.String(), err)
		return
	}
	for {
		n, peer, err := socket.ReadFrom(rb)
		if err != nil {
			if nerr, ok := err.(net.Error); ok && nerr.Timeout() {
				log.Infof("Timeout reading from socket for %s: %s", ip.String(), err)
				return
			}
			log.Errorf("Error reading from socket for %s: %s", ip.String(), err)
			continue
		}
		if peer.String() != ip.String() {
			continue
		}
		if bytes.Equal(rb[:n], wb) {
			success = true
			latency = time.Since(sendTime)
			return
		}
	}
}
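A hypothetical call site (raw ICMP sockets require elevated privileges on most systems):

	if ok, rtt := Ping(net.ParseIP("192.0.2.1"), 2*time.Second); ok {
		log.Infof("host is up, RTT %v", rtt)
	}

Example #15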
func main() {
	rand.Seed(time.Now().Unix())
	flag.Parse()

	// This is only used when we're running in -dev mode with bindata
	rootDir, _ = osext.ExecutableFolder()
	rootDir = path.Join(rootDir, "web")

	// Parse configuration
	cfg, err := config.LoadFromFile(*configFile)
	if err != nil {
		log.Fatalln("Error loading config", err)
	}

	// Templates
	amberTmpl, err := Asset("templates/index.amber")
	if err != nil {
		log.Fatalln("Could not load index template:", err)
	}
	tmpl := amber.MustCompile(string(amberTmpl), amber.Options{})

	// Setup the web UI
	router := httprouter.New()
	router.Handler("GET", *metricsPath, prometheus.Handler()) // Prometheus
	// Static asset handling
	router.GET("/static/*filepath", func(w http.ResponseWriter, r *http.Request, ps httprouter.Params) {
		reqpath := ps.ByName("filepath")
		realpath := path.Join("static", reqpath)
		b, err := Asset(realpath)
		if err != nil {
			log.Debugln("Could not find asset: ", err)
			return
		}
		w.Write(b)
	})

	var monitoredHosts []*pollers.Host

	router.GET("/", func(w http.ResponseWriter, r *http.Request, _ httprouter.Params) {
		data := struct {
			Cfg   *config.Config
			Hosts *[]*pollers.Host
		}{
			Cfg:   cfg,
			Hosts: &monitoredHosts,
		}
		err := tmpl.Execute(w, &data)
		if err != nil {
			log.Errorln("Error rendering template", err)
		}
	})

	// Initialize the host pollers
	monitoredHosts = make([]*pollers.Host, len(cfg.Hosts))

	// We don't allow duplicate hosts, but also don't want to panic just due
	// to a typo, so keep track and skip duplicates here.
	seenHosts := make(map[string]bool)

	realidx := 0
	for _, hostCfg := range cfg.Hosts {
		log.Debugln("Setting up poller for: ", hostCfg.Hostname)
		if *skipPing {
			hostCfg.PingDisable = true
		}
		if _, ok := seenHosts[hostCfg.Hostname]; ok {
			log.Warnln("Discarding repeat configuration of same hostname", hostCfg.Hostname)
			continue
		}
		host := pollers.NewHost(hostCfg)
		monitoredHosts[realidx] = host
		prometheus.MustRegister(host)

		seenHosts[hostCfg.Hostname] = true
		realidx++
	}

	// Trim monitoredHosts to the number we actually used
	monitoredHosts = monitoredHosts[0:realidx]

	// This is the dispatcher. It is responsible for invoking the doPoll method
	// of hosts.
	connectionLimiter := pollers.NewLimiter(*maxConnections)
	hostQueue := make(chan *pollers.Host)

	// Start the host dispatcher
	go func() {
		for host := range hostQueue {
			go host.Poll(connectionLimiter, hostQueue)
		}
	}()

	// Do the initial host dispatch
	go func() {
		log.Debugln("Starting polling for hosts")
		for _, host := range monitoredHosts {
			hostQueue <- host
		}
	}()

	var handler http.Handler

	// If basic auth is requested, enable it for the interface.
	if cfg.BasicAuthUsername != "" && cfg.BasicAuthPassword != "" {
		basicauth := httpauth.SimpleBasicAuth(cfg.BasicAuthUsername,
			cfg.BasicAuthPassword)
		handler = basicauth(router)
	} else {
		handler = router
	}

	// If TLS certificates are specified, use TLS
	if cfg.TLSCertificatePath != "" && cfg.TLSKeyPath != "" {
		log.Infof("Listening on (TLS-enabled) %s", *listenAddress)
		err = http.ListenAndServeTLS(*listenAddress,
			cfg.TLSCertificatePath, cfg.TLSKeyPath, handler)
	} else {
		log.Infof("Listening on %s", *listenAddress)
		err = http.ListenAndServe(*listenAddress, handler)
	}

	if err != nil {
		log.Fatal(err)
	}
}
Example #16
func main() {
	flag.Parse()

	if *showVersion {
		fmt.Fprintln(os.Stdout, version.Print("pushgateway"))
		os.Exit(0)
	}

	log.Infoln("Starting pushgateway", version.Info())
	log.Infoln("Build context", version.BuildContext())

	flags := map[string]string{}
	flag.VisitAll(func(f *flag.Flag) {
		flags[f.Name] = f.Value.String()
	})

	ms := storage.NewDiskMetricStore(*persistenceFile, *persistenceInterval)
	prometheus.SetMetricFamilyInjectionHook(ms.GetMetricFamilies)
	// Enable collect checks for debugging.
	// prometheus.EnableCollectChecks(true)

	r := httprouter.New()
	r.Handler("GET", *metricsPath, prometheus.Handler())

	// Handlers for pushing and deleting metrics.
	r.PUT("/metrics/job/:job/*labels", handler.Push(ms, true))
	r.POST("/metrics/job/:job/*labels", handler.Push(ms, false))
	r.DELETE("/metrics/job/:job/*labels", handler.Delete(ms))
	r.PUT("/metrics/job/:job", handler.Push(ms, true))
	r.POST("/metrics/job/:job", handler.Push(ms, false))
	r.DELETE("/metrics/job/:job", handler.Delete(ms))

	// Handlers for the deprecated API.
	r.PUT("/metrics/jobs/:job/instances/:instance", handler.LegacyPush(ms, true))
	r.POST("/metrics/jobs/:job/instances/:instance", handler.LegacyPush(ms, false))
	r.DELETE("/metrics/jobs/:job/instances/:instance", handler.LegacyDelete(ms))
	r.PUT("/metrics/jobs/:job", handler.LegacyPush(ms, true))
	r.POST("/metrics/jobs/:job", handler.LegacyPush(ms, false))
	r.DELETE("/metrics/jobs/:job", handler.LegacyDelete(ms))

	r.Handler("GET", "/static/*filepath", prometheus.InstrumentHandler(
		"static",
		http.FileServer(
			&assetfs.AssetFS{Asset: Asset, AssetDir: AssetDir, AssetInfo: AssetInfo},
		),
	))
	statusHandler := prometheus.InstrumentHandlerFunc("status", handler.Status(ms, Asset, flags))
	r.Handler("GET", "/status", statusHandler)
	r.Handler("GET", "/", statusHandler)

	// Re-enable pprof.
	r.GET("/debug/pprof/*pprof", handlePprof)

	log.Infof("Listening on %s.", *listenAddress)
	l, err := net.Listen("tcp", *listenAddress)
	if err != nil {
		log.Fatal(err)
	}
	go interruptHandler(l)
	err = (&http.Server{Addr: *listenAddress, Handler: r}).Serve(l)
	log.Errorln("HTTP server stopped:", err)
	// To give running connections a chance to submit their payload, we wait
	// for one second, but we don't want to wait longer (e.g. until all
	// connections are done), so as not to delay the shutdown.
	time.Sleep(time.Second)
	if err := ms.Shutdown(); err != nil {
		log.Errorln("Problem shutting down metric storage:", err)
	}
}
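The fixed one-second sleep is a blunt instrument; since Go 1.8 the same "give in-flight requests a chance" behavior can be expressed with http.Server.Shutdown. A sketch of the alternative, standard library only:

package main

import (
	"context"
	"log"
	"net/http"
	"time"
)

func main() {
	srv := &http.Server{Addr: ":9091"}
	go func() {
		if err := srv.ListenAndServe(); err != http.ErrServerClosed {
			log.Println("HTTP server stopped:", err)
		}
	}()

	// ... on receiving a shutdown signal:
	ctx, cancel := context.WithTimeout(context.Background(), time.Second)
	defer cancel()
	// Shutdown stops accepting new connections and waits, up to the context
	// deadline, for in-flight requests to finish.
	if err := srv.Shutdown(ctx); err != nil {
		log.Println("forced shutdown:", err)
	}
}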
Example #17
func (b *Exporter) Listen(e <-chan Events) {
	for {
		events, ok := <-e
		if !ok {
			log.Debug("Channel is closed. Break out of Exporter.Listener.")
			return
		}
		for _, event := range events {
			metricName := ""
			prometheusLabels := event.Labels()

			labels, present := b.mapper.getMapping(event.MetricName())
			if present {
				metricName = labels["name"]
				for label, value := range labels {
					if label != "name" {
						prometheusLabels[label] = value
					}
				}
			} else {
				eventsUnmapped.Inc()
				metricName = escapeMetricName(event.MetricName())
			}

			switch event.(type) {
			case *CounterEvent:
				counter := b.Counters.Get(
					b.suffix(metricName, "counter"),
					prometheusLabels,
				)
				// We don't accept negative values for counters. Incrementing the counter with a negative number
				// would cause the exporter to panic, so we log an error and continue to the next event.
				if event.Value() < 0.0 {
					log.Errorf("Counter %q is: '%f' (counter must be non-negative value)", metricName, event.Value())
					continue
				}

				counter.Add(event.Value())

				eventStats.WithLabelValues("counter").Inc()

			case *GaugeEvent:
				gauge := b.Gauges.Get(
					b.suffix(metricName, "gauge"),
					prometheusLabels,
				)
				gauge.Set(event.Value())

				eventStats.WithLabelValues("gauge").Inc()

			case *TimerEvent:
				summary := b.Summaries.Get(
					b.suffix(metricName, "timer"),
					prometheusLabels,
				)
				summary.Observe(event.Value())

				eventStats.WithLabelValues("timer").Inc()

			default:
				log.Errorln("Unsupported event type")
				eventStats.WithLabelValues("illegal").Inc()
			}
		}
	}
}
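Example #18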
func (e *Exporter) scrape(ch chan<- prometheus.Metric) {
	e.totalScrapes.Inc()
	var err error
	defer func(begun time.Time) {
		e.duration.Set(time.Since(begun).Seconds())
		if err == nil {
			e.error.Set(0)
		} else {
			e.error.Set(1)
		}
	}(time.Now())

	db, err := sql.Open("mysql", e.dsn)
	if err != nil {
		log.Errorln("Error opening connection to database:", err)
		return
	}
	defer db.Close()

	isUpRows, err := db.Query(upQuery)
	if err != nil {
		log.Errorln("Error pinging mysqld:", err)
		e.mysqldUp.Set(0)
		return
	}
	isUpRows.Close()

	e.mysqldUp.Set(1)

	if *slowLogFilter {
		sessionSettingsRows, err := db.Query(sessionSettingsQuery)
		if err != nil {
			log.Errorln("Error setting log_slow_filter:", err)
			return
		}
		sessionSettingsRows.Close()
	}

	if *collectGlobalStatus {
		if err = collector.ScrapeGlobalStatus(db, ch); err != nil {
			log.Errorln("Error scraping for collect.global_status:", err)
			e.scrapeErrors.WithLabelValues("collect.global_status").Inc()
		}
	}
	if *collectGlobalVariables {
		if err = collector.ScrapeGlobalVariables(db, ch); err != nil {
			log.Errorln("Error scraping for collect.global_variables:", err)
			e.scrapeErrors.WithLabelValues("collect.global_variables").Inc()
		}
	}
	if *collectSlaveStatus {
		if err = collector.ScrapeSlaveStatus(db, ch); err != nil {
			log.Errorln("Error scraping for collect.slave_status:", err)
			e.scrapeErrors.WithLabelValues("collect.slave_status").Inc()
		}
	}
	if *collectProcesslist {
		if err = collector.ScrapeProcesslist(db, ch); err != nil {
			log.Errorln("Error scraping for collect.info_schema.processlist:", err)
			e.scrapeErrors.WithLabelValues("collect.info_schema.processlist").Inc()
		}
	}
	if *collectTableSchema {
		if err = collector.ScrapeTableSchema(db, ch); err != nil {
			log.Errorln("Error scraping for collect.info_schema.tables:", err)
			e.scrapeErrors.WithLabelValues("collect.info_schema.tables").Inc()
		}
	}
	if *collectInnodbTablespaces {
		if err = collector.ScrapeInfoSchemaInnodbTablespaces(db, ch); err != nil {
			log.Errorln("Error scraping for collect.info_schema.innodb_sys_tablespaces:", err)
			e.scrapeErrors.WithLabelValues("collect.info_schema.innodb_sys_tablespaces").Inc()
		}
	}
	if *innodbMetrics {
		if err = collector.ScrapeInnodbMetrics(db, ch); err != nil {
			log.Errorln("Error scraping for collect.info_schema.innodb_metrics:", err)
			e.scrapeErrors.WithLabelValues("collect.info_schema.innodb_metrics").Inc()
		}
	}
	if *collectAutoIncrementColumns {
		if err = collector.ScrapeAutoIncrementColumns(db, ch); err != nil {
			log.Errorln("Error scraping for collect.auto_increment.columns:", err)
			e.scrapeErrors.WithLabelValues("collect.auto_increment.columns").Inc()
		}
	}
	if *collectBinlogSize {
		if err = collector.ScrapeBinlogSize(db, ch); err != nil {
			log.Errorln("Error scraping for collect.binlog_size:", err)
			e.scrapeErrors.WithLabelValues("collect.binlog_size").Inc()
		}
	}
	if *collectPerfTableIOWaits {
		if err = collector.ScrapePerfTableIOWaits(db, ch); err != nil {
			log.Errorln("Error scraping for collect.perf_schema.tableiowaits:", err)
			e.scrapeErrors.WithLabelValues("collect.perf_schema.tableiowaits").Inc()
		}
	}
	if *collectPerfIndexIOWaits {
		if err = collector.ScrapePerfIndexIOWaits(db, ch); err != nil {
			log.Errorln("Error scraping for collect.perf_schema.indexiowaits:", err)
			e.scrapeErrors.WithLabelValues("collect.perf_schema.indexiowaits").Inc()
		}
	}
	if *collectPerfTableLockWaits {
		if err = collector.ScrapePerfTableLockWaits(db, ch); err != nil {
			log.Errorln("Error scraping for collect.perf_schema.tablelocks:", err)
			e.scrapeErrors.WithLabelValues("collect.perf_schema.tablelocks").Inc()
		}
	}
	if *collectPerfEventsStatements {
		if err = collector.ScrapePerfEventsStatements(db, ch); err != nil {
			log.Errorln("Error scraping for collect.perf_schema.eventsstatements:", err)
			e.scrapeErrors.WithLabelValues("collect.perf_schema.eventsstatements").Inc()
		}
	}
	if *collectPerfEventsWaits {
		if err = collector.ScrapePerfEventsWaits(db, ch); err != nil {
			log.Errorln("Error scraping for collect.perf_schema.eventswaits:", err)
			e.scrapeErrors.WithLabelValues("collect.perf_schema.eventswaits").Inc()
		}
	}
	if *collectPerfFileEvents {
		if err = collector.ScrapePerfFileEvents(db, ch); err != nil {
			log.Errorln("Error scraping for collect.perf_schema.file_events:", err)
			e.scrapeErrors.WithLabelValues("collect.perf_schema.file_events").Inc()
		}
	}
	if *collectUserStat {
		if err = collector.ScrapeUserStat(db, ch); err != nil {
			log.Errorln("Error scraping for collect.info_schema.userstats:", err)
			e.scrapeErrors.WithLabelValues("collect.info_schema.userstats").Inc()
		}
	}
	if *collectClientStat {
		if err = collector.ScrapeClientStat(db, ch); err != nil {
			log.Errorln("Error scraping for collect.info_schema.clientstats:", err)
			e.scrapeErrors.WithLabelValues("collect.info_schema.clientstats").Inc()
		}
	}
	if *collectTableStat {
		if err = collector.ScrapeTableStat(db, ch); err != nil {
			log.Errorln("Error scraping for collect.info_schema.tablestats:", err)
			e.scrapeErrors.WithLabelValues("collect.info_schema.tablestats").Inc()
		}
	}
	if *collectQueryResponseTime {
		if err = collector.ScrapeQueryResponseTime(db, ch); err != nil {
			log.Errorln("Error scraping for collect.info_schema.query_response_time:", err)
			e.scrapeErrors.WithLabelValues("collect.info_schema.query_response_time").Inc()
		}
	}
	if *collectEngineTokudbStatus {
		if err = collector.ScrapeEngineTokudbStatus(db, ch); err != nil {
			log.Errorln("Error scraping for collect.engine_tokudb_status:", err)
			e.scrapeErrors.WithLabelValues("collect.engine_tokudb_status").Inc()
		}
	}
	if *collectEngineInnodbStatus {
		if err = collector.ScrapeEngineInnodbStatus(db, ch); err != nil {
			log.Errorln("Error scraping for collect.engine_innodb_status:", err)
			e.scrapeErrors.WithLabelValues("collect.engine_innodb_status").Inc()
		}
	}
}
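The long run of near-identical if-blocks above invites a table-driven refactor. A hedged sketch only — scraperEntry and runScrapers are invented names, and the entries would reuse the flags, collector functions, and scrapeErrors counter from the code above:

// scraperFunc mirrors the collector.Scrape* signature used above.
type scraperFunc func(db *sql.DB, ch chan<- prometheus.Metric) error

type scraperEntry struct {
	enabled *bool  // the command-line flag gating this collector
	name    string // label value, e.g. "collect.global_status"
	scrape  scraperFunc
}

func (e *Exporter) runScrapers(db *sql.DB, ch chan<- prometheus.Metric, scrapers []scraperEntry) {
	for _, s := range scrapers {
		if !*s.enabled {
			continue
		}
		if err := s.scrape(db, ch); err != nil {
			log.Errorln("Error scraping for "+s.name+":", err)
			e.scrapeErrors.WithLabelValues(s.name).Inc()
		}
	}
}

An example table row would be {collectGlobalStatus, "collect.global_status", collector.ScrapeGlobalStatus}, with one row per collector.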