// stop background processing of the target manager. If removeTargets is true,
// existing targets will be stopped and removed.
func (tm *TargetManager) stop(removeTargets bool) {
	log.Info("Stopping target manager...")
	defer log.Info("Target manager stopped.")

	tm.m.Lock()
	provs := []TargetProvider{}
	for _, ps := range tm.providers {
		provs = append(provs, ps...)
	}
	tm.m.Unlock()

	var wg sync.WaitGroup
	wg.Add(len(provs))
	for _, prov := range provs {
		go func(p TargetProvider) {
			p.Stop()
			wg.Done()
		}(prov)
	}
	wg.Wait()

	tm.m.Lock()
	defer tm.m.Unlock()

	if removeTargets {
		tm.removeTargets(nil)
	}
	tm.running = false
}
func (p *persistence) rebuildLabelIndexes(
	fpToSeries map[clientmodel.Fingerprint]*memorySeries,
) error {
	count := 0
	log.Info("Rebuilding label indexes.")
	log.Info("Indexing metrics in memory.")
	for fp, s := range fpToSeries {
		p.indexMetric(fp, s.metric)
		count++
		if count%10000 == 0 {
			log.Infof("%d metrics queued for indexing.", count)
		}
	}
	log.Info("Indexing archived metrics.")
	var fp codable.Fingerprint
	var m codable.Metric
	if err := p.archivedFingerprintToMetrics.ForEach(func(kv index.KeyValueAccessor) error {
		if err := kv.Key(&fp); err != nil {
			return err
		}
		if err := kv.Value(&m); err != nil {
			return err
		}
		p.indexMetric(clientmodel.Fingerprint(fp), clientmodel.Metric(m))
		count++
		if count%10000 == 0 {
			log.Infof("%d metrics queued for indexing.", count)
		}
		return nil
	}); err != nil {
		return err
	}
	log.Info("All requests for rebuilding the label indexes queued. (Actual processing may lag behind.)")
	return nil
}
// stop background processing of the target manager. If removeTargets is true,
// existing targets will be stopped and removed.
func (tm *TargetManager) stop(removeTargets bool) {
	log.Info("Stopping target manager...")
	defer log.Info("Target manager stopped.")

	close(tm.done)

	tm.mtx.Lock()
	defer tm.mtx.Unlock()

	if removeTargets {
		tm.removeTargets(nil)
	}
	tm.running = false
}
func (w *fileWatcher) Watch(cb ReloadCallback) {
	watcher, err := fsnotify.NewWatcher()
	if err != nil {
		log.Fatal(err)
	}

	err = watcher.WatchFlags(w.fileName, fsnotify.FSN_MODIFY)
	if err != nil {
		log.Fatal(err)
	}

	for {
		select {
		case ev := <-watcher.Event:
			log.Infof("Config file changed (%s), attempting reload", ev)
			conf, err := LoadFromFile(w.fileName)
			if err != nil {
				log.Error("Error loading new config: ", err)
				failedConfigReloads.Inc()
			} else {
				cb(&conf)
				log.Info("Config reloaded successfully")
				configReloads.Inc()
			}
			// Re-add the file watcher since it can get lost on some changes. E.g.
			// saving a file with vim results in a RENAME-MODIFY-DELETE event
			// sequence, after which the newly written file is no longer watched.
			err = watcher.WatchFlags(w.fileName, fsnotify.FSN_MODIFY)
			if err != nil {
				log.Error("Error re-watching config file: ", err)
			}
		case err := <-watcher.Error:
			log.Error("Error watching config: ", err)
		}
	}
}
func newMesosExporter(opts *exporterOpts) *periodicExporter {
	e := &periodicExporter{
		errors: prometheus.NewCounterVec(
			prometheus.CounterOpts{
				Namespace: "mesos_exporter",
				Name:      "slave_scrape_errors_total",
				Help:      "Current total scrape errors",
			},
			[]string{"slave"},
		),
		opts: opts,
	}
	e.slaves.urls = []string{e.opts.localURL}

	if e.opts.autoDiscover {
		log.Info("auto discovery enabled from command line flag.")

		// Update nr. of mesos slaves every 10 minutes
		e.updateSlaves()
		go runEvery(e.updateSlaves, 10*time.Minute)
	}

	// Fetch slave metrics every interval
	go runEvery(e.scrapeSlaves, e.opts.interval)

	return e
}
// GetJson fetches JSON from the given URL using basic auth and decodes it into target.
func GetJson(url string, accessKey string, secretKey string, target interface{}) error {
	start := time.Now()

	// Counter for internal exporter metrics
	measure.FunctionCountTotal.With(prometheus.Labels{"pkg": "utils", "fnc": "GetJson"}).Inc()

	log.Info("Scraping: ", url)
	client := &http.Client{}
	req, err := http.NewRequest("GET", url, nil)
	if err != nil {
		log.Error("Error creating request: ", err)
		return err
	}
	req.SetBasicAuth(accessKey, secretKey)
	resp, err := client.Do(req)
	if err != nil {
		log.Error("Error Collecting JSON from API: ", err)
		return err
	}
	defer resp.Body.Close()

	// Timings recorded as part of internal metrics
	elapsed := float64((time.Since(start)) / time.Microsecond)
	measure.FunctionDurations.WithLabelValues("hosts", "getJSON").Observe(elapsed)

	return json.NewDecoder(resp.Body).Decode(target)
}
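// A minimal usage sketch for GetJson, assuming a caller-defined HostData struct
// that mirrors only the fields the hosts exporter below actually reads
// (hostname and state). The struct and fetchHosts are illustrative, not part
// of the surrounding code base; the /hosts/ endpoint path follows the exporter
// code further down.
type HostData struct {
	Data []struct {
		Hostname string `json:"hostname"`
		State    string `json:"state"`
	} `json:"data"`
}

func fetchHosts(rancherURL, accessKey, secretKey string) (*HostData, error) {
	var hosts HostData
	// GetJson does not modify the URL; the caller builds the full endpoint.
	if err := utils.GetJson(rancherURL+"/hosts/", accessKey, secretKey, &hosts); err != nil {
		return nil, err
	}
	return &hosts, nil
}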
// Run starts background processing to handle target updates.
func (tm *TargetManager) Run() {
	log.Info("Starting target manager...")

	sources := map[string]struct{}{}

	for scfg, provs := range tm.providers {
		for _, prov := range provs {
			ch := make(chan *config.TargetGroup)
			go tm.handleTargetUpdates(scfg, ch)

			for _, src := range prov.Sources() {
				src = fullSource(scfg, src)
				sources[src] = struct{}{}
			}

			// Run the target provider after cleanup of the stale targets is done.
			defer func(p TargetProvider, c chan *config.TargetGroup) {
				go p.Run(c)
			}(prov, ch)
		}
	}

	tm.m.Lock()
	defer tm.m.Unlock()

	tm.removeTargets(func(src string) bool {
		if _, ok := sources[src]; ok {
			return false
		}
		return true
	})

	tm.running = true
}
// Run starts background processing to handle target updates.
func (tm *TargetManager) Run() {
	log.Info("Starting target manager...")

	tm.done = make(chan struct{})

	sources := map[string]struct{}{}
	updates := []<-chan targetGroupUpdate{}

	for scfg, provs := range tm.providers {
		for _, prov := range provs {
			// Get an initial set of available sources so we don't remove
			// target groups from the last run that are still available.
			for _, src := range prov.Sources() {
				sources[src] = struct{}{}
			}

			tgc := make(chan *config.TargetGroup)
			// Run the target provider after cleanup of the stale targets is done.
			defer func(prov TargetProvider, tgc chan *config.TargetGroup) {
				go prov.Run(tgc, tm.done)
			}(prov, tgc)

			tgupc := make(chan targetGroupUpdate)
			updates = append(updates, tgupc)

			go func(scfg *config.ScrapeConfig) {
				defer close(tgupc)
				for {
					select {
					case tg := <-tgc:
						if tg == nil {
							break
						}
						tgupc <- targetGroupUpdate{tg: tg, scfg: scfg}
					case <-tm.done:
						return
					}
				}
			}(scfg)
		}
	}

	// Merge all channels of incoming target group updates into a single
	// one and keep applying the updates.
	go tm.handleUpdates(merge(tm.done, updates...), tm.done)

	tm.mtx.Lock()
	defer tm.mtx.Unlock()

	// Remove old target groups that are no longer in the set of sources.
	tm.removeTargets(func(src string) bool {
		if _, ok := sources[src]; ok {
			return false
		}
		return true
	})

	tm.running = true
}
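// The Run method above relies on a merge fan-in helper. A minimal sketch of
// such a helper, assuming it only needs to forward every targetGroupUpdate
// from all input channels onto one output channel and stop when done is
// closed (assumes the sync package; the actual helper in the surrounding code
// base may differ in detail).
func merge(done <-chan struct{}, cs ...<-chan targetGroupUpdate) <-chan targetGroupUpdate {
	out := make(chan targetGroupUpdate)

	var wg sync.WaitGroup
	wg.Add(len(cs))
	for _, c := range cs {
		go func(c <-chan targetGroupUpdate) {
			defer wg.Done()
			for u := range c {
				select {
				case out <- u:
				case <-done:
					return
				}
			}
		}(c)
	}
	// Close the output once all producers have finished.
	go func() {
		wg.Wait()
		close(out)
	}()
	return out
}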
func startPolling() {
	t := time.NewTicker(time.Duration(*pollRate) * time.Minute)
	log.Info("Starting polling of tanks.")
	for {
		readTanks()
		<-t.C
	}
}
// checkpointFPMappings persists the fingerprint mappings. This method is not
// goroutine-safe.
//
// Description of the file format, v1:
//
// (1) Magic string (const mappingsMagicString).
//
// (2) Uvarint-encoded format version (const mappingsFormatVersion).
//
// (3) Uvarint-encoded number of mappings in fpMappings.
//
// (4) Repeated once per mapping:
//
// (4.1) The raw fingerprint as big-endian uint64.
//
// (4.2) The uvarint-encoded number of sub-mappings for the raw fingerprint.
//
// (4.3) Repeated once per sub-mapping:
//
// (4.3.1) The uvarint-encoded length of the unique metric string.
// (4.3.2) The unique metric string.
// (4.3.3) The mapped fingerprint as big-endian uint64.
func (p *persistence) checkpointFPMappings(fpm fpMappings) (err error) {
	log.Info("Checkpointing fingerprint mappings...")
	begin := time.Now()
	f, err := os.OpenFile(p.mappingsTempFileName(), os.O_WRONLY|os.O_TRUNC|os.O_CREATE, 0640)
	if err != nil {
		return
	}

	defer func() {
		f.Sync()
		closeErr := f.Close()
		if err != nil {
			return
		}
		err = closeErr
		if err != nil {
			return
		}
		err = os.Rename(p.mappingsTempFileName(), p.mappingsFileName())
		duration := time.Since(begin)
		log.Infof("Done checkpointing fingerprint mappings in %v.", duration)
	}()

	w := bufio.NewWriterSize(f, fileBufSize)

	if _, err = w.WriteString(mappingsMagicString); err != nil {
		return
	}
	if _, err = codable.EncodeUvarint(w, mappingsFormatVersion); err != nil {
		return
	}
	if _, err = codable.EncodeUvarint(w, uint64(len(fpm))); err != nil {
		return
	}

	for fp, mappings := range fpm {
		if err = codable.EncodeUint64(w, uint64(fp)); err != nil {
			return
		}
		if _, err = codable.EncodeUvarint(w, uint64(len(mappings))); err != nil {
			return
		}
		for ms, mappedFP := range mappings {
			if _, err = codable.EncodeUvarint(w, uint64(len(ms))); err != nil {
				return
			}
			if _, err = w.WriteString(ms); err != nil {
				return
			}
			if err = codable.EncodeUint64(w, uint64(mappedFP)); err != nil {
				return
			}
		}
	}
	err = w.Flush()
	return
}
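// Illustrative reader for the v1 mappings format documented above, built only
// from that description and the standard library (assumes bufio,
// encoding/binary, fmt, io, os). It is a sketch, not the persistence layer's
// own loader: readFPMappings and its plain map return type are hypothetical,
// and it reuses the mappingsMagicString constant from the snippet above rather
// than guessing its value.
func readFPMappings(path string) (map[uint64]map[string]uint64, error) {
	f, err := os.Open(path)
	if err != nil {
		return nil, err
	}
	defer f.Close()
	r := bufio.NewReader(f)

	// (1) Magic string.
	magic := make([]byte, len(mappingsMagicString))
	if _, err := io.ReadFull(r, magic); err != nil {
		return nil, err
	}
	if string(magic) != mappingsMagicString {
		return nil, fmt.Errorf("unexpected magic string %q", magic)
	}
	// (2) Format version (ignored in this sketch).
	if _, err := binary.ReadUvarint(r); err != nil {
		return nil, err
	}
	// (3) Number of mappings.
	numMappings, err := binary.ReadUvarint(r)
	if err != nil {
		return nil, err
	}

	result := make(map[uint64]map[string]uint64, numMappings)
	buf := make([]byte, 8)
	for i := uint64(0); i < numMappings; i++ {
		// (4.1) Raw fingerprint as big-endian uint64.
		if _, err := io.ReadFull(r, buf); err != nil {
			return nil, err
		}
		rawFP := binary.BigEndian.Uint64(buf)
		// (4.2) Number of sub-mappings.
		numSub, err := binary.ReadUvarint(r)
		if err != nil {
			return nil, err
		}
		subs := make(map[string]uint64, numSub)
		for j := uint64(0); j < numSub; j++ {
			// (4.3.1/4.3.2) Length-prefixed unique metric string.
			strLen, err := binary.ReadUvarint(r)
			if err != nil {
				return nil, err
			}
			ms := make([]byte, strLen)
			if _, err := io.ReadFull(r, ms); err != nil {
				return nil, err
			}
			// (4.3.3) Mapped fingerprint as big-endian uint64.
			if _, err := io.ReadFull(r, buf); err != nil {
				return nil, err
			}
			subs[string(ms)] = binary.BigEndian.Uint64(buf)
		}
		result[rawFP] = subs
	}
	return result, nil
}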
// Stop stops sending samples to the remote storage and waits for pending
// sends to complete.
func (t *StorageQueueManager) Stop() {
	log.Infof("Stopping remote storage...")
	close(t.queue)
	<-t.drained
	for i := 0; i < maxConcurrentSends; i++ {
		t.sendSemaphore <- true
	}
	log.Info("Remote storage stopped.")
}
// StacksURLCheck - Checks the API version for Rancher to determine the correct URL
func StacksURLCheck(rancherURL string) string {
	var stacksEndpoint string

	if strings.Contains(rancherURL, "v1") {
		log.Info("Version 1 API detected, using legacy API fields")
		stacksEndpoint = (rancherURL + "/environments/")
	} else if strings.Contains(rancherURL, "v2") {
		log.Info("Version 2 API detected, using updated API fields")
		stacksEndpoint = (rancherURL + "/stacks/")
	} else {
		log.Info("No known API version detected, defaulting to /stacks/")
		stacksEndpoint = (rancherURL + "/stacks/")
	}

	return stacksEndpoint
}
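// A quick table-driven test sketch covering the three branches above (assumes
// the testing package). The example URLs are made up for illustration and are
// not taken from any real deployment.
func TestStacksURLCheck(t *testing.T) {
	cases := []struct {
		url  string // hypothetical Rancher API base URLs
		want string
	}{
		{"http://rancher.local:8080/v1", "http://rancher.local:8080/v1/environments/"},
		{"http://rancher.local:8080/v2-beta", "http://rancher.local:8080/v2-beta/stacks/"},
		{"http://rancher.local:8080/latest", "http://rancher.local:8080/latest/stacks/"},
	}
	for _, c := range cases {
		if got := StacksURLCheck(c.url); got != c.want {
			t.Errorf("StacksURLCheck(%q) = %q, want %q", c.url, got, c.want)
		}
	}
}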
func newMesosExporter(opts *exporterOpts) *periodicExporter {
	e := &periodicExporter{
		errors: prometheus.NewCounterVec(
			prometheus.CounterOpts{
				Namespace: "mesos_exporter",
				Name:      "slave_scrape_errors_total",
				Help:      "Current total scrape errors",
			},
			[]string{"slave"},
		),
		opts: opts,
	}

	if opts.queryURL == "" {
		log.Fatal("Flag '-exporter.url' not set")
	}

	switch opts.mode {
	case "discover":
		log.Info("starting mesos_exporter in scrape mode 'discover'")
		e.queryURL = parseMasterURL(opts.queryURL)

		// Update nr. of mesos slaves.
		e.updateSlaves()
		go runEvery(e.updateSlaves, e.opts.autoDiscoverInterval)

		// Fetch slave metrics every interval.
		go runEvery(e.scrapeSlaves, e.opts.interval)
	case "master":
		log.Info("starting mesos_exporter in scrape mode 'master'")
		e.queryURL = parseMasterURL(opts.queryURL)
	case "slave":
		log.Info("starting mesos_exporter in scrape mode 'slave'")
		e.slaves.urls = []string{opts.queryURL}
	default:
		log.Fatalf("Invalid value '%s' of flag '-exporter.mode' - must be one of 'discover', 'master' or 'slave'", opts.mode)
	}

	return e
}
// Stop implements Storage.
func (s *memorySeriesStorage) Stop() error {
	log.Info("Stopping local storage...")

	log.Info("Stopping maintenance loop...")
	close(s.loopStopping)
	<-s.loopStopped

	log.Info("Stopping chunk eviction...")
	close(s.evictStopping)
	<-s.evictStopped

	// One final checkpoint of the series map and the head chunks.
	if err := s.persistence.checkpointSeriesMapAndHeads(s.fpToSeries, s.fpLocker); err != nil {
		return err
	}

	if err := s.persistence.close(); err != nil {
		return err
	}
	log.Info("Local storage stopped.")
	return nil
}
func (s *memorySeriesStorage) loop() {
	checkpointTimer := time.NewTimer(s.checkpointInterval)

	dirtySeriesCount := 0

	defer func() {
		checkpointTimer.Stop()
		log.Info("Maintenance loop stopped.")
		close(s.loopStopped)
	}()

	memoryFingerprints := s.cycleThroughMemoryFingerprints()
	archivedFingerprints := s.cycleThroughArchivedFingerprints()

loop:
	for {
		select {
		case <-s.loopStopping:
			break loop
		case <-checkpointTimer.C:
			err := s.persistence.checkpointSeriesMapAndHeads(s.fpToSeries, s.fpLocker)
			if err != nil {
				log.Errorln("Error while checkpointing:", err)
			} else {
				dirtySeriesCount = 0
			}
			checkpointTimer.Reset(s.checkpointInterval)
		case fp := <-memoryFingerprints:
			if s.maintainMemorySeries(fp, model.Now().Add(-s.dropAfter)) {
				dirtySeriesCount++
				// Check if we have enough "dirty" series so that we need an early checkpoint.
				// However, if we are already behind persisting chunks, creating a checkpoint
				// would be counterproductive, as it would slow down chunk persisting even more,
				// while in a situation like that, where we are clearly lacking speed of disk
				// maintenance, the best we can do for crash recovery is to persist chunks as
				// quickly as possible. So only checkpoint if the storage is not in "graceful
				// degradation mode".
				if dirtySeriesCount >= s.checkpointDirtySeriesLimit && !s.isDegraded() {
					checkpointTimer.Reset(0)
				}
			}
		case fp := <-archivedFingerprints:
			s.maintainArchivedSeries(fp, model.Now().Add(-s.dropAfter))
		}
	}
	// Wait until both channels are closed.
	for range memoryFingerprints {
	}
	for range archivedFingerprints {
	}
}
func main() {
	flag.Parse()

	exporter := newZooKeeperExporter(flag.Args(), *useExhibitor)
	prometheus.MustRegister(exporter)

	http.Handle(*metricPath, prometheus.Handler())
	http.HandleFunc("/", func(w http.ResponseWriter, r *http.Request) {
		http.Redirect(w, r, *metricPath, http.StatusMovedPermanently)
	})

	log.Info("starting zookeeper_exporter on ", *addr)

	log.Fatal(http.ListenAndServe(*addr, nil))
}
func main() {
	flag.Parse()

	if rancherURL == "" {
		log.Fatal("CATTLE_URL must be set and non-empty")
	}

	log.Info("Starting Prometheus Exporter for Rancher. Listen Address: ", listenAddress, " metricsPath: ", metricsPath, " rancherURL: ", rancherURL, " AccessKey: ", accessKey)
	log.Info("System Services Reported on:", hideSys)

	// Register internal metrics
	measure.Init()

	// Pass URL & Credentials out to the Exporters
	servicesExporter := services.NewExporter(rancherURL, accessKey, secretKey, hideSys)
	stacksExporter := stacks.NewExporter(rancherURL, accessKey, secretKey, hideSys)
	hostsExporter := hosts.NewExporter(rancherURL, accessKey, secretKey)

	// Register Metrics from each of the endpoints
	prometheus.MustRegister(servicesExporter)
	prometheus.MustRegister(stacksExporter)
	prometheus.MustRegister(hostsExporter)

	http.Handle(metricsPath, prometheus.Handler())
	http.HandleFunc("/", func(w http.ResponseWriter, r *http.Request) {
		w.Write([]byte(`<html>
			<head><title>Rancher exporter</title></head>
			<body>
			<h1>rancher exporter</h1>
			<p><a href='` + metricsPath + `'>Metrics</a></p>
			</body>
			</html>
		`))
	})

	log.Infof("Starting Server: %s", listenAddress)
	log.Fatal(http.ListenAndServe(listenAddress, nil))
}
// Start implements Storage.
func (s *memorySeriesStorage) Start() (err error) {
	var syncStrategy syncStrategy
	switch s.options.SyncStrategy {
	case Never:
		syncStrategy = func() bool { return false }
	case Always:
		syncStrategy = func() bool { return true }
	case Adaptive:
		syncStrategy = func() bool { return !s.isDegraded() }
	default:
		panic("unknown sync strategy")
	}

	var p *persistence
	p, err = newPersistence(s.options.PersistenceStoragePath, s.options.Dirty, s.options.PedanticChecks, syncStrategy)
	if err != nil {
		return err
	}
	s.persistence = p
	// Persistence must start running before loadSeriesMapAndHeads() is called.
	go s.persistence.run()

	defer func() {
		if err != nil {
			if e := p.close(); e != nil {
				log.Errorln("Error closing persistence:", e)
			}
		}
	}()

	log.Info("Loading series map and head chunks...")
	s.fpToSeries, s.numChunksToPersist, err = p.loadSeriesMapAndHeads()
	if err != nil {
		return err
	}
	log.Infof("%d series loaded.", s.fpToSeries.length())
	s.numSeries.Set(float64(s.fpToSeries.length()))

	s.mapper, err = newFPMapper(s.fpToSeries, p)
	if err != nil {
		return err
	}

	go s.handleEvictList()
	go s.loop()

	return nil
}
func (s *memorySeriesStorage) handleEvictList() {
	ticker := time.NewTicker(maxEvictInterval)
	count := 0

	for {
		// To batch up evictions a bit, this tries evictions at least
		// once per evict interval, but earlier if the number of evict
		// requests with evict==true that have happened since the last
		// evict run is more than maxMemoryChunks/1000.
		select {
		case req := <-s.evictRequests:
			if req.evict {
				req.cd.evictListElement = s.evictList.PushBack(req.cd)
				count++
				if count > s.maxMemoryChunks/1000 {
					s.maybeEvict()
					count = 0
				}
			} else {
				if req.cd.evictListElement != nil {
					s.evictList.Remove(req.cd.evictListElement)
					req.cd.evictListElement = nil
				}
			}
		case <-ticker.C:
			if s.evictList.Len() > 0 {
				s.maybeEvict()
			}
		case <-s.evictStopping:
			// Drain evictRequests forever in a goroutine to not let
			// requesters hang.
			go func() {
				for {
					<-s.evictRequests
				}
			}()
			ticker.Stop()
			log.Info("Chunk eviction stopped.")
			close(s.evictStopped)
			return
		}
	}
}
func testChunk(t *testing.T, encoding chunkEncoding) {
	samples := make(model.Samples, 500000)
	for i := range samples {
		samples[i] = &model.Sample{
			Timestamp: model.Time(i),
			Value:     model.SampleValue(float64(i) * 0.2),
		}
	}
	s, closer := NewTestStorage(t, encoding)
	defer closer.Close()

	for _, sample := range samples {
		s.Append(sample)
	}
	s.WaitForIndexing()

	for m := range s.fpToSeries.iter() {
		s.fpLocker.Lock(m.fp)

		var values []model.SamplePair
		for _, cd := range m.series.chunkDescs {
			if cd.isEvicted() {
				continue
			}
			for sample := range cd.c.newIterator().values() {
				values = append(values, *sample)
			}
		}

		for i, v := range values {
			if samples[i].Timestamp != v.Timestamp {
				t.Errorf("%d. Got %v; want %v", i, v.Timestamp, samples[i].Timestamp)
			}
			if samples[i].Value != v.Value {
				t.Errorf("%d. Got %v; want %v", i, v.Value, samples[i].Value)
			}
		}
		s.fpLocker.Unlock(m.fp)
	}
	log.Info("test done, closing")
}
func (e *Exporter) gatherMetrics(rancherURL string, accessKey string, secretKey string, ch chan<- prometheus.Metric) error {
	// Reset gaugeVecs back to 0
	for _, m := range e.gaugeVecs {
		m.Reset()
	}

	// Set the correct API endpoint for hosts
	endpoint := (rancherURL + "/hosts/")

	// Scrape EndPoint for JSON Data
	data := new(Data)
	err := utils.GetJson(endpoint, accessKey, secretKey, &data)
	if err != nil {
		log.Error("Error getting JSON from URL ", endpoint)
		return err
	}
	log.Info("JSON Fetched for hosts: ", data)

	// Host Metrics
	for _, x := range data.Data {
		// Pre-defines the known states from the Rancher API
		states := []string{"activating", "active", "deactivating", "error", "erroring", "inactive", "provisioned", "purged", "purging", "registering", "removed", "removing", "requested", "restoring", "updating_active", "updating_inactive"}

		// Set the state of the host to 1 when it matches one of the known states
		for _, y := range states {
			if x.State == y {
				e.gaugeVecs["HostState"].With(prometheus.Labels{"rancherURL": rancherURL, "name": x.Hostname, "state": y}).Set(1)
			} else {
				e.gaugeVecs["HostState"].With(prometheus.Labels{"rancherURL": rancherURL, "name": x.Hostname, "state": y}).Set(0)
			}
		}
	}
	return nil
}
func (w WebService) ServeForever(pathPrefix string) error {
	http.Handle(pathPrefix+"favicon.ico", http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		http.Error(w, "", 404)
	}))

	http.HandleFunc("/", prometheus.InstrumentHandlerFunc("index", func(rw http.ResponseWriter, req *http.Request) {
		// The "/" pattern matches everything, so we need to check
		// that we're at the root here.
		if req.URL.Path == pathPrefix {
			w.AlertsHandler.ServeHTTP(rw, req)
		} else if req.URL.Path == strings.TrimRight(pathPrefix, "/") {
			http.Redirect(rw, req, pathPrefix, http.StatusFound)
		} else if !strings.HasPrefix(req.URL.Path, pathPrefix) {
			// We're running under a prefix but the user requested something
			// outside of it. Let's see if this page exists under the prefix.
			http.Redirect(rw, req, pathPrefix+strings.TrimLeft(req.URL.Path, "/"), http.StatusFound)
		} else {
			http.NotFound(rw, req)
		}
	}))

	http.Handle(pathPrefix+"alerts", prometheus.InstrumentHandler("alerts", w.AlertsHandler))
	http.Handle(pathPrefix+"silences", prometheus.InstrumentHandler("silences", w.SilencesHandler))
	http.Handle(pathPrefix+"status", prometheus.InstrumentHandler("status", w.StatusHandler))

	http.Handle(pathPrefix+"metrics", prometheus.Handler())

	if *useLocalAssets {
		http.Handle(pathPrefix+"static/", http.StripPrefix(pathPrefix+"static/", http.FileServer(http.Dir("web/static"))))
	} else {
		http.Handle(pathPrefix+"static/", http.StripPrefix(pathPrefix+"static/", new(blob.Handler)))
	}

	http.Handle(pathPrefix+"api/", w.AlertManagerService.Handler())

	log.Info("listening on ", *listenAddress)

	return http.ListenAndServe(*listenAddress, nil)
}
// Run the rule manager's periodic rule evaluation.
func (m *Manager) Run() {
	defer log.Info("Rule manager stopped.")

	m.Lock()
	lastInterval := m.interval
	m.Unlock()

	ticker := time.NewTicker(lastInterval)
	defer ticker.Stop()

	for {
		// The outer select clause makes sure that m.done is looked at
		// first. Otherwise, if m.runIteration takes longer than
		// m.interval, there is only a 50% chance that m.done will be
		// looked at before the next m.runIteration call happens.
		select {
		case <-m.done:
			return
		default:
			select {
			case <-ticker.C:
				start := time.Now()
				m.runIteration()
				iterationDuration.Observe(float64(time.Since(start) / time.Millisecond))

				m.Lock()
				if lastInterval != m.interval {
					ticker.Stop()
					ticker = time.NewTicker(m.interval)
					lastInterval = m.interval
				}
				m.Unlock()
			case <-m.done:
				return
			}
		}
	}
}
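// The nested select above is a general way to give one channel priority over
// another: the outer select with a default branch guarantees the done channel
// is checked before the inner select races the ticker against it. A minimal
// standalone sketch of that pattern (assumes the time package; the function
// and parameter names are illustrative, not part of the rule manager).
func runWithPriority(done <-chan struct{}, tick <-chan time.Time, work func()) {
	for {
		select {
		case <-done:
			// done is always observed first, even if tick is also ready.
			return
		default:
			select {
			case <-tick:
				work()
			case <-done:
				return
			}
		}
	}
}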
func main() {
	flag.Parse()

	opts := &exporterOpts{
		autoDiscoverInterval: *autoDiscoverInterval,
		interval:             *scrapeInterval,
		mode:                 *scrapeMode,
		queryURL:             strings.TrimRight(*queryURL, "/"),
	}
	exporter := newMesosExporter(opts)
	prometheus.MustRegister(exporter)

	http.Handle(*metricsPath, prometheus.Handler())
	http.HandleFunc("/status", func(w http.ResponseWriter, r *http.Request) {
		fmt.Fprintf(w, "OK")
	})
	http.HandleFunc("/", func(w http.ResponseWriter, r *http.Request) {
		http.Redirect(w, r, *metricsPath, http.StatusMovedPermanently)
	})

	log.Info("starting mesos_exporter on ", *addr)

	log.Fatal(http.ListenAndServe(*addr, nil))
}
// Stop the rule manager's rule evaluation cycles.
func (m *Manager) Stop() {
	log.Info("Stopping rule manager...")
	m.done <- true
}
// recoverFromCrash is called by loadSeriesMapAndHeads if the persistence
// appears to be dirty after the loading (either because the loading resulted in
// an error or because the persistence was dirty from the start). Not goroutine
// safe. Only call before anything else is running (except index processing
// queue as started by newPersistence).
func (p *persistence) recoverFromCrash(fingerprintToSeries map[clientmodel.Fingerprint]*memorySeries) error {
	// TODO(beorn): We need proper tests for the crash recovery.
	log.Warn("Starting crash recovery. Prometheus is inoperational until complete.")
	log.Warn("To avoid crash recovery in future, shutdown Prometheus with SIGTERM or a HTTP POST to /-/quit.")

	fpsSeen := map[clientmodel.Fingerprint]struct{}{}
	count := 0
	seriesDirNameFmt := fmt.Sprintf("%%0%dx", seriesDirNameLen)

	// Delete the fingerprint mapping file as it might be stale or
	// corrupt. We'll rebuild the mappings as we go.
	os.Remove(p.mappingsFileName())
	// The mappings to rebuild.
	fpm := fpMappings{}

	log.Info("Scanning files.")
	for i := 0; i < 1<<(seriesDirNameLen*4); i++ {
		dirname := path.Join(p.basePath, fmt.Sprintf(seriesDirNameFmt, i))
		dir, err := os.Open(dirname)
		if os.IsNotExist(err) {
			continue
		}
		if err != nil {
			return err
		}
		defer dir.Close()
		for fis := []os.FileInfo{}; err != io.EOF; fis, err = dir.Readdir(1024) {
			if err != nil {
				return err
			}
			for _, fi := range fis {
				fp, ok := p.sanitizeSeries(dirname, fi, fingerprintToSeries, fpm)
				if ok {
					fpsSeen[fp] = struct{}{}
				}
				count++
				if count%10000 == 0 {
					log.Infof("%d files scanned.", count)
				}
			}
		}
	}
	log.Infof("File scan complete. %d series found.", len(fpsSeen))

	log.Info("Checking for series without series file.")
	for fp, s := range fingerprintToSeries {
		if _, seen := fpsSeen[fp]; !seen {
			// fp exists in fingerprintToSeries, but has no representation on disk.
			if s.persistWatermark == len(s.chunkDescs) {
				// Oops, everything including the head chunk was
				// already persisted, but nothing on disk.
				// Thus, we lost that series completely. Clean
				// up the remnants.
				delete(fingerprintToSeries, fp)
				if err := p.purgeArchivedMetric(fp); err != nil {
					// Purging the archived metric didn't work, so try
					// to unindex it, just in case it's in the indexes.
					p.unindexMetric(fp, s.metric)
				}
				log.Warnf("Lost series detected: fingerprint %v, metric %v.", fp, s.metric)
				continue
			}
			// If we are here, the only chunks we have are the chunks in the checkpoint.
			// Adjust things accordingly.
			if s.persistWatermark > 0 || s.chunkDescsOffset != 0 {
				minLostChunks := s.persistWatermark + s.chunkDescsOffset
				if minLostChunks <= 0 {
					log.Warnf(
						"Possible loss of chunks for fingerprint %v, metric %v.",
						fp, s.metric,
					)
				} else {
					log.Warnf(
						"Lost at least %d chunks for fingerprint %v, metric %v.",
						minLostChunks, fp, s.metric,
					)
				}
				s.chunkDescs = append(
					make([]*chunkDesc, 0, len(s.chunkDescs)-s.persistWatermark),
					s.chunkDescs[s.persistWatermark:]...,
				)
				numMemChunkDescs.Sub(float64(s.persistWatermark))
				s.persistWatermark = 0
				s.chunkDescsOffset = 0
			}
			maybeAddMapping(fp, s.metric, fpm)
			fpsSeen[fp] = struct{}{} // Add so that fpsSeen is complete.
		}
	}
	log.Info("Check for series without series file complete.")

	if err := p.cleanUpArchiveIndexes(fingerprintToSeries, fpsSeen, fpm); err != nil {
		return err
	}
	if err := p.rebuildLabelIndexes(fingerprintToSeries); err != nil {
		return err
	}
	// Finally rewrite the mappings file if there are any mappings.
	if len(fpm) > 0 {
		if err := p.checkpointFPMappings(fpm); err != nil {
			return err
		}
	}

	p.setDirty(false)
	log.Warn("Crash recovery complete.")
	return nil
}
// Main manages the startup and shutdown lifecycle of the entire Prometheus server.
func Main() int {
	if err := parse(os.Args[1:]); err != nil {
		return 2
	}

	printVersion()
	if cfg.printVersion {
		return 0
	}

	var reloadables []Reloadable

	var (
		memStorage     = local.NewMemorySeriesStorage(&cfg.storage)
		remoteStorage  = remote.New(&cfg.remote)
		sampleAppender = storage.Fanout{memStorage}
	)
	if remoteStorage != nil {
		sampleAppender = append(sampleAppender, remoteStorage)
		reloadables = append(reloadables, remoteStorage)
	}

	var (
		notificationHandler = notification.NewNotificationHandler(&cfg.notification)
		targetManager       = retrieval.NewTargetManager(sampleAppender)
		queryEngine         = promql.NewEngine(memStorage, &cfg.queryEngine)
	)

	ruleManager := rules.NewManager(&rules.ManagerOptions{
		SampleAppender:      sampleAppender,
		NotificationHandler: notificationHandler,
		QueryEngine:         queryEngine,
		ExternalURL:         cfg.web.ExternalURL,
	})

	flags := map[string]string{}
	cfg.fs.VisitAll(func(f *flag.Flag) {
		flags[f.Name] = f.Value.String()
	})

	status := &web.PrometheusStatus{
		TargetPools: targetManager.Pools,
		Rules:       ruleManager.Rules,
		Flags:       flags,
		Birth:       time.Now(),
	}

	webHandler := web.New(memStorage, queryEngine, ruleManager, status, &cfg.web)

	reloadables = append(reloadables, status, targetManager, ruleManager, webHandler, notificationHandler)

	if !reloadConfig(cfg.configFile, reloadables...) {
		return 1
	}

	// Wait for reload or termination signals. Start the handler for SIGHUP as
	// early as possible, but ignore it until we are ready to handle reloading
	// our config.
	hup := make(chan os.Signal)
	hupReady := make(chan bool)
	signal.Notify(hup, syscall.SIGHUP)
	go func() {
		<-hupReady
		for {
			select {
			case <-hup:
			case <-webHandler.Reload():
			}
			reloadConfig(cfg.configFile, reloadables...)
		}
	}()

	// Start all components.
	if err := memStorage.Start(); err != nil {
		log.Errorln("Error opening memory series storage:", err)
		return 1
	}
	defer func() {
		if err := memStorage.Stop(); err != nil {
			log.Errorln("Error stopping storage:", err)
		}
	}()

	if remoteStorage != nil {
		prometheus.MustRegister(remoteStorage)

		go remoteStorage.Run()
		defer remoteStorage.Stop()
	}
	// The storage has to be fully initialized before registering.
	prometheus.MustRegister(memStorage)
	prometheus.MustRegister(notificationHandler)
	prometheus.MustRegister(configSuccess)
	prometheus.MustRegister(configSuccessTime)

	go ruleManager.Run()
	defer ruleManager.Stop()

	go notificationHandler.Run()
	defer notificationHandler.Stop()

	go targetManager.Run()
	defer targetManager.Stop()

	defer queryEngine.Stop()

	go webHandler.Run()

	// Wait for reload or termination signals.
	close(hupReady) // Unblock SIGHUP handler.

	term := make(chan os.Signal)
	signal.Notify(term, os.Interrupt, syscall.SIGTERM)
	select {
	case <-term:
		log.Warn("Received SIGTERM, exiting gracefully...")
	case <-webHandler.Quit():
		log.Warn("Received termination request via web service, exiting gracefully...")
	case err := <-webHandler.ListenError():
		log.Errorln("Error starting web server, exiting gracefully:", err)
	}

	log.Info("See you next time!")
	return 0
}
func Main() int {
	if err := parse(os.Args[1:]); err != nil {
		return 2
	}

	versionInfoTmpl.Execute(os.Stdout, BuildInfo)

	if cfg.printVersion {
		return 0
	}

	memStorage := local.NewMemorySeriesStorage(&cfg.storage)

	var (
		sampleAppender      storage.SampleAppender
		remoteStorageQueues []*remote.StorageQueueManager
	)
	if cfg.opentsdbURL == "" && cfg.influxdbURL == "" {
		log.Warnf("No remote storage URLs provided; not sending any samples to long-term storage")
		sampleAppender = memStorage
	} else {
		fanout := storage.Fanout{memStorage}

		addRemoteStorage := func(c remote.StorageClient) {
			qm := remote.NewStorageQueueManager(c, 100*1024)
			fanout = append(fanout, qm)
			remoteStorageQueues = append(remoteStorageQueues, qm)
		}
		if cfg.opentsdbURL != "" {
			addRemoteStorage(opentsdb.NewClient(cfg.opentsdbURL, cfg.remoteStorageTimeout))
		}
		if cfg.influxdbURL != "" {
			addRemoteStorage(influxdb.NewClient(cfg.influxdbURL, cfg.remoteStorageTimeout, cfg.influxdbDatabase, cfg.influxdbRetentionPolicy))
		}

		sampleAppender = fanout
	}

	var (
		notificationHandler = notification.NewNotificationHandler(&cfg.notification)
		targetManager       = retrieval.NewTargetManager(sampleAppender)
		queryEngine         = promql.NewEngine(memStorage, &cfg.queryEngine)
	)

	ruleManager := rules.NewManager(&rules.ManagerOptions{
		SampleAppender:      sampleAppender,
		NotificationHandler: notificationHandler,
		QueryEngine:         queryEngine,
		PrometheusURL:       cfg.prometheusURL,
		PathPrefix:          cfg.web.PathPrefix,
	})

	flags := map[string]string{}
	cfg.fs.VisitAll(func(f *flag.Flag) {
		flags[f.Name] = f.Value.String()
	})

	status := &web.PrometheusStatus{
		BuildInfo:   BuildInfo,
		TargetPools: targetManager.Pools,
		Rules:       ruleManager.Rules,
		Flags:       flags,
		Birth:       time.Now(),
	}

	webHandler := web.New(memStorage, queryEngine, ruleManager, status, &cfg.web)

	if !reloadConfig(cfg.configFile, status, targetManager, ruleManager) {
		os.Exit(1)
	}

	// Wait for reload or termination signals. Start the handler for SIGHUP as
	// early as possible, but ignore it until we are ready to handle reloading
	// our config.
	hup := make(chan os.Signal)
	hupReady := make(chan bool)
	signal.Notify(hup, syscall.SIGHUP)
	go func() {
		<-hupReady
		for range hup {
			reloadConfig(cfg.configFile, status, targetManager, ruleManager)
		}
	}()

	// Start all components.
	if err := memStorage.Start(); err != nil {
		log.Errorln("Error opening memory series storage:", err)
		return 1
	}
	defer func() {
		if err := memStorage.Stop(); err != nil {
			log.Errorln("Error stopping storage:", err)
		}
	}()

	// The storage has to be fully initialized before registering.
	registry.MustRegister(memStorage)
	registry.MustRegister(notificationHandler)

	for _, q := range remoteStorageQueues {
		registry.MustRegister(q)

		go q.Run()
		defer q.Stop()
	}

	go ruleManager.Run()
	defer ruleManager.Stop()

	go notificationHandler.Run()
	defer notificationHandler.Stop()

	go targetManager.Run()
	defer targetManager.Stop()

	defer queryEngine.Stop()

	go webHandler.Run()

	// Wait for reload or termination signals.
	close(hupReady) // Unblock SIGHUP handler.

	term := make(chan os.Signal)
	signal.Notify(term, os.Interrupt, syscall.SIGTERM)
	select {
	case <-term:
		log.Warn("Received SIGTERM, exiting gracefully...")
	case <-webHandler.Quit():
		log.Warn("Received termination request via web service, exiting gracefully...")
	}

	close(hup)

	log.Info("See you next time!")
	return 0
}
// Stop shuts down the notification handler.
func (n *NotificationHandler) Stop() {
	log.Info("Stopping notification handler...")
	close(n.pendingNotifications)
	<-n.stopped
	log.Info("Notification handler stopped.")
}
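// Several Stop methods in this collection follow the same close-then-wait
// handshake: close the input channel, then block on a second channel that the
// worker closes once it has drained. A minimal standalone sketch of that
// pattern; the worker type and its fields are illustrative, not taken from
// the notification handler.
type worker struct {
	input   chan string
	stopped chan struct{}
}

func (w *worker) run(handle func(string)) {
	// range exits once input has been closed and fully drained.
	for item := range w.input {
		handle(item)
	}
	// Signal that shutdown has completed.
	close(w.stopped)
}

func (w *worker) Stop() {
	close(w.input)
	<-w.stopped
}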
// checkpointSeriesMapAndHeads persists the fingerprint to memory-series mapping
// and all non persisted chunks. Do not call concurrently with
// loadSeriesMapAndHeads. This method will only write heads format v2, but
// loadSeriesMapAndHeads can also understand v1.
//
// Description of the file format (for both, v1 and v2):
//
// (1) Magic string (const headsMagicString).
//
// (2) Varint-encoded format version (const headsFormatVersion).
//
// (3) Number of series in checkpoint as big-endian uint64.
//
// (4) Repeated once per series:
//
// (4.1) A flag byte, see flag constants above. (Present but unused in v2.)
//
// (4.2) The fingerprint as big-endian uint64.
//
// (4.3) The metric as defined by codable.Metric.
//
// (4.4) The varint-encoded persistWatermark. (Missing in v1.)
//
// (4.5) The modification time of the series file as nanoseconds elapsed since
// January 1, 1970 UTC. -1 if the modification time is unknown or no series file
// exists yet. (Missing in v1.)
//
// (4.6) The varint-encoded chunkDescsOffset.
//
// (4.7) The varint-encoded savedFirstTime.
//
// (4.8) The varint-encoded number of chunk descriptors.
//
// (4.9) Repeated once per chunk descriptor, oldest to most recent, either
// variant 4.9.1 (if index < persistWatermark) or variant 4.9.2 (if index >=
// persistWatermark). In v1, everything is variant 4.9.1 except for a
// non-persisted head-chunk (determined by the flags).
//
// (4.9.1.1) The varint-encoded first time.
// (4.9.1.2) The varint-encoded last time.
//
// (4.9.2.1) A byte defining the chunk type.
// (4.9.2.2) The chunk itself, marshaled with the marshal() method.
//
func (p *persistence) checkpointSeriesMapAndHeads(fingerprintToSeries *seriesMap, fpLocker *fingerprintLocker) (err error) {
	log.Info("Checkpointing in-memory metrics and chunks...")
	begin := time.Now()
	f, err := os.OpenFile(p.headsTempFileName(), os.O_WRONLY|os.O_TRUNC|os.O_CREATE, 0640)
	if err != nil {
		return
	}

	defer func() {
		f.Sync()
		closeErr := f.Close()
		if err != nil {
			return
		}
		err = closeErr
		if err != nil {
			return
		}
		err = os.Rename(p.headsTempFileName(), p.headsFileName())
		duration := time.Since(begin)
		p.checkpointDuration.Set(float64(duration) / float64(time.Millisecond))
		log.Infof("Done checkpointing in-memory metrics and chunks in %v.", duration)
	}()

	w := bufio.NewWriterSize(f, fileBufSize)

	if _, err = w.WriteString(headsMagicString); err != nil {
		return
	}
	var numberOfSeriesOffset int
	if numberOfSeriesOffset, err = codable.EncodeVarint(w, headsFormatVersion); err != nil {
		return
	}
	numberOfSeriesOffset += len(headsMagicString)
	numberOfSeriesInHeader := uint64(fingerprintToSeries.length())
	// We have to write the number of series as uint64 because we might need
	// to overwrite it later, and a varint might change byte width then.
	if err = codable.EncodeUint64(w, numberOfSeriesInHeader); err != nil {
		return
	}

	iter := fingerprintToSeries.iter()
	defer func() {
		// Consume the iterator in any case to not leak goroutines.
		for range iter {
		}
	}()

	var realNumberOfSeries uint64
	for m := range iter {
		func() { // Wrapped in function to use defer for unlocking the fp.
			fpLocker.Lock(m.fp)
			defer fpLocker.Unlock(m.fp)

			if len(m.series.chunkDescs) == 0 {
				// This series was completely purged or archived in the meantime. Ignore.
				return
			}
			realNumberOfSeries++
			// seriesFlags left empty in v2.
			if err = w.WriteByte(0); err != nil {
				return
			}
			if err = codable.EncodeUint64(w, uint64(m.fp)); err != nil {
				return
			}
			var buf []byte
			buf, err = codable.Metric(m.series.metric).MarshalBinary()
			if err != nil {
				return
			}
			w.Write(buf)
			if _, err = codable.EncodeVarint(w, int64(m.series.persistWatermark)); err != nil {
				return
			}
			if m.series.modTime.IsZero() {
				if _, err = codable.EncodeVarint(w, -1); err != nil {
					return
				}
			} else {
				if _, err = codable.EncodeVarint(w, m.series.modTime.UnixNano()); err != nil {
					return
				}
			}
			if _, err = codable.EncodeVarint(w, int64(m.series.chunkDescsOffset)); err != nil {
				return
			}
			if _, err = codable.EncodeVarint(w, int64(m.series.savedFirstTime)); err != nil {
				return
			}
			if _, err = codable.EncodeVarint(w, int64(len(m.series.chunkDescs))); err != nil {
				return
			}
			for i, chunkDesc := range m.series.chunkDescs {
				if i < m.series.persistWatermark {
					if _, err = codable.EncodeVarint(w, int64(chunkDesc.firstTime())); err != nil {
						return
					}
					if _, err = codable.EncodeVarint(w, int64(chunkDesc.lastTime())); err != nil {
						return
					}
				} else {
					// This is the non-persisted head chunk. Fully marshal it.
					if err = w.WriteByte(byte(chunkDesc.c.encoding())); err != nil {
						return
					}
					if err = chunkDesc.c.marshal(w); err != nil {
						return
					}
				}
			}
			// Series is checkpointed now, so declare it clean.
			m.series.dirty = false
		}()
		if err != nil {
			return
		}
	}
	if err = w.Flush(); err != nil {
		return
	}
	if realNumberOfSeries != numberOfSeriesInHeader {
		// The number of series has changed in the meantime.
		// Rewrite it in the header.
		if _, err = f.Seek(int64(numberOfSeriesOffset), os.SEEK_SET); err != nil {
			return
		}
		if err = codable.EncodeUint64(f, realNumberOfSeries); err != nil {
			return
		}
	}
	return
}
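// A standalone illustration (not the persistence code itself, assumes bufio,
// encoding/binary, io, os) of why the series count above is written as a
// fixed-width big-endian uint64 rather than a varint: a varint could change
// byte width when the real count is patched in afterwards, shifting the rest
// of the file, whereas a fixed-width field can be overwritten in place.
func writeWithPatchedCount(f *os.File, writeRecords func(w *bufio.Writer) (uint64, error)) error {
	w := bufio.NewWriter(f)
	countOffset := int64(0) // for simplicity, the count is the first field here

	// Placeholder count, fixed width of 8 bytes.
	if err := binary.Write(w, binary.BigEndian, uint64(0)); err != nil {
		return err
	}
	n, err := writeRecords(w)
	if err != nil {
		return err
	}
	if err := w.Flush(); err != nil {
		return err
	}
	// Patch the real count into the fixed-width slot.
	if _, err := f.Seek(countOffset, io.SeekStart); err != nil {
		return err
	}
	return binary.Write(f, binary.BigEndian, n)
}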