// ServeHTTP serves a file from the embedded StaticFiles store. It defaults to
// index.html, derives the content type from the file contents or, failing
// that, from the file extension, and sets a three-day Cache-Control header.
func (h Handler) ServeHTTP(w http.ResponseWriter, r *http.Request) {
    ctx := route.Context(r)
    name := strings.Trim(route.Param(ctx, "filepath"), "/")
    if name == "" {
        name = "index.html"
    }

    file, err := GetFile(StaticFiles, name)
    if err != nil {
        if err != io.EOF {
            log.Warn("Could not get file: ", err)
        }
        w.WriteHeader(http.StatusNotFound)
        return
    }

    contentType := http.DetectContentType(file)
    if strings.Contains(contentType, "text/plain") || strings.Contains(contentType, "application/octet-stream") {
        parts := strings.Split(name, ".")
        contentType = mimeMap[parts[len(parts)-1]]
    }
    w.Header().Set("Content-Type", contentType)
    w.Header().Set("Cache-Control", "public, max-age=259200")
    w.Write(file)
}
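// The handler above falls back to an extension-based lookup when
// http.DetectContentType only reports a generic type. Below is a minimal,
// standalone sketch of that fallback technique; fallbackMIME and the sample
// file contents are illustrative assumptions, not the mimeMap used above.
package main

import (
    "fmt"
    "net/http"
    "strings"
)

// fallbackMIME is a hypothetical stand-in for a map like mimeMap above.
var fallbackMIME = map[string]string{
    "css": "text/css",
    "js":  "application/javascript",
    "svg": "image/svg+xml",
}

func contentTypeFor(name string, data []byte) string {
    ct := http.DetectContentType(data) // sniffs the first bytes of the content
    if strings.Contains(ct, "text/plain") || strings.Contains(ct, "application/octet-stream") {
        parts := strings.Split(name, ".")
        if mt, ok := fallbackMIME[parts[len(parts)-1]]; ok {
            return mt
        }
    }
    return ct
}

func main() {
    // CSS sniffs as text/plain, so the extension decides the content type.
    fmt.Println(contentTypeFor("app.css", []byte("body { color: red }")))
}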
// Append implements Storage.
func (s *memorySeriesStorage) Append(sample *clientmodel.Sample) {
    if s.getNumChunksToPersist() >= s.maxChunksToPersist {
        log.Warnf(
            "%d chunks waiting for persistence, sample ingestion suspended.",
            s.getNumChunksToPersist(),
        )
        for s.getNumChunksToPersist() >= s.maxChunksToPersist {
            time.Sleep(time.Second)
        }
        log.Warn("Sample ingestion resumed.")
    }
    rawFP := sample.Metric.FastFingerprint()
    s.fpLocker.Lock(rawFP)
    fp, err := s.mapper.mapFP(rawFP, sample.Metric)
    if err != nil {
        log.Errorf("Error while mapping fingerprint %v: %v", rawFP, err)
        s.persistence.setDirty(true)
    }
    if fp != rawFP {
        // Switch locks.
        s.fpLocker.Unlock(rawFP)
        s.fpLocker.Lock(fp)
    }
    series := s.getOrCreateSeries(fp, sample.Metric)
    completedChunksCount := series.add(&metric.SamplePair{
        Value:     sample.Value,
        Timestamp: sample.Timestamp,
    })
    s.fpLocker.Unlock(fp)
    s.ingestedSamplesCount.Inc()
    s.incNumChunksToPersist(completedChunksCount)
}
// handleNotification fans an alert out to every notification mechanism in the
// given configuration. Except for PagerDuty, each mechanism is skipped for
// resolved alerts unless send_resolved is enabled.
func (n *notifier) handleNotification(a *Alert, op notificationOp, config *pb.NotificationConfig) {
    for _, pdConfig := range config.PagerdutyConfig {
        if err := n.sendPagerDutyNotification(pdConfig.GetServiceKey(), op, a); err != nil {
            log.Errorln("Error sending PagerDuty notification:", err)
        }
    }
    for _, emailConfig := range config.EmailConfig {
        if op == notificationOpResolve && !emailConfig.GetSendResolved() {
            continue
        }
        if *smtpSmartHost == "" {
            log.Warn("No SMTP smarthost configured, not sending email notification.")
            continue
        }
        if err := n.sendEmailNotification(emailConfig.GetEmail(), op, a); err != nil {
            log.Errorln("Error sending email notification:", err)
        }
    }
    for _, poConfig := range config.PushoverConfig {
        if op == notificationOpResolve && !poConfig.GetSendResolved() {
            continue
        }
        if err := n.sendPushoverNotification(poConfig.GetToken(), op, poConfig.GetUserKey(), a); err != nil {
            log.Errorln("Error sending Pushover notification:", err)
        }
    }
    for _, hcConfig := range config.HipchatConfig {
        if op == notificationOpResolve && !hcConfig.GetSendResolved() {
            continue
        }
        if err := n.sendHipChatNotification(op, hcConfig, a); err != nil {
            log.Errorln("Error sending HipChat notification:", err)
        }
    }
    for _, scConfig := range config.SlackConfig {
        if op == notificationOpResolve && !scConfig.GetSendResolved() {
            continue
        }
        if err := n.sendSlackNotification(op, scConfig, a); err != nil {
            log.Errorln("Error sending Slack notification:", err)
        }
    }
    for _, fdConfig := range config.FlowdockConfig {
        if op == notificationOpResolve && !fdConfig.GetSendResolved() {
            continue
        }
        if err := n.sendFlowdockNotification(op, fdConfig, a); err != nil {
            log.Errorln("Error sending Flowdock notification:", err)
        }
    }
    for _, whConfig := range config.WebhookConfig {
        if op == notificationOpResolve && !whConfig.GetSendResolved() {
            continue
        }
        if err := n.sendWebhookNotification(op, whConfig, a); err != nil {
            log.Errorln("Error sending Webhook notification:", err)
        }
    }
}
// Append queues a sample to be sent to the remote storage. It drops the
// sample on the floor if the queue is full. It implements
// storage.SampleAppender.
func (t *StorageQueueManager) Append(s *clientmodel.Sample) {
    select {
    case t.queue <- s:
    default:
        t.samplesCount.WithLabelValues(dropped).Inc()
        log.Warn("Remote storage queue full, discarding sample.")
    }
}
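// The Append above relies on a non-blocking send: the select with a default
// case enqueues the sample if there is room and otherwise drops it immediately
// instead of blocking the caller. A minimal, self-contained sketch of that
// pattern (queue size and element type are illustrative assumptions):
package main

import "fmt"

func main() {
    queue := make(chan int, 2) // bounded queue, capacity 2
    dropped := 0

    for i := 0; i < 5; i++ {
        select {
        case queue <- i: // room left: enqueue without blocking
        default: // queue full: drop and count
            dropped++
        }
    }
    fmt.Println(len(queue), "queued,", dropped, "dropped") // 2 queued, 3 dropped
}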
// Append implements Storage.
func (s *memorySeriesStorage) Append(sample *model.Sample) {
    for ln, lv := range sample.Metric {
        if len(lv) == 0 {
            delete(sample.Metric, ln)
        }
    }
    if s.getNumChunksToPersist() >= s.maxChunksToPersist {
        log.Warnf(
            "%d chunks waiting for persistence, sample ingestion suspended.",
            s.getNumChunksToPersist(),
        )
        for s.getNumChunksToPersist() >= s.maxChunksToPersist {
            time.Sleep(time.Second)
        }
        log.Warn("Sample ingestion resumed.")
    }
    rawFP := sample.Metric.FastFingerprint()
    s.fpLocker.Lock(rawFP)
    fp, err := s.mapper.mapFP(rawFP, sample.Metric)
    if err != nil {
        log.Errorf("Error while mapping fingerprint %v: %v", rawFP, err)
        s.persistence.setDirty(true)
    }
    if fp != rawFP {
        // Switch locks.
        s.fpLocker.Unlock(rawFP)
        s.fpLocker.Lock(fp)
    }
    series := s.getOrCreateSeries(fp, sample.Metric)

    if sample.Timestamp <= series.lastTime {
        // Don't log and track equal timestamps, as they are a common occurrence
        // when using client-side timestamps (e.g. Pushgateway or federation).
        // It would be even better to also compare the sample values here, but
        // we don't have efficient access to a series's last value.
        if sample.Timestamp != series.lastTime {
            log.Warnf("Ignoring sample with out-of-order timestamp for fingerprint %v (%v): %v is not after %v", fp, series.metric, sample.Timestamp, series.lastTime)
            s.outOfOrderSamplesCount.Inc()
        }
        s.fpLocker.Unlock(fp)
        return
    }
    completedChunksCount := series.add(&model.SamplePair{
        Value:     sample.Value,
        Timestamp: sample.Timestamp,
    })
    s.fpLocker.Unlock(fp)
    s.ingestedSamplesCount.Inc()
    s.incNumChunksToPersist(completedChunksCount)
}
// isDegraded returns whether the storage is in "graceful degradation mode",
// which is the case if the number of chunks waiting for persistence has reached
// a percentage of maxChunksToPersist that exceeds
// percentChunksToPersistForDegradation. The method is not goroutine safe (but
// only ever called from the goroutine dealing with series maintenance).
// Changes of degradation mode are logged.
func (s *memorySeriesStorage) isDegraded() bool {
    nowDegraded := s.getNumChunksToPersist() > s.maxChunksToPersist*percentChunksToPersistForDegradation/100
    if s.degraded && !nowDegraded {
        log.Warn("Storage has left graceful degradation mode. Things are back to normal.")
    } else if !s.degraded && nowDegraded {
        log.Warnf(
            "%d chunks waiting for persistence (%d%% of the allowed maximum %d). Storage is now in graceful degradation mode. Series files are not synced anymore if following the adaptive strategy. Checkpoints are not performed more often than every %v. Series maintenance happens as frequently as possible.",
            s.getNumChunksToPersist(),
            s.getNumChunksToPersist()*100/s.maxChunksToPersist,
            s.maxChunksToPersist,
            s.checkpointInterval,
        )
    }
    s.degraded = nowDegraded
    return s.degraded
}
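// isDegraded above compares the current backlog against a fixed percentage of
// maxChunksToPersist using integer arithmetic. A tiny sketch of that threshold
// check with assumed example values; the real percentChunksToPersistForDegradation
// and maxChunksToPersist are storage constants/configuration not shown here.
package main

import "fmt"

func main() {
    const (
        maxChunksToPersist                   = 524288 // assumed example value
        percentChunksToPersistForDegradation = 80     // assumed example value
    )
    numChunksToPersist := 450000

    // Same arithmetic as in isDegraded: degraded once the backlog exceeds
    // 80% of the allowed maximum.
    threshold := maxChunksToPersist * percentChunksToPersistForDegradation / 100
    degraded := numChunksToPersist > threshold
    fmt.Printf("backlog=%d threshold=%d degraded=%v\n", numChunksToPersist, threshold, degraded)
}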
// Append implements Storage.
func (s *memorySeriesStorage) Append(sample *clientmodel.Sample) {
    for ln, lv := range sample.Metric {
        if len(lv) == 0 {
            delete(sample.Metric, ln)
        }
    }
    if s.getNumChunksToPersist() >= s.maxChunksToPersist {
        log.Warnf(
            "%d chunks waiting for persistence, sample ingestion suspended.",
            s.getNumChunksToPersist(),
        )
        for s.getNumChunksToPersist() >= s.maxChunksToPersist {
            time.Sleep(time.Second)
        }
        log.Warn("Sample ingestion resumed.")
    }
    rawFP := sample.Metric.FastFingerprint()
    s.fpLocker.Lock(rawFP)
    fp, err := s.mapper.mapFP(rawFP, sample.Metric)
    if err != nil {
        log.Errorf("Error while mapping fingerprint %v: %v", rawFP, err)
        s.persistence.setDirty(true)
    }
    if fp != rawFP {
        // Switch locks.
        s.fpLocker.Unlock(rawFP)
        s.fpLocker.Lock(fp)
    }
    series := s.getOrCreateSeries(fp, sample.Metric)

    if sample.Timestamp <= series.lastTime {
        s.fpLocker.Unlock(fp)
        log.Warnf("Ignoring sample with out-of-order timestamp for fingerprint %v (%v): %v is not after %v", fp, series.metric, sample.Timestamp, series.lastTime)
        s.outOfOrderSamplesCount.Inc()
        return
    }
    completedChunksCount := series.add(&metric.SamplePair{
        Value:     sample.Value,
        Timestamp: sample.Timestamp,
    })
    s.fpLocker.Unlock(fp)
    s.ingestedSamplesCount.Inc()
    s.incNumChunksToPersist(completedChunksCount)
}
// Run dispatches notifications continuously.
func (n *NotificationHandler) Run() {
    for reqs := range n.pendingNotifications {
        if n.alertmanagerURL == "" {
            log.Warn("No alert manager configured, not dispatching notification")
            n.notificationDropped.Inc()
            continue
        }

        begin := time.Now()
        err := n.sendNotifications(reqs)
        if err != nil {
            log.Error("Error sending notification: ", err)
            n.notificationErrors.Inc()
        }

        n.notificationLatency.Observe(float64(time.Since(begin) / time.Millisecond))
    }
    close(n.stopped)
}
// fetch scrapes the monitor statistics of every slave URL received on urlChan
// and sends the resulting metrics on metricsChan.
func (e *periodicExporter) fetch(urlChan <-chan string, metricsChan chan<- prometheus.Metric, wg *sync.WaitGroup) {
    defer wg.Done()

    for u := range urlChan {
        u, err := url.Parse(u)
        if err != nil {
            log.Warn("could not parse slave URL: ", err)
            continue
        }

        host, _, err := net.SplitHostPort(u.Host)
        if err != nil {
            log.Warn("could not parse network address: ", err)
            continue
        }

        monitorURL := fmt.Sprintf("%s/monitor/statistics.json", u)
        resp, err := httpClient.Get(monitorURL)
        if err != nil {
            log.Warn(err)
            e.errors.WithLabelValues(host).Inc()
            continue
        }
        defer resp.Body.Close()

        var stats []mesos_stats.Monitor
        if err = json.NewDecoder(resp.Body).Decode(&stats); err != nil {
            log.Warn("failed to deserialize response: ", err)
            e.errors.WithLabelValues(host).Inc()
            continue
        }

        for _, stat := range stats {
            metricsChan <- prometheus.MustNewConstMetric(
                cpuLimitDesc,
                prometheus.GaugeValue,
                float64(stat.Statistics.CpusLimit),
                stat.Source, host, stat.FrameworkId,
            )
            metricsChan <- prometheus.MustNewConstMetric(
                cpuSysDesc,
                prometheus.CounterValue,
                float64(stat.Statistics.CpusSystemTimeSecs),
                stat.Source, host, stat.FrameworkId,
            )
            metricsChan <- prometheus.MustNewConstMetric(
                cpuUsrDesc,
                prometheus.CounterValue,
                float64(stat.Statistics.CpusUserTimeSecs),
                stat.Source, host, stat.FrameworkId,
            )
            metricsChan <- prometheus.MustNewConstMetric(
                memLimitDesc,
                prometheus.GaugeValue,
                float64(stat.Statistics.MemLimitBytes),
                stat.Source, host, stat.FrameworkId,
            )
            metricsChan <- prometheus.MustNewConstMetric(
                memRssDesc,
                prometheus.GaugeValue,
                float64(stat.Statistics.MemRssBytes),
                stat.Source, host, stat.FrameworkId,
            )
        }
    }
}
// updateSlaves follows the master redirect to find the elected Mesos master,
// fetches its state, and records the URLs of all active slaves.
func (e *periodicExporter) updateSlaves() {
    log.Debug("discovering slaves...")

    // This will redirect us to the elected mesos master
    redirectURL := fmt.Sprintf("%s://%s/master/redirect", e.queryURL.Scheme, e.queryURL.Host)
    rReq, err := http.NewRequest("GET", redirectURL, nil)
    if err != nil {
        panic(err)
    }

    tr := http.Transport{
        DisableKeepAlives: true,
    }
    rresp, err := tr.RoundTrip(rReq)
    if err != nil {
        log.Warn(err)
        return
    }
    defer rresp.Body.Close()

    // This will/should return http://master.ip:5050
    masterLoc := rresp.Header.Get("Location")
    if masterLoc == "" {
        log.Warnf("%d response missing Location header", rresp.StatusCode)
        return
    }

    log.Debugf("current elected master at: %s", masterLoc)

    // Starting from 0.23.0, a Mesos Master does not set the scheme in the "Location" header.
    // Use the scheme from the master URL in this case.
    var stateURL string
    if strings.HasPrefix(masterLoc, "http") {
        stateURL = fmt.Sprintf("%s/master/state.json", masterLoc)
    } else {
        stateURL = fmt.Sprintf("%s:%s/master/state.json", e.queryURL.Scheme, masterLoc)
    }

    var state masterState

    // Find all active slaves
    err = getJSON(&state, stateURL)
    if err != nil {
        log.Warn(err)
        return
    }

    var slaveURLs []string
    for _, slave := range state.Slaves {
        if slave.Active {
            // Extract slave port from pid
            _, port, err := net.SplitHostPort(slave.PID)
            if err != nil {
                port = "5051"
            }
            url := fmt.Sprintf("http://%s:%s", slave.Hostname, port)

            slaveURLs = append(slaveURLs, url)
        }
    }

    log.Debugf("%d slaves discovered", len(slaveURLs))

    e.slaves.Lock()
    e.slaves.urls = slaveURLs
    e.slaves.Unlock()
}
// updateSlaves follows the master redirect to find the elected Mesos master,
// fetches its state, and records the URLs of all active slaves.
func (e *periodicExporter) updateSlaves() {
    log.Debug("discovering slaves...")

    // This will redirect us to the elected mesos master
    redirectURL := fmt.Sprintf("%s/master/redirect", e.opts.masterURL)
    rReq, err := http.NewRequest("GET", redirectURL, nil)
    if err != nil {
        panic(err)
    }

    tr := http.Transport{
        DisableKeepAlives: true,
    }
    rresp, err := tr.RoundTrip(rReq)
    if err != nil {
        log.Warn(err)
        return
    }
    defer rresp.Body.Close()

    // This will/should return http://master.ip:5050
    masterLoc := rresp.Header.Get("Location")
    if masterLoc == "" {
        log.Warnf("%d response missing Location header", rresp.StatusCode)
        return
    }

    log.Debugf("current elected master at: %s", masterLoc)

    // Find all active slaves
    stateURL := fmt.Sprintf("%s/master/state.json", masterLoc)
    resp, err := http.Get(stateURL)
    if err != nil {
        log.Warn(err)
        return
    }
    defer resp.Body.Close()

    type slave struct {
        Active   bool   `json:"active"`
        Hostname string `json:"hostname"`
        Pid      string `json:"pid"`
    }

    var req struct {
        Slaves []*slave `json:"slaves"`
    }

    if err := json.NewDecoder(resp.Body).Decode(&req); err != nil {
        log.Warnf("failed to deserialize request: %s", err)
        return
    }

    var slaveURLs []string
    for _, slave := range req.Slaves {
        if slave.Active {
            // Extract slave port from pid
            _, port, err := net.SplitHostPort(slave.Pid)
            if err != nil {
                port = "5051"
            }
            url := fmt.Sprintf("http://%s:%s", slave.Hostname, port)

            slaveURLs = append(slaveURLs, url)
        }
    }

    log.Debugf("%d slaves discovered", len(slaveURLs))

    e.slaves.Lock()
    e.slaves.urls = slaveURLs
    e.slaves.Unlock()
}
// fetch scrapes the task statistics and the metrics snapshot of every slave
// URL received on urlChan and sends the resulting metrics on metricsChan.
func (e *periodicExporter) fetch(urlChan <-chan string, metricsChan chan<- prometheus.Metric, wg *sync.WaitGroup) {
    defer wg.Done()

    for u := range urlChan {
        u, err := url.Parse(u)
        if err != nil {
            log.Warn("could not parse slave URL: ", err)
            continue
        }

        host, _, err := net.SplitHostPort(u.Host)
        if err != nil {
            log.Warn("could not parse network address: ", err)
            continue
        }

        taskInfo := map[string]exporterTaskInfo{}
        var state slaveState
        stateURL := fmt.Sprintf("%s/state.json", u)

        err = getJSON(&state, stateURL)
        if err != nil {
            log.Warn(err)
            e.errors.WithLabelValues(host).Inc()
            continue
        }

        for _, fw := range state.Frameworks {
            for _, ex := range fw.Executors {
                for _, t := range ex.Tasks {
                    taskInfo[t.ID] = exporterTaskInfo{fw.Name, t.Name}
                }
            }
        }

        monitorURL := fmt.Sprintf("%s/monitor/statistics.json", u)
        var stats []mesos_stats.Monitor

        err = getJSON(&stats, monitorURL)
        if err != nil {
            log.Warn(err)
            e.errors.WithLabelValues(host).Inc()
            continue
        }

        for _, stat := range stats {
            tinfo, ok := taskInfo[stat.Source]
            if !ok {
                continue
            }
            metricsChan <- prometheus.MustNewConstMetric(
                cpuLimitDesc,
                prometheus.GaugeValue,
                float64(stat.Statistics.CpusLimit),
                stat.Source, host, stat.FrameworkId, tinfo.FrameworkName, tinfo.TaskName,
            )
            metricsChan <- prometheus.MustNewConstMetric(
                cpuSysDesc,
                prometheus.CounterValue,
                float64(stat.Statistics.CpusSystemTimeSecs),
                stat.Source, host, stat.FrameworkId, tinfo.FrameworkName, tinfo.TaskName,
            )
            metricsChan <- prometheus.MustNewConstMetric(
                cpuUsrDesc,
                prometheus.CounterValue,
                float64(stat.Statistics.CpusUserTimeSecs),
                stat.Source, host, stat.FrameworkId, tinfo.FrameworkName, tinfo.TaskName,
            )
            metricsChan <- prometheus.MustNewConstMetric(
                memLimitDesc,
                prometheus.GaugeValue,
                float64(stat.Statistics.MemLimitBytes),
                stat.Source, host, stat.FrameworkId, tinfo.FrameworkName, tinfo.TaskName,
            )
            metricsChan <- prometheus.MustNewConstMetric(
                memRssDesc,
                prometheus.GaugeValue,
                float64(stat.Statistics.MemRssBytes),
                stat.Source, host, stat.FrameworkId, tinfo.FrameworkName, tinfo.TaskName,
            )
        }

        metricsSnapshotURL := fmt.Sprintf("%s/metrics/snapshot", u)
        var ms metricsSnapshot

        err = getJSON(&ms, metricsSnapshotURL)
        if err != nil {
            log.Warn(err)
            e.errors.WithLabelValues(host).Inc()
            continue
        }

        for _, mm := range slaveMetrics {
            metricValue, ok := ms[mm.snapshotKey]
            if !ok {
                continue
            }

            if mm.convertFn != nil {
                metricValue = mm.convertFn(metricValue)
            }

            metricsChan <- prometheus.MustNewConstMetric(
                mm.desc, mm.valueType, metricValue, host,
            )
        }
    }
}
// scrapeMaster collects per-framework resource usage and the metrics snapshot
// from the elected master and stores the resulting metrics on the exporter.
func (e *periodicExporter) scrapeMaster() {
    stateURL := fmt.Sprintf("%s://%s/master/state.json", e.queryURL.Scheme, e.queryURL.Host)

    log.Debugf("Scraping master at %s", stateURL)

    var state masterState

    err := getJSON(&state, stateURL)
    if err != nil {
        log.Warn(err)
        return
    }

    metrics := []prometheus.Metric{}

    for _, fw := range state.Frameworks {
        metrics = append(metrics, prometheus.MustNewConstMetric(
            frameworkResourcesUsedCPUs,
            prometheus.GaugeValue,
            fw.UsedResources.CPUs,
            fw.ID, fw.Name,
        ))
        metrics = append(metrics, prometheus.MustNewConstMetric(
            frameworkResourcesUsedDisk,
            prometheus.GaugeValue,
            megabytesToBytes(fw.UsedResources.Disk),
            fw.ID, fw.Name,
        ))
        metrics = append(metrics, prometheus.MustNewConstMetric(
            frameworkResourcesUsedMemory,
            prometheus.GaugeValue,
            megabytesToBytes(fw.UsedResources.Mem),
            fw.ID, fw.Name,
        ))
    }

    snapshotURL := fmt.Sprintf("%s://%s/metrics/snapshot", e.queryURL.Scheme, e.queryURL.Host)

    var ms metricsSnapshot

    err = getJSON(&ms, snapshotURL)
    if err != nil {
        log.Warn(err)
        return
    }

    for _, mm := range masterMetrics {
        metricValue, ok := ms[mm.snapshotKey]
        if !ok {
            continue
        }

        if mm.convertFn != nil {
            metricValue = mm.convertFn(metricValue)
        }

        metrics = append(metrics, prometheus.MustNewConstMetric(
            mm.desc, mm.valueType, metricValue, state.Hostname,
        ))
    }

    e.Lock()
    e.metrics = metrics
    e.Unlock()
}
// loadSeriesMapAndHeads loads the fingerprint to memory-series mapping and all
// the chunks contained in the checkpoint (and thus not yet persisted to series
// files). The method is capable of loading the checkpoint format v1 and v2. If
// recoverable corruption is detected, or if the dirty flag was set from the
// beginning, crash recovery is run, which might take a while. If an
// unrecoverable error is encountered, it is returned. Call this method during
// start-up while nothing else is running in storage land. This method is
// utterly goroutine-unsafe.
func (p *persistence) loadSeriesMapAndHeads() (sm *seriesMap, chunksToPersist int64, err error) {
    var chunkDescsTotal int64
    fingerprintToSeries := make(map[clientmodel.Fingerprint]*memorySeries)
    sm = &seriesMap{m: fingerprintToSeries}

    defer func() {
        if sm != nil && p.dirty {
            log.Warn("Persistence layer appears dirty.")
            err = p.recoverFromCrash(fingerprintToSeries)
            if err != nil {
                sm = nil
            }
        }
        if err == nil {
            numMemChunkDescs.Add(float64(chunkDescsTotal))
        }
    }()

    f, err := os.Open(p.headsFileName())
    if os.IsNotExist(err) {
        return sm, 0, nil
    }
    if err != nil {
        log.Warn("Could not open heads file:", err)
        p.dirty = true
        return
    }
    defer f.Close()
    r := bufio.NewReaderSize(f, fileBufSize)

    buf := make([]byte, len(headsMagicString))
    if _, err := io.ReadFull(r, buf); err != nil {
        log.Warn("Could not read from heads file:", err)
        p.dirty = true
        return sm, 0, nil
    }
    magic := string(buf)
    if magic != headsMagicString {
        log.Warnf(
            "unexpected magic string, want %q, got %q",
            headsMagicString, magic,
        )
        p.dirty = true
        return
    }
    version, err := binary.ReadVarint(r)
    if (version != headsFormatVersion && version != headsFormatLegacyVersion) || err != nil {
        log.Warnf("unknown heads format version, want %d", headsFormatVersion)
        p.dirty = true
        return sm, 0, nil
    }
    numSeries, err := codable.DecodeUint64(r)
    if err != nil {
        log.Warn("Could not decode number of series:", err)
        p.dirty = true
        return sm, 0, nil
    }

    for ; numSeries > 0; numSeries-- {
        seriesFlags, err := r.ReadByte()
        if err != nil {
            log.Warn("Could not read series flags:", err)
            p.dirty = true
            return sm, chunksToPersist, nil
        }
        headChunkPersisted := seriesFlags&flagHeadChunkPersisted != 0
        fp, err := codable.DecodeUint64(r)
        if err != nil {
            log.Warn("Could not decode fingerprint:", err)
            p.dirty = true
            return sm, chunksToPersist, nil
        }
        var metric codable.Metric
        if err := metric.UnmarshalFromReader(r); err != nil {
            log.Warn("Could not decode metric:", err)
            p.dirty = true
            return sm, chunksToPersist, nil
        }
        var persistWatermark int64
        var modTime time.Time
        if version != headsFormatLegacyVersion {
            // persistWatermark only present in v2.
            persistWatermark, err = binary.ReadVarint(r)
            if err != nil {
                log.Warn("Could not decode persist watermark:", err)
                p.dirty = true
                return sm, chunksToPersist, nil
            }
            modTimeNano, err := binary.ReadVarint(r)
            if err != nil {
                log.Warn("Could not decode modification time:", err)
                p.dirty = true
                return sm, chunksToPersist, nil
            }
            if modTimeNano != -1 {
                modTime = time.Unix(0, modTimeNano)
            }
        }
        chunkDescsOffset, err := binary.ReadVarint(r)
        if err != nil {
            log.Warn("Could not decode chunk descriptor offset:", err)
            p.dirty = true
            return sm, chunksToPersist, nil
        }
        savedFirstTime, err := binary.ReadVarint(r)
        if err != nil {
            log.Warn("Could not decode saved first time:", err)
            p.dirty = true
            return sm, chunksToPersist, nil
        }
        numChunkDescs, err := binary.ReadVarint(r)
        if err != nil {
            log.Warn("Could not decode number of chunk descriptors:", err)
            p.dirty = true
            return sm, chunksToPersist, nil
        }
        chunkDescs := make([]*chunkDesc, numChunkDescs)
        if version == headsFormatLegacyVersion {
            if headChunkPersisted {
                persistWatermark = numChunkDescs
            } else {
                persistWatermark = numChunkDescs - 1
            }
        }

        for i := int64(0); i < numChunkDescs; i++ {
            if i < persistWatermark {
                firstTime, err := binary.ReadVarint(r)
                if err != nil {
                    log.Warn("Could not decode first time:", err)
                    p.dirty = true
                    return sm, chunksToPersist, nil
                }
                lastTime, err := binary.ReadVarint(r)
                if err != nil {
                    log.Warn("Could not decode last time:", err)
                    p.dirty = true
                    return sm, chunksToPersist, nil
                }
                chunkDescs[i] = &chunkDesc{
                    chunkFirstTime: clientmodel.Timestamp(firstTime),
                    chunkLastTime:  clientmodel.Timestamp(lastTime),
                }
                chunkDescsTotal++
            } else {
                // Non-persisted chunk.
                encoding, err := r.ReadByte()
                if err != nil {
                    log.Warn("Could not decode chunk type:", err)
                    p.dirty = true
                    return sm, chunksToPersist, nil
                }
                chunk := newChunkForEncoding(chunkEncoding(encoding))
                if err := chunk.unmarshal(r); err != nil {
                    log.Warn("Could not decode chunk:", err)
                    p.dirty = true
                    return sm, chunksToPersist, nil
                }
                chunkDescs[i] = newChunkDesc(chunk)
                chunksToPersist++
            }
        }

        fingerprintToSeries[clientmodel.Fingerprint(fp)] = &memorySeries{
            metric:           clientmodel.Metric(metric),
            chunkDescs:       chunkDescs,
            persistWatermark: int(persistWatermark),
            modTime:          modTime,
            chunkDescsOffset: int(chunkDescsOffset),
            savedFirstTime:   clientmodel.Timestamp(savedFirstTime),
            headChunkClosed:  persistWatermark >= numChunkDescs,
        }
    }
    return sm, chunksToPersist, nil
}
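// loadSeriesMapAndHeads above decodes the heads checkpoint as a sequence of
// signed varints read through a bufio.Reader (binary.ReadVarint requires an
// io.ByteReader). A minimal round-trip sketch of that encoding technique with
// made-up field values; it does not reproduce the real checkpoint layout.
package main

import (
    "bufio"
    "bytes"
    "encoding/binary"
    "fmt"
)

func main() {
    // Encode two illustrative fields, e.g. a persist watermark and a timestamp.
    buf := make([]byte, binary.MaxVarintLen64)
    var out bytes.Buffer
    for _, v := range []int64{3, 1462290188000} {
        n := binary.PutVarint(buf, v)
        out.Write(buf[:n])
    }

    // Decode them back in order, exactly as the checkpoint loader does.
    r := bufio.NewReader(&out)
    for i := 0; i < 2; i++ {
        v, err := binary.ReadVarint(r)
        if err != nil {
            fmt.Println("decode error:", err)
            return
        }
        fmt.Println(v)
    }
}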
// main starts the Alertmanager: it loads configuration and silences, starts
// the alert manager and web service, watches the configuration file for
// changes, and then runs the notification dispatcher.
func main() {
    flag.Parse()

    if !strings.HasPrefix(*pathPrefix, "/") {
        *pathPrefix = "/" + *pathPrefix
    }
    if !strings.HasSuffix(*pathPrefix, "/") {
        *pathPrefix = *pathPrefix + "/"
    }

    versionInfoTmpl.Execute(os.Stdout, BuildInfo)

    conf := config.MustLoadFromFile(*configFile)

    silencer := manager.NewSilencer()
    defer silencer.Close()

    err := silencer.LoadFromFile(*silencesFile)
    if err != nil {
        log.Warn("Couldn't load silences, starting up with empty silence list: ", err)
    }

    saveSilencesTicker := time.NewTicker(10 * time.Second)
    go func() {
        for range saveSilencesTicker.C {
            if err := silencer.SaveToFile(*silencesFile); err != nil {
                log.Error("Error saving silences to file: ", err)
            }
        }
    }()
    defer saveSilencesTicker.Stop()

    amURL, err := alertmanagerURL(*hostname, *pathPrefix, *listenAddress, *externalURL)
    if err != nil {
        log.Fatalln("Error building Alertmanager URL:", err)
    }
    notifier := manager.NewNotifier(conf.NotificationConfig, amURL)
    defer notifier.Close()

    inhibitor := new(manager.Inhibitor)
    inhibitor.SetInhibitRules(conf.InhibitRules())

    options := &manager.MemoryAlertManagerOptions{
        Inhibitor:          inhibitor,
        Silencer:           silencer,
        Notifier:           notifier,
        MinRefreshInterval: *minRefreshPeriod,
    }
    alertManager := manager.NewMemoryAlertManager(options)
    alertManager.SetAggregationRules(conf.AggregationRules())
    go alertManager.Run()

    // Web initialization.
    flags := map[string]string{}
    flag.VisitAll(func(f *flag.Flag) {
        flags[f.Name] = f.Value.String()
    })

    statusHandler := &web.StatusHandler{
        Config:     conf.String(),
        Flags:      flags,
        BuildInfo:  BuildInfo,
        Birth:      time.Now(),
        PathPrefix: *pathPrefix,
    }

    webService := &web.WebService{
        // REST API Service.
        AlertManagerService: &api.AlertManagerService{
            Manager:    alertManager,
            Silencer:   silencer,
            PathPrefix: *pathPrefix,
        },
        // Template-based page handlers.
        AlertsHandler: &web.AlertsHandler{
            Manager:                alertManager,
            IsSilencedInterrogator: silencer,
            PathPrefix:             *pathPrefix,
        },
        SilencesHandler: &web.SilencesHandler{
            Silencer:   silencer,
            PathPrefix: *pathPrefix,
        },
        StatusHandler: statusHandler,
    }
    go webService.ServeForever(*listenAddress, *pathPrefix)

    // React to configuration changes.
    watcher := config.NewFileWatcher(*configFile)
    go watcher.Watch(func(conf *config.Config) {
        inhibitor.SetInhibitRules(conf.InhibitRules())
        notifier.SetNotificationConfigs(conf.NotificationConfig)
        alertManager.SetAggregationRules(conf.AggregationRules())
        statusHandler.UpdateConfig(conf.String())
    })

    log.Info("Running notification dispatcher...")
    notifier.Dispatch()
}
// recoverFromCrash is called by loadSeriesMapAndHeads if the persistence
// appears to be dirty after the loading (either because the loading resulted in
// an error or because the persistence was dirty from the start). Not goroutine
// safe. Only call before anything else is running (except index processing
// queue as started by newPersistence).
func (p *persistence) recoverFromCrash(fingerprintToSeries map[clientmodel.Fingerprint]*memorySeries) error {
    // TODO(beorn): We need proper tests for the crash recovery.
    log.Warn("Starting crash recovery. Prometheus is inoperational until complete.")
    log.Warn("To avoid crash recovery in future, shutdown Prometheus with SIGTERM or a HTTP POST to /-/quit.")

    fpsSeen := map[clientmodel.Fingerprint]struct{}{}
    count := 0
    seriesDirNameFmt := fmt.Sprintf("%%0%dx", seriesDirNameLen)

    // Delete the fingerprint mapping file as it might be stale or
    // corrupt. We'll rebuild the mappings as we go.
    os.Remove(p.mappingsFileName())
    // The mappings to rebuild.
    fpm := fpMappings{}

    log.Info("Scanning files.")
    for i := 0; i < 1<<(seriesDirNameLen*4); i++ {
        dirname := path.Join(p.basePath, fmt.Sprintf(seriesDirNameFmt, i))
        dir, err := os.Open(dirname)
        if os.IsNotExist(err) {
            continue
        }
        if err != nil {
            return err
        }
        defer dir.Close()
        for fis := []os.FileInfo{}; err != io.EOF; fis, err = dir.Readdir(1024) {
            if err != nil {
                return err
            }
            for _, fi := range fis {
                fp, ok := p.sanitizeSeries(dirname, fi, fingerprintToSeries, fpm)
                if ok {
                    fpsSeen[fp] = struct{}{}
                }
                count++
                if count%10000 == 0 {
                    log.Infof("%d files scanned.", count)
                }
            }
        }
    }
    log.Infof("File scan complete. %d series found.", len(fpsSeen))

    log.Info("Checking for series without series file.")
    for fp, s := range fingerprintToSeries {
        if _, seen := fpsSeen[fp]; !seen {
            // fp exists in fingerprintToSeries, but has no representation on disk.
            if s.persistWatermark == len(s.chunkDescs) {
                // Oops, everything including the head chunk was
                // already persisted, but nothing on disk.
                // Thus, we lost that series completely. Clean
                // up the remnants.
                delete(fingerprintToSeries, fp)
                if err := p.purgeArchivedMetric(fp); err != nil {
                    // Purging the archived metric didn't work, so try
                    // to unindex it, just in case it's in the indexes.
                    p.unindexMetric(fp, s.metric)
                }
                log.Warnf("Lost series detected: fingerprint %v, metric %v.", fp, s.metric)
                continue
            }
            // If we are here, the only chunks we have are the chunks in the checkpoint.
            // Adjust things accordingly.
            if s.persistWatermark > 0 || s.chunkDescsOffset != 0 {
                minLostChunks := s.persistWatermark + s.chunkDescsOffset
                if minLostChunks <= 0 {
                    log.Warnf(
                        "Possible loss of chunks for fingerprint %v, metric %v.",
                        fp, s.metric,
                    )
                } else {
                    log.Warnf(
                        "Lost at least %d chunks for fingerprint %v, metric %v.",
                        minLostChunks, fp, s.metric,
                    )
                }
                s.chunkDescs = append(
                    make([]*chunkDesc, 0, len(s.chunkDescs)-s.persistWatermark),
                    s.chunkDescs[s.persistWatermark:]...,
                )
                numMemChunkDescs.Sub(float64(s.persistWatermark))
                s.persistWatermark = 0
                s.chunkDescsOffset = 0
            }
            maybeAddMapping(fp, s.metric, fpm)
            fpsSeen[fp] = struct{}{} // Add so that fpsSeen is complete.
        }
    }
    log.Info("Check for series without series file complete.")

    if err := p.cleanUpArchiveIndexes(fingerprintToSeries, fpsSeen, fpm); err != nil {
        return err
    }
    if err := p.rebuildLabelIndexes(fingerprintToSeries); err != nil {
        return err
    }
    // Finally rewrite the mappings file if there are any mappings.
    if len(fpm) > 0 {
        if err := p.checkpointFPMappings(fpm); err != nil {
            return err
        }
    }

    p.setDirty(false)
    log.Warn("Crash recovery complete.")
    return nil
}
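// recoverFromCrash above walks each series directory in batches of 1024
// entries, calling Readdir until it reports io.EOF, so that huge directories
// never have to be held in memory at once. A standalone sketch of that batched
// directory scan; the scanned directory path is an illustrative assumption.
package main

import (
    "fmt"
    "io"
    "os"
)

func main() {
    dir, err := os.Open(".") // assumed example directory
    if err != nil {
        fmt.Println(err)
        return
    }
    defer dir.Close()

    count := 0
    for {
        fis, err := dir.Readdir(1024) // at most 1024 entries per call
        for _, fi := range fis {
            _ = fi.Name() // process one entry here
            count++
        }
        if err == io.EOF {
            break // directory exhausted
        }
        if err != nil {
            fmt.Println(err)
            return
        }
    }
    fmt.Println(count, "entries scanned")
}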
// Main manages the startup and shutdown lifecycle of the entire Prometheus server.
func Main() int {
    if err := parse(os.Args[1:]); err != nil {
        return 2
    }

    versionInfoTmpl.Execute(os.Stdout, BuildInfo)

    if cfg.printVersion {
        return 0
    }

    memStorage := local.NewMemorySeriesStorage(&cfg.storage)

    var (
        sampleAppender      storage.SampleAppender
        remoteStorageQueues []*remote.StorageQueueManager
    )
    if cfg.opentsdbURL == "" && cfg.influxdbURL == "" {
        log.Warnf("No remote storage URLs provided; not sending any samples to long-term storage")
        sampleAppender = memStorage
    } else {
        fanout := storage.Fanout{memStorage}

        addRemoteStorage := func(c remote.StorageClient) {
            qm := remote.NewStorageQueueManager(c, 100*1024)
            fanout = append(fanout, qm)
            remoteStorageQueues = append(remoteStorageQueues, qm)
        }

        if cfg.opentsdbURL != "" {
            addRemoteStorage(opentsdb.NewClient(cfg.opentsdbURL, cfg.remoteStorageTimeout))
        }
        if cfg.influxdbURL != "" {
            addRemoteStorage(influxdb.NewClient(cfg.influxdbURL, cfg.remoteStorageTimeout, cfg.influxdbDatabase, cfg.influxdbRetentionPolicy))
        }

        sampleAppender = fanout
    }

    var (
        notificationHandler = notification.NewNotificationHandler(&cfg.notification)
        targetManager       = retrieval.NewTargetManager(sampleAppender)
        queryEngine         = promql.NewEngine(memStorage, &cfg.queryEngine)
    )

    ruleManager := rules.NewManager(&rules.ManagerOptions{
        SampleAppender:      sampleAppender,
        NotificationHandler: notificationHandler,
        QueryEngine:         queryEngine,
        PrometheusURL:       cfg.prometheusURL,
        PathPrefix:          cfg.web.PathPrefix,
    })

    flags := map[string]string{}
    cfg.fs.VisitAll(func(f *flag.Flag) {
        flags[f.Name] = f.Value.String()
    })

    status := &web.PrometheusStatus{
        BuildInfo:   BuildInfo,
        TargetPools: targetManager.Pools,
        Rules:       ruleManager.Rules,
        Flags:       flags,
        Birth:       time.Now(),
    }

    webHandler := web.New(memStorage, queryEngine, ruleManager, status, &cfg.web)

    if !reloadConfig(cfg.configFile, status, targetManager, ruleManager) {
        os.Exit(1)
    }

    // Wait for reload or termination signals. Start the handler for SIGHUP as
    // early as possible, but ignore it until we are ready to handle reloading
    // our config.
    hup := make(chan os.Signal)
    hupReady := make(chan bool)
    signal.Notify(hup, syscall.SIGHUP)
    go func() {
        <-hupReady
        for range hup {
            reloadConfig(cfg.configFile, status, targetManager, ruleManager)
        }
    }()

    // Start all components.
    if err := memStorage.Start(); err != nil {
        log.Errorln("Error opening memory series storage:", err)
        return 1
    }
    defer func() {
        if err := memStorage.Stop(); err != nil {
            log.Errorln("Error stopping storage:", err)
        }
    }()

    // The storage has to be fully initialized before registering.
    registry.MustRegister(memStorage)
    registry.MustRegister(notificationHandler)

    for _, q := range remoteStorageQueues {
        registry.MustRegister(q)

        go q.Run()
        defer q.Stop()
    }

    go ruleManager.Run()
    defer ruleManager.Stop()

    go notificationHandler.Run()
    defer notificationHandler.Stop()

    go targetManager.Run()
    defer targetManager.Stop()

    defer queryEngine.Stop()

    go webHandler.Run()

    // Wait for reload or termination signals.
    close(hupReady) // Unblock SIGHUP handler.

    term := make(chan os.Signal)
    signal.Notify(term, os.Interrupt, syscall.SIGTERM)
    select {
    case <-term:
        log.Warn("Received SIGTERM, exiting gracefully...")
    case <-webHandler.Quit():
        log.Warn("Received termination request via web service, exiting gracefully...")
    }

    close(hup)

    log.Info("See you next time!")
    return 0
}
// Main manages the startup and shutdown lifecycle of the entire Prometheus server.
func Main() int {
    if err := parse(os.Args[1:]); err != nil {
        return 2
    }

    printVersion()
    if cfg.printVersion {
        return 0
    }

    var reloadables []Reloadable

    var (
        memStorage     = local.NewMemorySeriesStorage(&cfg.storage)
        remoteStorage  = remote.New(&cfg.remote)
        sampleAppender = storage.Fanout{memStorage}
    )
    if remoteStorage != nil {
        sampleAppender = append(sampleAppender, remoteStorage)
        reloadables = append(reloadables, remoteStorage)
    }

    var (
        notificationHandler = notification.NewNotificationHandler(&cfg.notification)
        targetManager       = retrieval.NewTargetManager(sampleAppender)
        queryEngine         = promql.NewEngine(memStorage, &cfg.queryEngine)
    )

    ruleManager := rules.NewManager(&rules.ManagerOptions{
        SampleAppender:      sampleAppender,
        NotificationHandler: notificationHandler,
        QueryEngine:         queryEngine,
        ExternalURL:         cfg.web.ExternalURL,
    })

    flags := map[string]string{}
    cfg.fs.VisitAll(func(f *flag.Flag) {
        flags[f.Name] = f.Value.String()
    })

    status := &web.PrometheusStatus{
        TargetPools: targetManager.Pools,
        Rules:       ruleManager.Rules,
        Flags:       flags,
        Birth:       time.Now(),
    }

    webHandler := web.New(memStorage, queryEngine, ruleManager, status, &cfg.web)

    reloadables = append(reloadables, status, targetManager, ruleManager, webHandler, notificationHandler)

    if !reloadConfig(cfg.configFile, reloadables...) {
        return 1
    }

    // Wait for reload or termination signals. Start the handler for SIGHUP as
    // early as possible, but ignore it until we are ready to handle reloading
    // our config.
    hup := make(chan os.Signal)
    hupReady := make(chan bool)
    signal.Notify(hup, syscall.SIGHUP)
    go func() {
        <-hupReady
        for {
            select {
            case <-hup:
            case <-webHandler.Reload():
            }
            reloadConfig(cfg.configFile, reloadables...)
        }
    }()

    // Start all components.
    if err := memStorage.Start(); err != nil {
        log.Errorln("Error opening memory series storage:", err)
        return 1
    }
    defer func() {
        if err := memStorage.Stop(); err != nil {
            log.Errorln("Error stopping storage:", err)
        }
    }()

    if remoteStorage != nil {
        prometheus.MustRegister(remoteStorage)

        go remoteStorage.Run()
        defer remoteStorage.Stop()
    }
    // The storage has to be fully initialized before registering.
    prometheus.MustRegister(memStorage)
    prometheus.MustRegister(notificationHandler)
    prometheus.MustRegister(configSuccess)
    prometheus.MustRegister(configSuccessTime)

    go ruleManager.Run()
    defer ruleManager.Stop()

    go notificationHandler.Run()
    defer notificationHandler.Stop()

    go targetManager.Run()
    defer targetManager.Stop()

    defer queryEngine.Stop()

    go webHandler.Run()

    // Wait for reload or termination signals.
    close(hupReady) // Unblock SIGHUP handler.

    term := make(chan os.Signal)
    signal.Notify(term, os.Interrupt, syscall.SIGTERM)
    select {
    case <-term:
        log.Warn("Received SIGTERM, exiting gracefully...")
    case <-webHandler.Quit():
        log.Warn("Received termination request via web service, exiting gracefully...")
    case err := <-webHandler.ListenError():
        log.Errorln("Error starting web server, exiting gracefully:", err)
    }

    log.Info("See you next time!")
    return 0
}
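// Both Main variants above install the SIGHUP handler early but gate it on a
// "ready" channel, so reload requests are ignored until initialization has
// finished. A minimal, self-contained sketch of that gating pattern;
// reloadConfig here is a hypothetical stand-in for the real reload function.
package main

import (
    "fmt"
    "os"
    "os/signal"
    "syscall"
    "time"
)

func reloadConfig() { fmt.Println("reloading configuration") } // hypothetical stand-in

func main() {
    hup := make(chan os.Signal, 1)
    hupReady := make(chan bool)
    signal.Notify(hup, syscall.SIGHUP)

    go func() {
        <-hupReady // ignore SIGHUP until start-up is complete
        for range hup {
            reloadConfig()
        }
    }()

    // ... start components here ...
    close(hupReady) // from now on, SIGHUP triggers a reload

    // Deliver a SIGHUP to this process to demonstrate the handler (Unix only).
    p, _ := os.FindProcess(os.Getpid())
    p.Signal(syscall.SIGHUP)
    time.Sleep(100 * time.Millisecond)
}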