Пример #1
0
func (h Handler) ServeHTTP(w http.ResponseWriter, r *http.Request) {
	ctx := route.Context(r)

	name := strings.Trim(route.Param(ctx, "filepath"), "/")
	if name == "" {
		name = "index.html"
	}

	file, err := GetFile(StaticFiles, name)
	if err != nil {
		if err != io.EOF {
			log.Warn("Could not get file: ", err)
		}
		w.WriteHeader(http.StatusNotFound)
		return
	}
	contentType := http.DetectContentType(file)
	if strings.Contains(contentType, "text/plain") || strings.Contains(contentType, "application/octet-stream") {
		parts := strings.Split(name, ".")
		contentType = mimeMap[parts[len(parts)-1]]
	}
	w.Header().Set("Content-Type", contentType)
	w.Header().Set("Cache-Control", "public, max-age=259200")
	w.Write(file)
}
Пример #2
0
// Append implements Storage.
func (s *memorySeriesStorage) Append(sample *clientmodel.Sample) {
	if s.getNumChunksToPersist() >= s.maxChunksToPersist {
		log.Warnf(
			"%d chunks waiting for persistence, sample ingestion suspended.",
			s.getNumChunksToPersist(),
		)
		for s.getNumChunksToPersist() >= s.maxChunksToPersist {
			time.Sleep(time.Second)
		}
		log.Warn("Sample ingestion resumed.")
	}
	rawFP := sample.Metric.FastFingerprint()
	s.fpLocker.Lock(rawFP)
	fp, err := s.mapper.mapFP(rawFP, sample.Metric)
	if err != nil {
		log.Errorf("Error while mapping fingerprint %v: %v", rawFP, err)
		s.persistence.setDirty(true)
	}
	if fp != rawFP {
		// Switch locks.
		s.fpLocker.Unlock(rawFP)
		s.fpLocker.Lock(fp)
	}
	series := s.getOrCreateSeries(fp, sample.Metric)
	completedChunksCount := series.add(&metric.SamplePair{
		Value:     sample.Value,
		Timestamp: sample.Timestamp,
	})
	s.fpLocker.Unlock(fp)
	s.ingestedSamplesCount.Inc()
	s.incNumChunksToPersist(completedChunksCount)
}
Пример #3
0
func (n *notifier) handleNotification(a *Alert, op notificationOp, config *pb.NotificationConfig) {
	for _, pdConfig := range config.PagerdutyConfig {
		if err := n.sendPagerDutyNotification(pdConfig.GetServiceKey(), op, a); err != nil {
			log.Errorln("Error sending PagerDuty notification:", err)
		}
	}
	for _, emailConfig := range config.EmailConfig {
		if op == notificationOpResolve && !emailConfig.GetSendResolved() {
			continue
		}
		if *smtpSmartHost == "" {
			log.Warn("No SMTP smarthost configured, not sending email notification.")
			continue
		}
		if err := n.sendEmailNotification(emailConfig.GetEmail(), op, a); err != nil {
			log.Errorln("Error sending email notification:", err)
		}
	}
	for _, poConfig := range config.PushoverConfig {
		if op == notificationOpResolve && !poConfig.GetSendResolved() {
			continue
		}
		if err := n.sendPushoverNotification(poConfig.GetToken(), op, poConfig.GetUserKey(), a); err != nil {
			log.Errorln("Error sending Pushover notification:", err)
		}
	}
	for _, hcConfig := range config.HipchatConfig {
		if op == notificationOpResolve && !hcConfig.GetSendResolved() {
			continue
		}
		if err := n.sendHipChatNotification(op, hcConfig, a); err != nil {
			log.Errorln("Error sending HipChat notification:", err)
		}
	}
	for _, scConfig := range config.SlackConfig {
		if op == notificationOpResolve && !scConfig.GetSendResolved() {
			continue
		}
		if err := n.sendSlackNotification(op, scConfig, a); err != nil {
			log.Errorln("Error sending Slack notification:", err)
		}
	}
	for _, fdConfig := range config.FlowdockConfig {
		if op == notificationOpResolve && !fdConfig.GetSendResolved() {
			continue
		}
		if err := n.sendFlowdockNotification(op, fdConfig, a); err != nil {
			log.Errorln("Error sending Flowdock notification:", err)
		}
	}
	for _, whConfig := range config.WebhookConfig {
		if op == notificationOpResolve && !whConfig.GetSendResolved() {
			continue
		}
		if err := n.sendWebhookNotification(op, whConfig, a); err != nil {
			log.Errorln("Error sending Webhook notification:", err)
		}
	}
}
Пример #4
0
// Append queues a sample to be sent to the remote storage. It drops the
// sample on the floor if the queue is full. It implements
// storage.SampleAppender.
func (t *StorageQueueManager) Append(s *clientmodel.Sample) {
	select {
	case t.queue <- s:
	default:
		t.samplesCount.WithLabelValues(dropped).Inc()
		log.Warn("Remote storage queue full, discarding sample.")
	}
}
Пример #5
0
// Append implements Storage.
func (s *memorySeriesStorage) Append(sample *model.Sample) {
	for ln, lv := range sample.Metric {
		if len(lv) == 0 {
			delete(sample.Metric, ln)
		}
	}
	if s.getNumChunksToPersist() >= s.maxChunksToPersist {
		log.Warnf(
			"%d chunks waiting for persistence, sample ingestion suspended.",
			s.getNumChunksToPersist(),
		)
		for s.getNumChunksToPersist() >= s.maxChunksToPersist {
			time.Sleep(time.Second)
		}
		log.Warn("Sample ingestion resumed.")
	}
	rawFP := sample.Metric.FastFingerprint()
	s.fpLocker.Lock(rawFP)
	fp, err := s.mapper.mapFP(rawFP, sample.Metric)
	if err != nil {
		log.Errorf("Error while mapping fingerprint %v: %v", rawFP, err)
		s.persistence.setDirty(true)
	}
	if fp != rawFP {
		// Switch locks.
		s.fpLocker.Unlock(rawFP)
		s.fpLocker.Lock(fp)
	}
	series := s.getOrCreateSeries(fp, sample.Metric)

	if sample.Timestamp <= series.lastTime {
		// Don't log and track equal timestamps, as they are a common occurrence
		// when using client-side timestamps (e.g. Pushgateway or federation).
		// It would be even better to also compare the sample values here, but
		// we don't have efficient access to a series's last value.
		if sample.Timestamp != series.lastTime {
			log.Warnf("Ignoring sample with out-of-order timestamp for fingerprint %v (%v): %v is not after %v", fp, series.metric, sample.Timestamp, series.lastTime)
			s.outOfOrderSamplesCount.Inc()
		}
		s.fpLocker.Unlock(fp)
		return
	}
	completedChunksCount := series.add(&model.SamplePair{
		Value:     sample.Value,
		Timestamp: sample.Timestamp,
	})
	s.fpLocker.Unlock(fp)
	s.ingestedSamplesCount.Inc()
	s.incNumChunksToPersist(completedChunksCount)
}
Пример #6
0
// isDegraded returns whether the storage is in "graceful degradation mode",
// which is the case if the number of chunks waiting for persistence has reached
// a percentage of maxChunksToPersist that exceeds
// percentChunksToPersistForDegradation. The method is not goroutine safe (but
// only ever called from the goroutine dealing with series maintenance).
// Changes of degradation mode are logged.
func (s *memorySeriesStorage) isDegraded() bool {
	nowDegraded := s.getNumChunksToPersist() > s.maxChunksToPersist*percentChunksToPersistForDegradation/100
	if s.degraded && !nowDegraded {
		log.Warn("Storage has left graceful degradation mode. Things are back to normal.")
	} else if !s.degraded && nowDegraded {
		log.Warnf(
			"%d chunks waiting for persistence (%d%% of the allowed maximum %d). Storage is now in graceful degradation mode. Series files are not synced anymore if following the adaptive strategy. Checkpoints are not performed more often than every %v. Series maintenance happens as frequently as possible.",
			s.getNumChunksToPersist(),
			s.getNumChunksToPersist()*100/s.maxChunksToPersist,
			s.maxChunksToPersist,
			s.checkpointInterval)
	}
	s.degraded = nowDegraded
	return s.degraded
}
Пример #7
0
// Append implements Storage.
func (s *memorySeriesStorage) Append(sample *clientmodel.Sample) {
	for ln, lv := range sample.Metric {
		if len(lv) == 0 {
			delete(sample.Metric, ln)
		}
	}
	if s.getNumChunksToPersist() >= s.maxChunksToPersist {
		log.Warnf(
			"%d chunks waiting for persistence, sample ingestion suspended.",
			s.getNumChunksToPersist(),
		)
		for s.getNumChunksToPersist() >= s.maxChunksToPersist {
			time.Sleep(time.Second)
		}
		log.Warn("Sample ingestion resumed.")
	}
	rawFP := sample.Metric.FastFingerprint()
	s.fpLocker.Lock(rawFP)
	fp, err := s.mapper.mapFP(rawFP, sample.Metric)
	if err != nil {
		log.Errorf("Error while mapping fingerprint %v: %v", rawFP, err)
		s.persistence.setDirty(true)
	}
	if fp != rawFP {
		// Switch locks.
		s.fpLocker.Unlock(rawFP)
		s.fpLocker.Lock(fp)
	}
	series := s.getOrCreateSeries(fp, sample.Metric)

	if sample.Timestamp <= series.lastTime {
		s.fpLocker.Unlock(fp)
		log.Warnf("Ignoring sample with out-of-order timestamp for fingerprint %v (%v): %v is not after %v", fp, series.metric, sample.Timestamp, series.lastTime)
		s.outOfOrderSamplesCount.Inc()
		return
	}
	completedChunksCount := series.add(&metric.SamplePair{
		Value:     sample.Value,
		Timestamp: sample.Timestamp,
	})
	s.fpLocker.Unlock(fp)
	s.ingestedSamplesCount.Inc()
	s.incNumChunksToPersist(completedChunksCount)
}
Пример #8
0
// Run dispatches notifications continuously.
func (n *NotificationHandler) Run() {
	for reqs := range n.pendingNotifications {
		if n.alertmanagerURL == "" {
			log.Warn("No alert manager configured, not dispatching notification")
			n.notificationDropped.Inc()
			continue
		}

		begin := time.Now()
		err := n.sendNotifications(reqs)

		if err != nil {
			log.Error("Error sending notification: ", err)
			n.notificationErrors.Inc()
		}

		n.notificationLatency.Observe(float64(time.Since(begin) / time.Millisecond))
	}
	close(n.stopped)
}
Пример #9
0
func (e *periodicExporter) fetch(urlChan <-chan string, metricsChan chan<- prometheus.Metric, wg *sync.WaitGroup) {
	defer wg.Done()

	for u := range urlChan {
		u, err := url.Parse(u)
		if err != nil {
			log.Warn("could not parse slave URL: ", err)
			continue
		}

		host, _, err := net.SplitHostPort(u.Host)
		if err != nil {
			log.Warn("could not parse network address: ", err)
			continue
		}

		monitorURL := fmt.Sprintf("%s/monitor/statistics.json", u)
		resp, err := httpClient.Get(monitorURL)
		if err != nil {
			log.Warn(err)
			e.errors.WithLabelValues(host).Inc()
			continue
		}
		defer resp.Body.Close()

		var stats []mesos_stats.Monitor
		if err = json.NewDecoder(resp.Body).Decode(&stats); err != nil {
			log.Warn("failed to deserialize response: ", err)
			e.errors.WithLabelValues(host).Inc()
			continue
		}

		for _, stat := range stats {
			metricsChan <- prometheus.MustNewConstMetric(
				cpuLimitDesc,
				prometheus.GaugeValue,
				float64(stat.Statistics.CpusLimit),
				stat.Source, host, stat.FrameworkId,
			)
			metricsChan <- prometheus.MustNewConstMetric(
				cpuSysDesc,
				prometheus.CounterValue,
				float64(stat.Statistics.CpusSystemTimeSecs),
				stat.Source, host, stat.FrameworkId,
			)
			metricsChan <- prometheus.MustNewConstMetric(
				cpuUsrDesc,
				prometheus.CounterValue,
				float64(stat.Statistics.CpusUserTimeSecs),
				stat.Source, host, stat.FrameworkId,
			)
			metricsChan <- prometheus.MustNewConstMetric(
				memLimitDesc,
				prometheus.GaugeValue,
				float64(stat.Statistics.MemLimitBytes),
				stat.Source, host, stat.FrameworkId,
			)
			metricsChan <- prometheus.MustNewConstMetric(
				memRssDesc,
				prometheus.GaugeValue,
				float64(stat.Statistics.MemRssBytes),
				stat.Source, host, stat.FrameworkId,
			)
		}
	}
}
Пример #10
0
func (e *periodicExporter) updateSlaves() {
	log.Debug("discovering slaves...")

	// This will redirect us to the elected mesos master
	redirectURL := fmt.Sprintf("%s://%s/master/redirect", e.queryURL.Scheme, e.queryURL.Host)
	rReq, err := http.NewRequest("GET", redirectURL, nil)
	if err != nil {
		panic(err)
	}

	tr := http.Transport{
		DisableKeepAlives: true,
	}
	rresp, err := tr.RoundTrip(rReq)
	if err != nil {
		log.Warn(err)
		return
	}
	defer rresp.Body.Close()

	// This will/should return http://master.ip:5050
	masterLoc := rresp.Header.Get("Location")
	if masterLoc == "" {
		log.Warnf("%d response missing Location header", rresp.StatusCode)
		return
	}

	log.Debugf("current elected master at: %s", masterLoc)

	// Starting from 0.23.0, a Mesos Master does not set the scheme in the "Location" header.
	// Use the scheme from the master URL in this case.
	var stateURL string
	if strings.HasPrefix(masterLoc, "http") {
		stateURL = fmt.Sprintf("%s/master/state.json", masterLoc)
	} else {
		stateURL = fmt.Sprintf("%s:%s/master/state.json", e.queryURL.Scheme, masterLoc)
	}

	var state masterState

	// Find all active slaves
	err = getJSON(&state, stateURL)
	if err != nil {
		log.Warn(err)
		return
	}

	var slaveURLs []string
	for _, slave := range state.Slaves {
		if slave.Active {
			// Extract slave port from pid
			_, port, err := net.SplitHostPort(slave.PID)
			if err != nil {
				port = "5051"
			}
			url := fmt.Sprintf("http://%s:%s", slave.Hostname, port)

			slaveURLs = append(slaveURLs, url)
		}
	}

	log.Debugf("%d slaves discovered", len(slaveURLs))

	e.slaves.Lock()
	e.slaves.urls = slaveURLs
	e.slaves.Unlock()
}
Пример #11
0
func (e *periodicExporter) updateSlaves() {
	log.Debug("discovering slaves...")

	// This will redirect us to the elected mesos master
	redirectURL := fmt.Sprintf("%s/master/redirect", e.opts.masterURL)
	rReq, err := http.NewRequest("GET", redirectURL, nil)
	if err != nil {
		panic(err)
	}

	tr := http.Transport{
		DisableKeepAlives: true,
	}
	rresp, err := tr.RoundTrip(rReq)
	if err != nil {
		log.Warn(err)
		return
	}
	defer rresp.Body.Close()

	// This will/should return http://master.ip:5050
	masterLoc := rresp.Header.Get("Location")
	if masterLoc == "" {
		log.Warnf("%d response missing Location header", rresp.StatusCode)
		return
	}

	log.Debugf("current elected master at: %s", masterLoc)

	// Find all active slaves
	stateURL := fmt.Sprintf("%s/master/state.json", masterLoc)
	resp, err := http.Get(stateURL)
	if err != nil {
		log.Warn(err)
		return
	}
	defer resp.Body.Close()

	type slave struct {
		Active   bool   `json:"active"`
		Hostname string `json:"hostname"`
		Pid      string `json:"pid"`
	}

	var req struct {
		Slaves []*slave `json:"slaves"`
	}

	if err := json.NewDecoder(resp.Body).Decode(&req); err != nil {
		log.Warnf("failed to deserialize request: %s", err)
		return
	}

	var slaveURLs []string
	for _, slave := range req.Slaves {
		if slave.Active {
			// Extract slave port from pid
			_, port, err := net.SplitHostPort(slave.Pid)
			if err != nil {
				port = "5051"
			}
			url := fmt.Sprintf("http://%s:%s", slave.Hostname, port)

			slaveURLs = append(slaveURLs, url)
		}
	}

	log.Debugf("%d slaves discovered", len(slaveURLs))

	e.slaves.Lock()
	e.slaves.urls = slaveURLs
	e.slaves.Unlock()
}
Пример #12
0
func (e *periodicExporter) fetch(urlChan <-chan string, metricsChan chan<- prometheus.Metric, wg *sync.WaitGroup) {
	defer wg.Done()

	for u := range urlChan {
		u, err := url.Parse(u)
		if err != nil {
			log.Warn("could not parse slave URL: ", err)
			continue
		}

		host, _, err := net.SplitHostPort(u.Host)
		if err != nil {
			log.Warn("could not parse network address: ", err)
			continue
		}

		taskInfo := map[string]exporterTaskInfo{}
		var state slaveState
		stateURL := fmt.Sprintf("%s/state.json", u)

		err = getJSON(&state, stateURL)
		if err != nil {
			log.Warn(err)
			e.errors.WithLabelValues(host).Inc()
			continue
		}

		for _, fw := range state.Frameworks {
			for _, ex := range fw.Executors {
				for _, t := range ex.Tasks {
					taskInfo[t.ID] = exporterTaskInfo{fw.Name, t.Name}
				}
			}
		}

		monitorURL := fmt.Sprintf("%s/monitor/statistics.json", u)
		var stats []mesos_stats.Monitor

		err = getJSON(&stats, monitorURL)
		if err != nil {
			log.Warn(err)
			e.errors.WithLabelValues(host).Inc()
			continue
		}

		for _, stat := range stats {
			tinfo, ok := taskInfo[stat.Source]
			if !ok {
				continue
			}

			metricsChan <- prometheus.MustNewConstMetric(
				cpuLimitDesc,
				prometheus.GaugeValue,
				float64(stat.Statistics.CpusLimit),
				stat.Source, host, stat.FrameworkId, tinfo.FrameworkName, tinfo.TaskName,
			)
			metricsChan <- prometheus.MustNewConstMetric(
				cpuSysDesc,
				prometheus.CounterValue,
				float64(stat.Statistics.CpusSystemTimeSecs),
				stat.Source, host, stat.FrameworkId, tinfo.FrameworkName, tinfo.TaskName,
			)
			metricsChan <- prometheus.MustNewConstMetric(
				cpuUsrDesc,
				prometheus.CounterValue,
				float64(stat.Statistics.CpusUserTimeSecs),
				stat.Source, host, stat.FrameworkId, tinfo.FrameworkName, tinfo.TaskName,
			)
			metricsChan <- prometheus.MustNewConstMetric(
				memLimitDesc,
				prometheus.GaugeValue,
				float64(stat.Statistics.MemLimitBytes),
				stat.Source, host, stat.FrameworkId, tinfo.FrameworkName, tinfo.TaskName,
			)
			metricsChan <- prometheus.MustNewConstMetric(
				memRssDesc,
				prometheus.GaugeValue,
				float64(stat.Statistics.MemRssBytes),
				stat.Source, host, stat.FrameworkId, tinfo.FrameworkName, tinfo.TaskName,
			)
		}

		metricsSnapshotURL := fmt.Sprintf("%s/metrics/snapshot", u)
		var ms metricsSnapshot

		err = getJSON(&ms, metricsSnapshotURL)
		if err != nil {
			log.Warn(err)
			e.errors.WithLabelValues(host).Inc()
			continue
		}

		for _, mm := range slaveMetrics {
			metricValue, ok := ms[mm.snapshotKey]
			if !ok {
				continue
			}

			if mm.convertFn != nil {
				metricValue = mm.convertFn(metricValue)
			}

			metricsChan <- prometheus.MustNewConstMetric(
				mm.desc, mm.valueType, metricValue, host,
			)
		}
	}
}
Пример #13
0
func (e *periodicExporter) scrapeMaster() {
	stateURL := fmt.Sprintf("%s://%s/master/state.json", e.queryURL.Scheme, e.queryURL.Host)

	log.Debugf("Scraping master at %s", stateURL)

	var state masterState

	err := getJSON(&state, stateURL)
	if err != nil {
		log.Warn(err)
		return
	}

	metrics := []prometheus.Metric{}

	for _, fw := range state.Frameworks {
		metrics = append(metrics, prometheus.MustNewConstMetric(
			frameworkResourcesUsedCPUs,
			prometheus.GaugeValue,
			fw.UsedResources.CPUs,
			fw.ID, fw.Name,
		))
		metrics = append(metrics, prometheus.MustNewConstMetric(
			frameworkResourcesUsedDisk,
			prometheus.GaugeValue,
			megabytesToBytes(fw.UsedResources.Disk),
			fw.ID, fw.Name,
		))
		metrics = append(metrics, prometheus.MustNewConstMetric(
			frameworkResourcesUsedMemory,
			prometheus.GaugeValue,
			megabytesToBytes(fw.UsedResources.Mem),
			fw.ID, fw.Name,
		))
	}

	snapshotURL := fmt.Sprintf("%s://%s/metrics/snapshot", e.queryURL.Scheme, e.queryURL.Host)

	var ms metricsSnapshot

	err = getJSON(&ms, snapshotURL)
	if err != nil {
		log.Warn(err)
		return
	}

	for _, mm := range masterMetrics {
		metricValue, ok := ms[mm.snapshotKey]
		if !ok {
			continue
		}

		if mm.convertFn != nil {
			metricValue = mm.convertFn(metricValue)
		}

		metrics = append(metrics, prometheus.MustNewConstMetric(
			mm.desc, mm.valueType, metricValue, state.Hostname,
		))
	}

	e.Lock()
	e.metrics = metrics
	e.Unlock()
}
Пример #14
0
// loadSeriesMapAndHeads loads the fingerprint to memory-series mapping and all
// the chunks contained in the checkpoint (and thus not yet persisted to series
// files). The method is capable of loading the checkpoint format v1 and v2. If
// recoverable corruption is detected, or if the dirty flag was set from the
// beginning, crash recovery is run, which might take a while. If an
// unrecoverable error is encountered, it is returned. Call this method during
// start-up while nothing else is running in storage land. This method is
// utterly goroutine-unsafe.
func (p *persistence) loadSeriesMapAndHeads() (sm *seriesMap, chunksToPersist int64, err error) {
	var chunkDescsTotal int64
	fingerprintToSeries := make(map[clientmodel.Fingerprint]*memorySeries)
	sm = &seriesMap{m: fingerprintToSeries}

	defer func() {
		if sm != nil && p.dirty {
			log.Warn("Persistence layer appears dirty.")
			err = p.recoverFromCrash(fingerprintToSeries)
			if err != nil {
				sm = nil
			}
		}
		if err == nil {
			numMemChunkDescs.Add(float64(chunkDescsTotal))
		}
	}()

	f, err := os.Open(p.headsFileName())
	if os.IsNotExist(err) {
		return sm, 0, nil
	}
	if err != nil {
		log.Warn("Could not open heads file:", err)
		p.dirty = true
		return
	}
	defer f.Close()
	r := bufio.NewReaderSize(f, fileBufSize)

	buf := make([]byte, len(headsMagicString))
	if _, err := io.ReadFull(r, buf); err != nil {
		log.Warn("Could not read from heads file:", err)
		p.dirty = true
		return sm, 0, nil
	}
	magic := string(buf)
	if magic != headsMagicString {
		log.Warnf(
			"unexpected magic string, want %q, got %q",
			headsMagicString, magic,
		)
		p.dirty = true
		return
	}
	version, err := binary.ReadVarint(r)
	if (version != headsFormatVersion && version != headsFormatLegacyVersion) || err != nil {
		log.Warnf("unknown heads format version, want %d", headsFormatVersion)
		p.dirty = true
		return sm, 0, nil
	}
	numSeries, err := codable.DecodeUint64(r)
	if err != nil {
		log.Warn("Could not decode number of series:", err)
		p.dirty = true
		return sm, 0, nil
	}

	for ; numSeries > 0; numSeries-- {
		seriesFlags, err := r.ReadByte()
		if err != nil {
			log.Warn("Could not read series flags:", err)
			p.dirty = true
			return sm, chunksToPersist, nil
		}
		headChunkPersisted := seriesFlags&flagHeadChunkPersisted != 0
		fp, err := codable.DecodeUint64(r)
		if err != nil {
			log.Warn("Could not decode fingerprint:", err)
			p.dirty = true
			return sm, chunksToPersist, nil
		}
		var metric codable.Metric
		if err := metric.UnmarshalFromReader(r); err != nil {
			log.Warn("Could not decode metric:", err)
			p.dirty = true
			return sm, chunksToPersist, nil
		}
		var persistWatermark int64
		var modTime time.Time
		if version != headsFormatLegacyVersion {
			// persistWatermark only present in v2.
			persistWatermark, err = binary.ReadVarint(r)
			if err != nil {
				log.Warn("Could not decode persist watermark:", err)
				p.dirty = true
				return sm, chunksToPersist, nil
			}
			modTimeNano, err := binary.ReadVarint(r)
			if err != nil {
				log.Warn("Could not decode modification time:", err)
				p.dirty = true
				return sm, chunksToPersist, nil
			}
			if modTimeNano != -1 {
				modTime = time.Unix(0, modTimeNano)
			}
		}
		chunkDescsOffset, err := binary.ReadVarint(r)
		if err != nil {
			log.Warn("Could not decode chunk descriptor offset:", err)
			p.dirty = true
			return sm, chunksToPersist, nil
		}
		savedFirstTime, err := binary.ReadVarint(r)
		if err != nil {
			log.Warn("Could not decode saved first time:", err)
			p.dirty = true
			return sm, chunksToPersist, nil
		}
		numChunkDescs, err := binary.ReadVarint(r)
		if err != nil {
			log.Warn("Could not decode number of chunk descriptors:", err)
			p.dirty = true
			return sm, chunksToPersist, nil
		}
		chunkDescs := make([]*chunkDesc, numChunkDescs)
		if version == headsFormatLegacyVersion {
			if headChunkPersisted {
				persistWatermark = numChunkDescs
			} else {
				persistWatermark = numChunkDescs - 1
			}
		}

		for i := int64(0); i < numChunkDescs; i++ {
			if i < persistWatermark {
				firstTime, err := binary.ReadVarint(r)
				if err != nil {
					log.Warn("Could not decode first time:", err)
					p.dirty = true
					return sm, chunksToPersist, nil
				}
				lastTime, err := binary.ReadVarint(r)
				if err != nil {
					log.Warn("Could not decode last time:", err)
					p.dirty = true
					return sm, chunksToPersist, nil
				}
				chunkDescs[i] = &chunkDesc{
					chunkFirstTime: clientmodel.Timestamp(firstTime),
					chunkLastTime:  clientmodel.Timestamp(lastTime),
				}
				chunkDescsTotal++
			} else {
				// Non-persisted chunk.
				encoding, err := r.ReadByte()
				if err != nil {
					log.Warn("Could not decode chunk type:", err)
					p.dirty = true
					return sm, chunksToPersist, nil
				}
				chunk := newChunkForEncoding(chunkEncoding(encoding))
				if err := chunk.unmarshal(r); err != nil {
					log.Warn("Could not decode chunk:", err)
					p.dirty = true
					return sm, chunksToPersist, nil
				}
				chunkDescs[i] = newChunkDesc(chunk)
				chunksToPersist++
			}
		}

		fingerprintToSeries[clientmodel.Fingerprint(fp)] = &memorySeries{
			metric:           clientmodel.Metric(metric),
			chunkDescs:       chunkDescs,
			persistWatermark: int(persistWatermark),
			modTime:          modTime,
			chunkDescsOffset: int(chunkDescsOffset),
			savedFirstTime:   clientmodel.Timestamp(savedFirstTime),
			headChunkClosed:  persistWatermark >= numChunkDescs,
		}
	}
	return sm, chunksToPersist, nil
}
Пример #15
0
func main() {
	flag.Parse()

	if !strings.HasPrefix(*pathPrefix, "/") {
		*pathPrefix = "/" + *pathPrefix
	}
	if !strings.HasSuffix(*pathPrefix, "/") {
		*pathPrefix = *pathPrefix + "/"
	}

	versionInfoTmpl.Execute(os.Stdout, BuildInfo)

	conf := config.MustLoadFromFile(*configFile)

	silencer := manager.NewSilencer()
	defer silencer.Close()

	err := silencer.LoadFromFile(*silencesFile)
	if err != nil {
		log.Warn("Couldn't load silences, starting up with empty silence list: ", err)
	}
	saveSilencesTicker := time.NewTicker(10 * time.Second)
	go func() {
		for range saveSilencesTicker.C {
			if err := silencer.SaveToFile(*silencesFile); err != nil {
				log.Error("Error saving silences to file: ", err)
			}
		}
	}()
	defer saveSilencesTicker.Stop()

	amURL, err := alertmanagerURL(*hostname, *pathPrefix, *listenAddress, *externalURL)
	if err != nil {
		log.Fatalln("Error building Alertmanager URL:", err)
	}
	notifier := manager.NewNotifier(conf.NotificationConfig, amURL)
	defer notifier.Close()

	inhibitor := new(manager.Inhibitor)
	inhibitor.SetInhibitRules(conf.InhibitRules())

	options := &manager.MemoryAlertManagerOptions{
		Inhibitor:          inhibitor,
		Silencer:           silencer,
		Notifier:           notifier,
		MinRefreshInterval: *minRefreshPeriod,
	}
	alertManager := manager.NewMemoryAlertManager(options)
	alertManager.SetAggregationRules(conf.AggregationRules())
	go alertManager.Run()

	// Web initialization.
	flags := map[string]string{}
	flag.VisitAll(func(f *flag.Flag) {
		flags[f.Name] = f.Value.String()
	})

	statusHandler := &web.StatusHandler{
		Config:     conf.String(),
		Flags:      flags,
		BuildInfo:  BuildInfo,
		Birth:      time.Now(),
		PathPrefix: *pathPrefix,
	}

	webService := &web.WebService{
		// REST API Service.
		AlertManagerService: &api.AlertManagerService{
			Manager:    alertManager,
			Silencer:   silencer,
			PathPrefix: *pathPrefix,
		},

		// Template-based page handlers.
		AlertsHandler: &web.AlertsHandler{
			Manager:                alertManager,
			IsSilencedInterrogator: silencer,
			PathPrefix:             *pathPrefix,
		},
		SilencesHandler: &web.SilencesHandler{
			Silencer:   silencer,
			PathPrefix: *pathPrefix,
		},
		StatusHandler: statusHandler,
	}
	go webService.ServeForever(*listenAddress, *pathPrefix)

	// React to configuration changes.
	watcher := config.NewFileWatcher(*configFile)
	go watcher.Watch(func(conf *config.Config) {
		inhibitor.SetInhibitRules(conf.InhibitRules())
		notifier.SetNotificationConfigs(conf.NotificationConfig)
		alertManager.SetAggregationRules(conf.AggregationRules())
		statusHandler.UpdateConfig(conf.String())
	})

	log.Info("Running notification dispatcher...")
	notifier.Dispatch()
}
Пример #16
0
// recoverFromCrash is called by loadSeriesMapAndHeads if the persistence
// appears to be dirty after the loading (either because the loading resulted in
// an error or because the persistence was dirty from the start). Not goroutine
// safe. Only call before anything else is running (except index processing
// queue as started by newPersistence).
func (p *persistence) recoverFromCrash(fingerprintToSeries map[clientmodel.Fingerprint]*memorySeries) error {
	// TODO(beorn): We need proper tests for the crash recovery.
	log.Warn("Starting crash recovery. Prometheus is inoperational until complete.")
	log.Warn("To avoid crash recovery in future, shutdown Prometheus with SIGTERM or a HTTP POST to /-/quit.")

	fpsSeen := map[clientmodel.Fingerprint]struct{}{}
	count := 0
	seriesDirNameFmt := fmt.Sprintf("%%0%dx", seriesDirNameLen)

	// Delete the fingerprint mapping file as it might be stale or
	// corrupt. We'll rebuild the mappings as we go.
	os.Remove(p.mappingsFileName())
	// The mappings to rebuild.
	fpm := fpMappings{}

	log.Info("Scanning files.")
	for i := 0; i < 1<<(seriesDirNameLen*4); i++ {
		dirname := path.Join(p.basePath, fmt.Sprintf(seriesDirNameFmt, i))
		dir, err := os.Open(dirname)
		if os.IsNotExist(err) {
			continue
		}
		if err != nil {
			return err
		}
		defer dir.Close()
		for fis := []os.FileInfo{}; err != io.EOF; fis, err = dir.Readdir(1024) {
			if err != nil {
				return err
			}
			for _, fi := range fis {
				fp, ok := p.sanitizeSeries(dirname, fi, fingerprintToSeries, fpm)
				if ok {
					fpsSeen[fp] = struct{}{}
				}
				count++
				if count%10000 == 0 {
					log.Infof("%d files scanned.", count)
				}
			}
		}
	}
	log.Infof("File scan complete. %d series found.", len(fpsSeen))

	log.Info("Checking for series without series file.")
	for fp, s := range fingerprintToSeries {
		if _, seen := fpsSeen[fp]; !seen {
			// fp exists in fingerprintToSeries, but has no representation on disk.
			if s.persistWatermark == len(s.chunkDescs) {
				// Oops, everything including the head chunk was
				// already persisted, but nothing on disk.
				// Thus, we lost that series completely. Clean
				// up the remnants.
				delete(fingerprintToSeries, fp)
				if err := p.purgeArchivedMetric(fp); err != nil {
					// Purging the archived metric didn't work, so try
					// to unindex it, just in case it's in the indexes.
					p.unindexMetric(fp, s.metric)
				}
				log.Warnf("Lost series detected: fingerprint %v, metric %v.", fp, s.metric)
				continue
			}
			// If we are here, the only chunks we have are the chunks in the checkpoint.
			// Adjust things accordingly.
			if s.persistWatermark > 0 || s.chunkDescsOffset != 0 {
				minLostChunks := s.persistWatermark + s.chunkDescsOffset
				if minLostChunks <= 0 {
					log.Warnf(
						"Possible loss of chunks for fingerprint %v, metric %v.",
						fp, s.metric,
					)
				} else {
					log.Warnf(
						"Lost at least %d chunks for fingerprint %v, metric %v.",
						minLostChunks, fp, s.metric,
					)
				}
				s.chunkDescs = append(
					make([]*chunkDesc, 0, len(s.chunkDescs)-s.persistWatermark),
					s.chunkDescs[s.persistWatermark:]...,
				)
				numMemChunkDescs.Sub(float64(s.persistWatermark))
				s.persistWatermark = 0
				s.chunkDescsOffset = 0
			}
			maybeAddMapping(fp, s.metric, fpm)
			fpsSeen[fp] = struct{}{} // Add so that fpsSeen is complete.
		}
	}
	log.Info("Check for series without series file complete.")

	if err := p.cleanUpArchiveIndexes(fingerprintToSeries, fpsSeen, fpm); err != nil {
		return err
	}
	if err := p.rebuildLabelIndexes(fingerprintToSeries); err != nil {
		return err
	}
	// Finally rewrite the mappings file if there are any mappings.
	if len(fpm) > 0 {
		if err := p.checkpointFPMappings(fpm); err != nil {
			return err
		}
	}

	p.setDirty(false)
	log.Warn("Crash recovery complete.")
	return nil
}
Пример #17
0
func Main() int {
	if err := parse(os.Args[1:]); err != nil {
		return 2
	}

	versionInfoTmpl.Execute(os.Stdout, BuildInfo)
	if cfg.printVersion {
		return 0
	}

	memStorage := local.NewMemorySeriesStorage(&cfg.storage)

	var (
		sampleAppender      storage.SampleAppender
		remoteStorageQueues []*remote.StorageQueueManager
	)
	if cfg.opentsdbURL == "" && cfg.influxdbURL == "" {
		log.Warnf("No remote storage URLs provided; not sending any samples to long-term storage")
		sampleAppender = memStorage
	} else {
		fanout := storage.Fanout{memStorage}

		addRemoteStorage := func(c remote.StorageClient) {
			qm := remote.NewStorageQueueManager(c, 100*1024)
			fanout = append(fanout, qm)
			remoteStorageQueues = append(remoteStorageQueues, qm)
		}

		if cfg.opentsdbURL != "" {
			addRemoteStorage(opentsdb.NewClient(cfg.opentsdbURL, cfg.remoteStorageTimeout))
		}
		if cfg.influxdbURL != "" {
			addRemoteStorage(influxdb.NewClient(cfg.influxdbURL, cfg.remoteStorageTimeout, cfg.influxdbDatabase, cfg.influxdbRetentionPolicy))
		}

		sampleAppender = fanout
	}

	var (
		notificationHandler = notification.NewNotificationHandler(&cfg.notification)
		targetManager       = retrieval.NewTargetManager(sampleAppender)
		queryEngine         = promql.NewEngine(memStorage, &cfg.queryEngine)
	)

	ruleManager := rules.NewManager(&rules.ManagerOptions{
		SampleAppender:      sampleAppender,
		NotificationHandler: notificationHandler,
		QueryEngine:         queryEngine,
		PrometheusURL:       cfg.prometheusURL,
		PathPrefix:          cfg.web.PathPrefix,
	})

	flags := map[string]string{}
	cfg.fs.VisitAll(func(f *flag.Flag) {
		flags[f.Name] = f.Value.String()
	})

	status := &web.PrometheusStatus{
		BuildInfo:   BuildInfo,
		TargetPools: targetManager.Pools,
		Rules:       ruleManager.Rules,
		Flags:       flags,
		Birth:       time.Now(),
	}

	webHandler := web.New(memStorage, queryEngine, ruleManager, status, &cfg.web)

	if !reloadConfig(cfg.configFile, status, targetManager, ruleManager) {
		os.Exit(1)
	}

	// Wait for reload or termination signals. Start the handler for SIGHUP as
	// early as possible, but ignore it until we are ready to handle reloading
	// our config.
	hup := make(chan os.Signal)
	hupReady := make(chan bool)
	signal.Notify(hup, syscall.SIGHUP)
	go func() {
		<-hupReady
		for range hup {
			reloadConfig(cfg.configFile, status, targetManager, ruleManager)
		}
	}()

	// Start all components.
	if err := memStorage.Start(); err != nil {
		log.Errorln("Error opening memory series storage:", err)
		return 1
	}
	defer func() {
		if err := memStorage.Stop(); err != nil {
			log.Errorln("Error stopping storage:", err)
		}
	}()

	// The storage has to be fully initialized before registering.
	registry.MustRegister(memStorage)
	registry.MustRegister(notificationHandler)

	for _, q := range remoteStorageQueues {
		registry.MustRegister(q)

		go q.Run()
		defer q.Stop()
	}

	go ruleManager.Run()
	defer ruleManager.Stop()

	go notificationHandler.Run()
	defer notificationHandler.Stop()

	go targetManager.Run()
	defer targetManager.Stop()

	defer queryEngine.Stop()

	go webHandler.Run()

	// Wait for reload or termination signals.
	close(hupReady) // Unblock SIGHUP handler.

	term := make(chan os.Signal)
	signal.Notify(term, os.Interrupt, syscall.SIGTERM)
	select {
	case <-term:
		log.Warn("Received SIGTERM, exiting gracefully...")
	case <-webHandler.Quit():
		log.Warn("Received termination request via web service, exiting gracefully...")
	}

	close(hup)

	log.Info("See you next time!")
	return 0
}
Пример #18
0
// Main manages the startup and shutdown lifecycle of the entire Prometheus server.
func Main() int {
	if err := parse(os.Args[1:]); err != nil {
		return 2
	}

	printVersion()
	if cfg.printVersion {
		return 0
	}

	var reloadables []Reloadable

	var (
		memStorage     = local.NewMemorySeriesStorage(&cfg.storage)
		remoteStorage  = remote.New(&cfg.remote)
		sampleAppender = storage.Fanout{memStorage}
	)
	if remoteStorage != nil {
		sampleAppender = append(sampleAppender, remoteStorage)
		reloadables = append(reloadables, remoteStorage)
	}

	var (
		notificationHandler = notification.NewNotificationHandler(&cfg.notification)
		targetManager       = retrieval.NewTargetManager(sampleAppender)
		queryEngine         = promql.NewEngine(memStorage, &cfg.queryEngine)
	)

	ruleManager := rules.NewManager(&rules.ManagerOptions{
		SampleAppender:      sampleAppender,
		NotificationHandler: notificationHandler,
		QueryEngine:         queryEngine,
		ExternalURL:         cfg.web.ExternalURL,
	})

	flags := map[string]string{}
	cfg.fs.VisitAll(func(f *flag.Flag) {
		flags[f.Name] = f.Value.String()
	})

	status := &web.PrometheusStatus{
		TargetPools: targetManager.Pools,
		Rules:       ruleManager.Rules,
		Flags:       flags,
		Birth:       time.Now(),
	}

	webHandler := web.New(memStorage, queryEngine, ruleManager, status, &cfg.web)

	reloadables = append(reloadables, status, targetManager, ruleManager, webHandler, notificationHandler)

	if !reloadConfig(cfg.configFile, reloadables...) {
		return 1
	}

	// Wait for reload or termination signals. Start the handler for SIGHUP as
	// early as possible, but ignore it until we are ready to handle reloading
	// our config.
	hup := make(chan os.Signal)
	hupReady := make(chan bool)
	signal.Notify(hup, syscall.SIGHUP)
	go func() {
		<-hupReady
		for {
			select {
			case <-hup:
			case <-webHandler.Reload():
			}
			reloadConfig(cfg.configFile, reloadables...)
		}
	}()

	// Start all components.
	if err := memStorage.Start(); err != nil {
		log.Errorln("Error opening memory series storage:", err)
		return 1
	}
	defer func() {
		if err := memStorage.Stop(); err != nil {
			log.Errorln("Error stopping storage:", err)
		}
	}()

	if remoteStorage != nil {
		prometheus.MustRegister(remoteStorage)

		go remoteStorage.Run()
		defer remoteStorage.Stop()
	}
	// The storage has to be fully initialized before registering.
	prometheus.MustRegister(memStorage)
	prometheus.MustRegister(notificationHandler)
	prometheus.MustRegister(configSuccess)
	prometheus.MustRegister(configSuccessTime)

	go ruleManager.Run()
	defer ruleManager.Stop()

	go notificationHandler.Run()
	defer notificationHandler.Stop()

	go targetManager.Run()
	defer targetManager.Stop()

	defer queryEngine.Stop()

	go webHandler.Run()

	// Wait for reload or termination signals.
	close(hupReady) // Unblock SIGHUP handler.

	term := make(chan os.Signal)
	signal.Notify(term, os.Interrupt, syscall.SIGTERM)
	select {
	case <-term:
		log.Warn("Received SIGTERM, exiting gracefully...")
	case <-webHandler.Quit():
		log.Warn("Received termination request via web service, exiting gracefully...")
	case err := <-webHandler.ListenError():
		log.Errorln("Error starting web server, exiting gracefully:", err)
	}

	log.Info("See you next time!")
	return 0
}