Ejemplo n.º 1
0
func (h Handler) ServeHTTP(w http.ResponseWriter, r *http.Request) {
	ctx := route.Context(r)

	name := strings.Trim(route.Param(ctx, "filepath"), "/")
	if name == "" {
		name = "index.html"
	}

	file, err := GetFile(StaticFiles, name)
	if err != nil {
		if err != io.EOF {
			log.Warn("Could not get file: ", err)
		}
		w.WriteHeader(http.StatusNotFound)
		return
	}
	contentType := http.DetectContentType(file)
	if strings.Contains(contentType, "text/plain") || strings.Contains(contentType, "application/octet-stream") {
		parts := strings.Split(name, ".")
		contentType = mimeMap[parts[len(parts)-1]]
	}
	w.Header().Set("Content-Type", contentType)
	w.Header().Set("Cache-Control", "public, max-age=259200")
	w.Write(file)
}
Ejemplo n.º 2
0
// Collect implements prometheus.Collector.
func (c *viewCollector) Collect(ch chan<- prometheus.Metric) {
	for _, v := range c.stats.Views {
		for _, s := range v.Cache {
			ch <- prometheus.MustNewConstMetric(
				resolverCache, prometheus.GaugeValue, float64(s.Gauge), v.Name, s.Name,
			)
		}
		for _, s := range v.ResolverQueries {
			ch <- prometheus.MustNewConstMetric(
				resolverQueries, prometheus.CounterValue, float64(s.Counter), v.Name, s.Name,
			)
		}
		for _, s := range v.ResolverStats {
			if desc, ok := resolverMetricStats[s.Name]; ok {
				ch <- prometheus.MustNewConstMetric(
					desc, prometheus.CounterValue, float64(s.Counter), v.Name,
				)
			}
			if desc, ok := resolverLabelStats[s.Name]; ok {
				ch <- prometheus.MustNewConstMetric(
					desc, prometheus.CounterValue, float64(s.Counter), v.Name, s.Name,
				)
			}
		}
		if buckets, count, err := histogram(v.ResolverStats); err == nil {
			ch <- prometheus.MustNewConstHistogram(
				resolverQueryDuration, count, math.NaN(), buckets, v.Name,
			)
		} else {
			log.Warn("Error parsing RTT:", err)
		}
	}
}
Ejemplo n.º 3
0
// loadSeriesMapAndHeads loads the fingerprint to memory-series mapping and all
// the chunks contained in the checkpoint (and thus not yet persisted to series
// files). The method is capable of loading the checkpoint format v1 and v2. If
// recoverable corruption is detected, or if the dirty flag was set from the
// beginning, crash recovery is run, which might take a while. If an
// unrecoverable error is encountered, it is returned. Call this method during
// start-up while nothing else is running in storage land. This method is
// utterly goroutine-unsafe.
func (p *persistence) loadSeriesMapAndHeads() (sm *seriesMap, chunksToPersist int64, err error) {
	fingerprintToSeries := make(map[model.Fingerprint]*memorySeries)
	sm = &seriesMap{m: fingerprintToSeries}

	defer func() {
		if p.dirty {
			log.Warn("Persistence layer appears dirty.")
			p.startedDirty.Set(1)
			err = p.recoverFromCrash(fingerprintToSeries)
			if err != nil {
				sm = nil
			}
		} else {
			p.startedDirty.Set(0)
		}
	}()

	hs := newHeadsScanner(p.headsFileName())
	defer hs.close()
	for hs.scan() {
		fingerprintToSeries[hs.fp] = hs.series
	}
	if os.IsNotExist(hs.err) {
		return sm, 0, nil
	}
	if hs.err != nil {
		p.dirty = true
		log.
			With("file", p.headsFileName()).
			With("error", hs.err).
			Error("Error reading heads file.")
		return sm, 0, hs.err
	}
	return sm, hs.chunksToPersistTotal, nil
}
Ejemplo n.º 4
0
// Append queues a sample to be sent to the remote storage. It drops the
// sample on the floor if the queue is full. It implements
// storage.SampleAppender.
func (t *StorageQueueManager) Append(s *model.Sample) {
	select {
	case t.queue <- s:
	default:
		t.samplesCount.WithLabelValues(dropped).Inc()
		log.Warn("Remote storage queue full, discarding sample.")
	}
}
Ejemplo n.º 5
0
// Append queues a sample to be sent to the remote storage. It drops the
// sample on the floor if the queue is full.
// Always returns nil.
func (t *StorageQueueManager) Append(s *model.Sample) error {
	fp := s.Metric.FastFingerprint()
	shard := uint64(fp) % uint64(t.cfg.Shards)

	select {
	case t.shards[shard] <- s:
	default:
		t.sentSamplesTotal.WithLabelValues(dropped).Inc()
		log.Warn("Remote storage queue full, discarding sample.")
	}
	return nil
}
Ejemplo n.º 6
0
// Append implements Storage.
func (s *memorySeriesStorage) Append(sample *model.Sample) {
	for ln, lv := range sample.Metric {
		if len(lv) == 0 {
			delete(sample.Metric, ln)
		}
	}
	if s.getNumChunksToPersist() >= s.maxChunksToPersist {
		log.Warnf(
			"%d chunks waiting for persistence, sample ingestion suspended.",
			s.getNumChunksToPersist(),
		)
		for s.getNumChunksToPersist() >= s.maxChunksToPersist {
			time.Sleep(time.Second)
		}
		log.Warn("Sample ingestion resumed.")
	}
	rawFP := sample.Metric.FastFingerprint()
	s.fpLocker.Lock(rawFP)
	fp, err := s.mapper.mapFP(rawFP, sample.Metric)
	if err != nil {
		log.Errorf("Error while mapping fingerprint %v: %v", rawFP, err)
		s.persistence.setDirty(true)
	}
	if fp != rawFP {
		// Switch locks.
		s.fpLocker.Unlock(rawFP)
		s.fpLocker.Lock(fp)
	}
	series := s.getOrCreateSeries(fp, sample.Metric)

	if sample.Timestamp <= series.lastTime {
		// Don't log and track equal timestamps, as they are a common occurrence
		// when using client-side timestamps (e.g. Pushgateway or federation).
		// It would be even better to also compare the sample values here, but
		// we don't have efficient access to a series's last value.
		if sample.Timestamp != series.lastTime {
			log.Warnf("Ignoring sample with out-of-order timestamp for fingerprint %v (%v): %v is not after %v", fp, series.metric, sample.Timestamp, series.lastTime)
			s.outOfOrderSamplesCount.Inc()
		}
		s.fpLocker.Unlock(fp)
		return
	}
	completedChunksCount := series.add(&model.SamplePair{
		Value:     sample.Value,
		Timestamp: sample.Timestamp,
	})
	s.fpLocker.Unlock(fp)
	s.ingestedSamplesCount.Inc()
	s.incNumChunksToPersist(completedChunksCount)
}
Ejemplo n.º 7
0
// isDegraded returns whether the storage is in "graceful degradation mode",
// which is the case if the number of chunks waiting for persistence has reached
// a percentage of maxChunksToPersist that exceeds
// percentChunksToPersistForDegradation. The method is not goroutine safe (but
// only ever called from the goroutine dealing with series maintenance).
// Changes of degradation mode are logged.
func (s *memorySeriesStorage) isDegraded() bool {
	nowDegraded := s.getNumChunksToPersist() > s.maxChunksToPersist*percentChunksToPersistForDegradation/100
	if s.degraded && !nowDegraded {
		log.Warn("Storage has left graceful degradation mode. Things are back to normal.")
	} else if !s.degraded && nowDegraded {
		log.Warnf(
			"%d chunks waiting for persistence (%d%% of the allowed maximum %d). Storage is now in graceful degradation mode. Series files are not synced anymore if following the adaptive strategy. Checkpoints are not performed more often than every %v. Series maintenance happens as frequently as possible.",
			s.getNumChunksToPersist(),
			s.getNumChunksToPersist()*100/s.maxChunksToPersist,
			s.maxChunksToPersist,
			s.checkpointInterval)
	}
	s.degraded = nowDegraded
	return s.degraded
}
Ejemplo n.º 8
0
func serveAsset(w http.ResponseWriter, req *http.Request, fp string) {
	info, err := ui.AssetInfo(fp)
	if err != nil {
		log.Warn("Could not get file: ", err)
		w.WriteHeader(http.StatusNotFound)
		return
	}
	file, err := ui.Asset(fp)
	if err != nil {
		if err != io.EOF {
			log.With("file", fp).Warn("Could not get file: ", err)
		}
		w.WriteHeader(http.StatusNotFound)
		return
	}

	http.ServeContent(w, req, info.Name(), info.ModTime(), bytes.NewReader(file))
}
Ejemplo n.º 9
0
// Run dispatches notifications continuously.
func (n *NotificationHandler) Run() {
	for reqs := range n.pendingNotifications {
		if n.alertmanagerURL == "" {
			log.Warn("No alert manager configured, not dispatching notification")
			n.notificationDropped.Inc()
			continue
		}

		begin := time.Now()
		err := n.sendNotifications(reqs)

		if err != nil {
			log.Error("Error sending notification: ", err)
			n.notificationErrors.Inc()
		}

		n.notificationLatency.Observe(float64(time.Since(begin) / time.Millisecond))
	}
	close(n.stopped)
}
Ejemplo n.º 10
0
// Main manages the startup and shutdown lifecycle of the entire Prometheus server.
func Main() int {
	if err := parse(os.Args[1:]); err != nil {
		return 2
	}

	printVersion()
	if cfg.printVersion {
		return 0
	}

	var reloadables []Reloadable

	var (
		memStorage     = local.NewMemorySeriesStorage(&cfg.storage)
		remoteStorage  = remote.New(&cfg.remote)
		sampleAppender = storage.Fanout{memStorage}
	)
	if remoteStorage != nil {
		sampleAppender = append(sampleAppender, remoteStorage)
		reloadables = append(reloadables, remoteStorage)
	}

	var (
		notificationHandler = notification.New(&cfg.notification)
		targetManager       = retrieval.NewTargetManager(sampleAppender)
		queryEngine         = promql.NewEngine(memStorage, &cfg.queryEngine)
	)

	ruleManager := rules.NewManager(&rules.ManagerOptions{
		SampleAppender:      sampleAppender,
		NotificationHandler: notificationHandler,
		QueryEngine:         queryEngine,
		ExternalURL:         cfg.web.ExternalURL,
	})

	flags := map[string]string{}
	cfg.fs.VisitAll(func(f *flag.Flag) {
		flags[f.Name] = f.Value.String()
	})

	status := &web.PrometheusStatus{
		TargetPools: targetManager.Pools,
		Rules:       ruleManager.Rules,
		Flags:       flags,
		Birth:       time.Now(),
	}

	webHandler := web.New(memStorage, queryEngine, ruleManager, status, &cfg.web)

	reloadables = append(reloadables, status, targetManager, ruleManager, webHandler, notificationHandler)

	if !reloadConfig(cfg.configFile, reloadables...) {
		return 1
	}

	// Wait for reload or termination signals. Start the handler for SIGHUP as
	// early as possible, but ignore it until we are ready to handle reloading
	// our config.
	hup := make(chan os.Signal)
	hupReady := make(chan bool)
	signal.Notify(hup, syscall.SIGHUP)
	go func() {
		<-hupReady
		for {
			select {
			case <-hup:
			case <-webHandler.Reload():
			}
			reloadConfig(cfg.configFile, reloadables...)
		}
	}()

	// Start all components.
	if err := memStorage.Start(); err != nil {
		log.Errorln("Error opening memory series storage:", err)
		return 1
	}
	defer func() {
		if err := memStorage.Stop(); err != nil {
			log.Errorln("Error stopping storage:", err)
		}
	}()

	if remoteStorage != nil {
		prometheus.MustRegister(remoteStorage)

		go remoteStorage.Run()
		defer remoteStorage.Stop()
	}
	// The storage has to be fully initialized before registering.
	prometheus.MustRegister(memStorage)
	prometheus.MustRegister(notificationHandler)
	prometheus.MustRegister(configSuccess)
	prometheus.MustRegister(configSuccessTime)

	defer ruleManager.Stop()

	go notificationHandler.Run()
	defer notificationHandler.Stop()

	go targetManager.Run()
	defer targetManager.Stop()

	defer queryEngine.Stop()

	go webHandler.Run()

	// Wait for reload or termination signals.
	close(hupReady) // Unblock SIGHUP handler.

	term := make(chan os.Signal)
	signal.Notify(term, os.Interrupt, syscall.SIGTERM)
	select {
	case <-term:
		log.Warn("Received SIGTERM, exiting gracefully...")
	case <-webHandler.Quit():
		log.Warn("Received termination request via web service, exiting gracefully...")
	case err := <-webHandler.ListenError():
		log.Errorln("Error starting web server, exiting gracefully:", err)
	}

	log.Info("See you next time!")
	return 0
}
Ejemplo n.º 11
0
// Main manages the startup and shutdown lifecycle of the entire Prometheus server.
func Main() int {
	if err := parse(os.Args[1:]); err != nil {
		log.Error(err)
		return 2
	}

	if cfg.printVersion {
		fmt.Fprintln(os.Stdout, version.Print("prometheus"))
		return 0
	}

	log.Infoln("Starting prometheus", version.Info())
	log.Infoln("Build context", version.BuildContext())

	var reloadables []Reloadable

	var (
		memStorage     = local.NewMemorySeriesStorage(&cfg.storage)
		remoteStorage  = remote.New(&cfg.remote)
		sampleAppender = storage.Fanout{memStorage}
	)
	if remoteStorage != nil {
		sampleAppender = append(sampleAppender, remoteStorage)
		reloadables = append(reloadables, remoteStorage)
	}

	var (
		notifier      = notifier.New(&cfg.notifier)
		targetManager = retrieval.NewTargetManager(sampleAppender)
		queryEngine   = promql.NewEngine(memStorage, &cfg.queryEngine)
	)

	ruleManager := rules.NewManager(&rules.ManagerOptions{
		SampleAppender: sampleAppender,
		Notifier:       notifier,
		QueryEngine:    queryEngine,
		ExternalURL:    cfg.web.ExternalURL,
	})

	flags := map[string]string{}
	cfg.fs.VisitAll(func(f *flag.Flag) {
		flags[f.Name] = f.Value.String()
	})

	version := &web.PrometheusVersion{
		Version:   version.Version,
		Revision:  version.Revision,
		Branch:    version.Branch,
		BuildUser: version.BuildUser,
		BuildDate: version.BuildDate,
		GoVersion: version.GoVersion,
	}

	webHandler := web.New(memStorage, queryEngine, targetManager, ruleManager, version, flags, &cfg.web)

	reloadables = append(reloadables, targetManager, ruleManager, webHandler, notifier)

	if !reloadConfig(cfg.configFile, reloadables...) {
		return 1
	}

	// Wait for reload or termination signals. Start the handler for SIGHUP as
	// early as possible, but ignore it until we are ready to handle reloading
	// our config.
	hup := make(chan os.Signal)
	hupReady := make(chan bool)
	signal.Notify(hup, syscall.SIGHUP)
	go func() {
		<-hupReady
		for {
			select {
			case <-hup:
			case <-webHandler.Reload():
			}
			reloadConfig(cfg.configFile, reloadables...)
		}
	}()

	// Start all components. The order is NOT arbitrary.

	if err := memStorage.Start(); err != nil {
		log.Errorln("Error opening memory series storage:", err)
		return 1
	}
	defer func() {
		if err := memStorage.Stop(); err != nil {
			log.Errorln("Error stopping storage:", err)
		}
	}()

	if remoteStorage != nil {
		prometheus.MustRegister(remoteStorage)

		go remoteStorage.Run()
		defer remoteStorage.Stop()
	}
	// The storage has to be fully initialized before registering.
	prometheus.MustRegister(memStorage)
	prometheus.MustRegister(notifier)
	prometheus.MustRegister(configSuccess)
	prometheus.MustRegister(configSuccessTime)

	// The notifieris a dependency of the rule manager. It has to be
	// started before and torn down afterwards.
	go notifier.Run()
	defer notifier.Stop()

	go ruleManager.Run()
	defer ruleManager.Stop()

	go targetManager.Run()
	defer targetManager.Stop()

	// Shutting down the query engine before the rule manager will cause pending queries
	// to be canceled and ensures a quick shutdown of the rule manager.
	defer queryEngine.Stop()

	go webHandler.Run()

	// Wait for reload or termination signals.
	close(hupReady) // Unblock SIGHUP handler.

	term := make(chan os.Signal)
	signal.Notify(term, os.Interrupt, syscall.SIGTERM)
	select {
	case <-term:
		log.Warn("Received SIGTERM, exiting gracefully...")
	case <-webHandler.Quit():
		log.Warn("Received termination request via web service, exiting gracefully...")
	case err := <-webHandler.ListenError():
		log.Errorln("Error starting web server, exiting gracefully:", err)
	}

	log.Info("See you next time!")
	return 0
}
Ejemplo n.º 12
0
// recoverFromCrash is called by loadSeriesMapAndHeads if the persistence
// appears to be dirty after the loading (either because the loading resulted in
// an error or because the persistence was dirty from the start). Not goroutine
// safe. Only call before anything else is running (except index processing
// queue as started by newPersistence).
func (p *persistence) recoverFromCrash(fingerprintToSeries map[model.Fingerprint]*memorySeries) error {
	// TODO(beorn): We need proper tests for the crash recovery.
	log.Warn("Starting crash recovery. Prometheus is inoperational until complete.")
	log.Warn("To avoid crash recovery in the future, shut down Prometheus with SIGTERM or a HTTP POST to /-/quit.")

	fpsSeen := map[model.Fingerprint]struct{}{}
	count := 0
	seriesDirNameFmt := fmt.Sprintf("%%0%dx", seriesDirNameLen)

	// Delete the fingerprint mapping file as it might be stale or
	// corrupt. We'll rebuild the mappings as we go.
	if err := os.RemoveAll(p.mappingsFileName()); err != nil {
		return fmt.Errorf("couldn't remove old fingerprint mapping file %s: %s", p.mappingsFileName(), err)
	}
	// The mappings to rebuild.
	fpm := fpMappings{}

	log.Info("Scanning files.")
	for i := 0; i < 1<<(seriesDirNameLen*4); i++ {
		dirname := filepath.Join(p.basePath, fmt.Sprintf(seriesDirNameFmt, i))
		dir, err := os.Open(dirname)
		if os.IsNotExist(err) {
			continue
		}
		if err != nil {
			return err
		}
		for fis := []os.FileInfo{}; err != io.EOF; fis, err = dir.Readdir(1024) {
			if err != nil {
				dir.Close()
				return err
			}
			for _, fi := range fis {
				fp, ok := p.sanitizeSeries(dirname, fi, fingerprintToSeries, fpm)
				if ok {
					fpsSeen[fp] = struct{}{}
				}
				count++
				if count%10000 == 0 {
					log.Infof("%d files scanned.", count)
				}
			}
		}
		dir.Close()
	}
	log.Infof("File scan complete. %d series found.", len(fpsSeen))

	log.Info("Checking for series without series file.")
	for fp, s := range fingerprintToSeries {
		if _, seen := fpsSeen[fp]; !seen {
			// fp exists in fingerprintToSeries, but has no representation on disk.
			if s.persistWatermark == len(s.chunkDescs) {
				// Oops, everything including the head chunk was
				// already persisted, but nothing on disk.
				// Thus, we lost that series completely. Clean
				// up the remnants.
				delete(fingerprintToSeries, fp)
				if err := p.purgeArchivedMetric(fp); err != nil {
					// Purging the archived metric didn't work, so try
					// to unindex it, just in case it's in the indexes.
					p.unindexMetric(fp, s.metric)
				}
				log.Warnf("Lost series detected: fingerprint %v, metric %v.", fp, s.metric)
				continue
			}
			// If we are here, the only chunks we have are the chunks in the checkpoint.
			// Adjust things accordingly.
			if s.persistWatermark > 0 || s.chunkDescsOffset != 0 {
				minLostChunks := s.persistWatermark + s.chunkDescsOffset
				if minLostChunks <= 0 {
					log.Warnf(
						"Possible loss of chunks for fingerprint %v, metric %v.",
						fp, s.metric,
					)
				} else {
					log.Warnf(
						"Lost at least %d chunks for fingerprint %v, metric %v.",
						minLostChunks, fp, s.metric,
					)
				}
				s.chunkDescs = append(
					make([]*chunk.Desc, 0, len(s.chunkDescs)-s.persistWatermark),
					s.chunkDescs[s.persistWatermark:]...,
				)
				chunk.NumMemDescs.Sub(float64(s.persistWatermark))
				s.persistWatermark = 0
				s.chunkDescsOffset = 0
			}
			maybeAddMapping(fp, s.metric, fpm)
			fpsSeen[fp] = struct{}{} // Add so that fpsSeen is complete.
		}
	}
	log.Info("Check for series without series file complete.")

	if err := p.cleanUpArchiveIndexes(fingerprintToSeries, fpsSeen, fpm); err != nil {
		return err
	}
	if err := p.rebuildLabelIndexes(fingerprintToSeries); err != nil {
		return err
	}
	// Finally rewrite the mappings file if there are any mappings.
	if len(fpm) > 0 {
		if err := p.checkpointFPMappings(fpm); err != nil {
			return err
		}
	}

	p.dirtyMtx.Lock()
	// Only declare storage clean if it didn't become dirty during crash recovery.
	if !p.becameDirty {
		p.dirty = false
	}
	p.dirtyMtx.Unlock()

	log.Warn("Crash recovery complete.")
	return nil
}
Ejemplo n.º 13
0
// Main manages the startup and shutdown lifecycle of the entire Prometheus server.
func Main() int {
	if err := parse(os.Args[1:]); err != nil {
		log.Error(err)
		return 2
	}

	if cfg.printVersion {
		fmt.Fprintln(os.Stdout, version.Print("prometheus"))
		return 0
	}

	log.Infoln("Starting prometheus", version.Info())
	log.Infoln("Build context", version.BuildContext())

	var (
		sampleAppender = storage.Fanout{}
		reloadables    []Reloadable
	)

	var localStorage local.Storage
	switch cfg.localStorageEngine {
	case "persisted":
		localStorage = local.NewMemorySeriesStorage(&cfg.storage)
		sampleAppender = storage.Fanout{localStorage}
	case "none":
		localStorage = &local.NoopStorage{}
	default:
		log.Errorf("Invalid local storage engine %q", cfg.localStorageEngine)
		return 1
	}

	remoteStorage, err := remote.New(&cfg.remote)
	if err != nil {
		log.Errorf("Error initializing remote storage: %s", err)
		return 1
	}
	if remoteStorage != nil {
		sampleAppender = append(sampleAppender, remoteStorage)
		reloadables = append(reloadables, remoteStorage)
	}

	reloadableRemoteStorage := remote.NewConfigurable()
	sampleAppender = append(sampleAppender, reloadableRemoteStorage)
	reloadables = append(reloadables, reloadableRemoteStorage)

	var (
		notifier       = notifier.New(&cfg.notifier)
		targetManager  = retrieval.NewTargetManager(sampleAppender)
		queryEngine    = promql.NewEngine(localStorage, &cfg.queryEngine)
		ctx, cancelCtx = context.WithCancel(context.Background())
	)

	ruleManager := rules.NewManager(&rules.ManagerOptions{
		SampleAppender: sampleAppender,
		Notifier:       notifier,
		QueryEngine:    queryEngine,
		Context:        ctx,
		ExternalURL:    cfg.web.ExternalURL,
	})

	cfg.web.Context = ctx
	cfg.web.Storage = localStorage
	cfg.web.QueryEngine = queryEngine
	cfg.web.TargetManager = targetManager
	cfg.web.RuleManager = ruleManager

	cfg.web.Version = &web.PrometheusVersion{
		Version:   version.Version,
		Revision:  version.Revision,
		Branch:    version.Branch,
		BuildUser: version.BuildUser,
		BuildDate: version.BuildDate,
		GoVersion: version.GoVersion,
	}

	cfg.web.Flags = map[string]string{}
	cfg.fs.VisitAll(func(f *flag.Flag) {
		cfg.web.Flags[f.Name] = f.Value.String()
	})

	webHandler := web.New(&cfg.web)

	reloadables = append(reloadables, targetManager, ruleManager, webHandler, notifier)

	if err := reloadConfig(cfg.configFile, reloadables...); err != nil {
		log.Errorf("Error loading config: %s", err)
		return 1
	}

	// Wait for reload or termination signals. Start the handler for SIGHUP as
	// early as possible, but ignore it until we are ready to handle reloading
	// our config.
	hup := make(chan os.Signal)
	hupReady := make(chan bool)
	signal.Notify(hup, syscall.SIGHUP)
	go func() {
		<-hupReady
		for {
			select {
			case <-hup:
				if err := reloadConfig(cfg.configFile, reloadables...); err != nil {
					log.Errorf("Error reloading config: %s", err)
				}
			case rc := <-webHandler.Reload():
				if err := reloadConfig(cfg.configFile, reloadables...); err != nil {
					log.Errorf("Error reloading config: %s", err)
					rc <- err
				} else {
					rc <- nil
				}
			}
		}
	}()

	// Start all components. The order is NOT arbitrary.

	if err := localStorage.Start(); err != nil {
		log.Errorln("Error opening memory series storage:", err)
		return 1
	}
	defer func() {
		if err := localStorage.Stop(); err != nil {
			log.Errorln("Error stopping storage:", err)
		}
	}()

	if remoteStorage != nil {
		remoteStorage.Start()
		defer remoteStorage.Stop()
	}

	defer reloadableRemoteStorage.Stop()

	// The storage has to be fully initialized before registering.
	if instrumentedStorage, ok := localStorage.(prometheus.Collector); ok {
		prometheus.MustRegister(instrumentedStorage)
	}
	prometheus.MustRegister(notifier)
	prometheus.MustRegister(configSuccess)
	prometheus.MustRegister(configSuccessTime)

	// The notifier is a dependency of the rule manager. It has to be
	// started before and torn down afterwards.
	go notifier.Run()
	defer notifier.Stop()

	go ruleManager.Run()
	defer ruleManager.Stop()

	go targetManager.Run()
	defer targetManager.Stop()

	// Shutting down the query engine before the rule manager will cause pending queries
	// to be canceled and ensures a quick shutdown of the rule manager.
	defer cancelCtx()

	go webHandler.Run()

	// Wait for reload or termination signals.
	close(hupReady) // Unblock SIGHUP handler.

	term := make(chan os.Signal)
	signal.Notify(term, os.Interrupt, syscall.SIGTERM)
	select {
	case <-term:
		log.Warn("Received SIGTERM, exiting gracefully...")
	case <-webHandler.Quit():
		log.Warn("Received termination request via web service, exiting gracefully...")
	case err := <-webHandler.ListenError():
		log.Errorln("Error starting web server, exiting gracefully:", err)
	}

	log.Info("See you next time!")
	return 0
}
Ejemplo n.º 14
0
// loadSeriesMapAndHeads loads the fingerprint to memory-series mapping and all
// the chunks contained in the checkpoint (and thus not yet persisted to series
// files). The method is capable of loading the checkpoint format v1 and v2. If
// recoverable corruption is detected, or if the dirty flag was set from the
// beginning, crash recovery is run, which might take a while. If an
// unrecoverable error is encountered, it is returned. Call this method during
// start-up while nothing else is running in storage land. This method is
// utterly goroutine-unsafe.
func (p *persistence) loadSeriesMapAndHeads() (sm *seriesMap, chunksToPersist int64, err error) {
	var chunkDescsTotal int64
	fingerprintToSeries := make(map[model.Fingerprint]*memorySeries)
	sm = &seriesMap{m: fingerprintToSeries}

	defer func() {
		if sm != nil && p.dirty {
			log.Warn("Persistence layer appears dirty.")
			err = p.recoverFromCrash(fingerprintToSeries)
			if err != nil {
				sm = nil
			}
		}
		if err == nil {
			numMemChunkDescs.Add(float64(chunkDescsTotal))
		}
	}()

	f, err := os.Open(p.headsFileName())
	if os.IsNotExist(err) {
		return sm, 0, nil
	}
	if err != nil {
		log.Warn("Could not open heads file:", err)
		p.dirty = true
		return
	}
	defer f.Close()
	r := bufio.NewReaderSize(f, fileBufSize)

	buf := make([]byte, len(headsMagicString))
	if _, err := io.ReadFull(r, buf); err != nil {
		log.Warn("Could not read from heads file:", err)
		p.dirty = true
		return sm, 0, nil
	}
	magic := string(buf)
	if magic != headsMagicString {
		log.Warnf(
			"unexpected magic string, want %q, got %q",
			headsMagicString, magic,
		)
		p.dirty = true
		return
	}
	version, err := binary.ReadVarint(r)
	if (version != headsFormatVersion && version != headsFormatLegacyVersion) || err != nil {
		log.Warnf("unknown heads format version, want %d", headsFormatVersion)
		p.dirty = true
		return sm, 0, nil
	}
	numSeries, err := codable.DecodeUint64(r)
	if err != nil {
		log.Warn("Could not decode number of series:", err)
		p.dirty = true
		return sm, 0, nil
	}

	for ; numSeries > 0; numSeries-- {
		seriesFlags, err := r.ReadByte()
		if err != nil {
			log.Warn("Could not read series flags:", err)
			p.dirty = true
			return sm, chunksToPersist, nil
		}
		headChunkPersisted := seriesFlags&flagHeadChunkPersisted != 0
		fp, err := codable.DecodeUint64(r)
		if err != nil {
			log.Warn("Could not decode fingerprint:", err)
			p.dirty = true
			return sm, chunksToPersist, nil
		}
		var metric codable.Metric
		if err := metric.UnmarshalFromReader(r); err != nil {
			log.Warn("Could not decode metric:", err)
			p.dirty = true
			return sm, chunksToPersist, nil
		}
		var persistWatermark int64
		var modTime time.Time
		if version != headsFormatLegacyVersion {
			// persistWatermark only present in v2.
			persistWatermark, err = binary.ReadVarint(r)
			if err != nil {
				log.Warn("Could not decode persist watermark:", err)
				p.dirty = true
				return sm, chunksToPersist, nil
			}
			modTimeNano, err := binary.ReadVarint(r)
			if err != nil {
				log.Warn("Could not decode modification time:", err)
				p.dirty = true
				return sm, chunksToPersist, nil
			}
			if modTimeNano != -1 {
				modTime = time.Unix(0, modTimeNano)
			}
		}
		chunkDescsOffset, err := binary.ReadVarint(r)
		if err != nil {
			log.Warn("Could not decode chunk descriptor offset:", err)
			p.dirty = true
			return sm, chunksToPersist, nil
		}
		savedFirstTime, err := binary.ReadVarint(r)
		if err != nil {
			log.Warn("Could not decode saved first time:", err)
			p.dirty = true
			return sm, chunksToPersist, nil
		}
		numChunkDescs, err := binary.ReadVarint(r)
		if err != nil {
			log.Warn("Could not decode number of chunk descriptors:", err)
			p.dirty = true
			return sm, chunksToPersist, nil
		}
		chunkDescs := make([]*chunkDesc, numChunkDescs)
		if version == headsFormatLegacyVersion {
			if headChunkPersisted {
				persistWatermark = numChunkDescs
			} else {
				persistWatermark = numChunkDescs - 1
			}
		}

		for i := int64(0); i < numChunkDescs; i++ {
			if i < persistWatermark {
				firstTime, err := binary.ReadVarint(r)
				if err != nil {
					log.Warn("Could not decode first time:", err)
					p.dirty = true
					return sm, chunksToPersist, nil
				}
				lastTime, err := binary.ReadVarint(r)
				if err != nil {
					log.Warn("Could not decode last time:", err)
					p.dirty = true
					return sm, chunksToPersist, nil
				}
				chunkDescs[i] = &chunkDesc{
					chunkFirstTime: model.Time(firstTime),
					chunkLastTime:  model.Time(lastTime),
				}
				chunkDescsTotal++
			} else {
				// Non-persisted chunk.
				encoding, err := r.ReadByte()
				if err != nil {
					log.Warn("Could not decode chunk type:", err)
					p.dirty = true
					return sm, chunksToPersist, nil
				}
				chunk := newChunkForEncoding(chunkEncoding(encoding))
				if err := chunk.unmarshal(r); err != nil {
					log.Warn("Could not decode chunk:", err)
					p.dirty = true
					return sm, chunksToPersist, nil
				}
				chunkDescs[i] = newChunkDesc(chunk)
				chunksToPersist++
			}
		}

		fingerprintToSeries[model.Fingerprint(fp)] = &memorySeries{
			metric:           model.Metric(metric),
			chunkDescs:       chunkDescs,
			persistWatermark: int(persistWatermark),
			modTime:          modTime,
			chunkDescsOffset: int(chunkDescsOffset),
			savedFirstTime:   model.Time(savedFirstTime),
			lastTime:         chunkDescs[len(chunkDescs)-1].lastTime(),
			headChunkClosed:  persistWatermark >= numChunkDescs,
		}
	}
	return sm, chunksToPersist, nil
}