Example No. 1
// Send queues the given notification requests for processing.
// Panics if called on a handler that is not running.
func (n *Notifier) Send(alerts ...*model.Alert) {
	n.mtx.Lock()
	defer n.mtx.Unlock()

	// Queue capacity should be significantly larger than a single alert
	// batch could be.
	if d := len(alerts) - n.opts.QueueCapacity; d > 0 {
		alerts = alerts[d:]

		log.Warnf("Alert batch larger than queue capacity, dropping %d alerts", d)
		n.dropped.Add(float64(d))
	}

	// If the queue is full, remove the oldest alerts in favor
	// of newer ones.
	if d := (len(n.queue) + len(alerts)) - n.opts.QueueCapacity; d > 0 {
		n.queue = n.queue[d:]

		log.Warnf("Alert notification queue full, dropping %d alerts", d)
		n.dropped.Add(float64(d))
	}
	n.queue = append(n.queue, alerts...)

	// Notify sending goroutine that there are alerts to be processed.
	n.setMore()
}
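
The two capacity checks above implement a drop-oldest policy: an oversized batch is first trimmed to the queue capacity, and then the queue itself is trimmed to make room for the batch. A minimal, self-contained sketch of the same policy (the names enqueue and queueCapacity are illustrative, not taken from the Prometheus code):

package main

import "fmt"

const queueCapacity = 4

// enqueue appends items to queue, dropping the oldest entries whenever the
// combined length would exceed queueCapacity. It returns the new queue and
// the number of dropped items.
func enqueue(queue, items []int) ([]int, int) {
	dropped := 0
	// Trim the incoming batch if it alone exceeds capacity.
	if d := len(items) - queueCapacity; d > 0 {
		items = items[d:]
		dropped += d
	}
	// Trim the queue to make room for the batch.
	if d := len(queue) + len(items) - queueCapacity; d > 0 {
		queue = queue[d:]
		dropped += d
	}
	return append(queue, items...), dropped
}

func main() {
	q := []int{1, 2, 3}
	q, dropped := enqueue(q, []int{4, 5, 6})
	fmt.Println(q, dropped) // prints: [3 4 5 6] 2
}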
Example No. 2
// Append implements Storage.
func (s *memorySeriesStorage) Append(sample *model.Sample) {
	for ln, lv := range sample.Metric {
		if len(lv) == 0 {
			delete(sample.Metric, ln)
		}
	}
	if s.getNumChunksToPersist() >= s.maxChunksToPersist {
		log.Warnf(
			"%d chunks waiting for persistence, sample ingestion suspended.",
			s.getNumChunksToPersist(),
		)
		for s.getNumChunksToPersist() >= s.maxChunksToPersist {
			time.Sleep(time.Second)
		}
		log.Warn("Sample ingestion resumed.")
	}
	rawFP := sample.Metric.FastFingerprint()
	s.fpLocker.Lock(rawFP)
	fp, err := s.mapper.mapFP(rawFP, sample.Metric)
	if err != nil {
		log.Errorf("Error while mapping fingerprint %v: %v", rawFP, err)
		s.persistence.setDirty(true)
	}
	if fp != rawFP {
		// Switch locks.
		s.fpLocker.Unlock(rawFP)
		s.fpLocker.Lock(fp)
	}
	series := s.getOrCreateSeries(fp, sample.Metric)

	if sample.Timestamp <= series.lastTime {
		// Don't log and track equal timestamps, as they are a common occurrence
		// when using client-side timestamps (e.g. Pushgateway or federation).
		// It would be even better to also compare the sample values here, but
		// we don't have efficient access to a series's last value.
		if sample.Timestamp != series.lastTime {
			log.Warnf("Ignoring sample with out-of-order timestamp for fingerprint %v (%v): %v is not after %v", fp, series.metric, sample.Timestamp, series.lastTime)
			s.outOfOrderSamplesCount.Inc()
		}
		s.fpLocker.Unlock(fp)
		return
	}
	completedChunksCount := series.add(&model.SamplePair{
		Value:     sample.Value,
		Timestamp: sample.Timestamp,
	})
	s.fpLocker.Unlock(fp)
	s.ingestedSamplesCount.Inc()
	s.incNumChunksToPersist(completedChunksCount)
}
Example No. 3
func (t *StorageQueueManager) sendSamples(s model.Samples) {
	t.sendSemaphore <- true

	go func() {
		defer func() {
			<-t.sendSemaphore
		}()

		// Samples are sent to the remote storage on a best-effort basis. If a
		// sample isn't sent correctly the first time, it's simply dropped on the
		// floor.
		begin := time.Now()
		err := t.tsdb.Store(s)
		duration := time.Since(begin).Seconds()

		labelValue := success
		if err != nil {
			log.Warnf("error sending %d samples to remote storage: %s", len(s), err)
			labelValue = failure
			t.failedBatches.Inc()
			t.failedSamples.Add(float64(len(s)))
		}
		t.samplesCount.WithLabelValues(labelValue).Add(float64(len(s)))
		t.sendLatency.Observe(duration)
	}()
}
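
The sendSemaphore channel above is used as a counting semaphore that bounds the number of concurrent sends to remote storage. A minimal sketch of the same pattern, assuming a limit of three in-flight sends and a placeholder send function:

package main

import (
	"fmt"
	"sync"
	"time"
)

func main() {
	// A buffered channel used as a counting semaphore: sending a token
	// acquires a slot, receiving one releases it.
	sem := make(chan struct{}, 3)
	var wg sync.WaitGroup

	send := func(batch int) {
		time.Sleep(10 * time.Millisecond) // placeholder for the real send
		fmt.Println("sent batch", batch)
	}

	for i := 0; i < 10; i++ {
		sem <- struct{}{} // blocks while three sends are already in flight
		wg.Add(1)
		go func(batch int) {
			defer wg.Done()
			defer func() { <-sem }() // release the slot
			send(batch)
		}(i)
	}
	wg.Wait()
}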
Example No. 4
func lookupAll(name string, qtype uint16) (*dns.Msg, error) {
	conf, err := dns.ClientConfigFromFile(resolvConf)
	if err != nil {
		return nil, fmt.Errorf("could not load resolv.conf: %s", err)
	}

	client := &dns.Client{}
	response := &dns.Msg{}

	for _, server := range conf.Servers {
		servAddr := net.JoinHostPort(server, conf.Port)
		for _, suffix := range conf.Search {
			response, err = lookup(name, qtype, client, servAddr, suffix, false)
			if err != nil {
				log.Warnf("resolving %s.%s failed: %s", name, suffix, err)
				continue
			}
			if len(response.Answer) > 0 {
				return response, nil
			}
		}
		response, err = lookup(name, qtype, client, servAddr, "", false)
		if err == nil {
			return response, nil
		}
	}
	return response, fmt.Errorf("could not resolve %s: No server responded", name)
}
Example No. 5
// Run dispatches notifications continuously.
func (n *Notifier) Run() {
	numAMs := len(n.opts.AlertmanagerURLs)
	// Just warn once in the beginning to prevent noisy logs.
	if numAMs == 0 {
		log.Warnf("No AlertManagers configured, not dispatching any alerts")
		return
	}

	for {
		select {
		case <-n.ctx.Done():
			return
		case <-n.more:
		}
		alerts := n.nextBatch()

		if numAMs > 0 {
			if len(alerts) > 0 {
				numErrors := n.sendAll(alerts...)
				// Increment the dropped counter if we could not send
				// successfully to a single AlertManager.
				if numErrors == numAMs {
					n.dropped.Add(float64(len(alerts)))
				}
			}
		} else {
			n.dropped.Add(float64(len(alerts)))
		}
		// If the queue still has items left, kick off the next iteration.
		if n.queueLen() > 0 {
			n.setMore()
		}
	}
}
Example No. 6
// Store sends a batch of samples to Graphite.
func (c *Client) Store(samples model.Samples) error {
	conn, err := net.DialTimeout(c.transport, c.address, c.timeout)
	if err != nil {
		return err
	}
	defer conn.Close()

	var buf bytes.Buffer
	for _, s := range samples {
		k := pathFromMetric(s.Metric, c.prefix)
		t := float64(s.Timestamp.UnixNano()) / 1e9
		v := float64(s.Value)
		if math.IsNaN(v) || math.IsInf(v, 0) {
			log.Warnf("cannot send value %f to Graphite, "+
				"skipping sample %#v", v, s)
			continue
		}
		fmt.Fprintf(&buf, "%s %f %f\n", k, v, t)
	}

	_, err = conn.Write(buf.Bytes())
	if err != nil {
		return err
	}

	return nil
}
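
The loop above serializes each sample into Graphite's plaintext protocol (one "<path> <value> <timestamp>" line per sample), skipping NaN and infinite values that the protocol cannot represent. A minimal sketch of just that formatting step, with made-up sample data:

package main

import (
	"bytes"
	"fmt"
	"math"
)

func main() {
	samples := []struct {
		path  string
		value float64
		ts    float64 // seconds since the epoch
	}{
		{"prometheus.http_requests_total", 42, 1.4e9},
		{"prometheus.broken_metric", math.NaN(), 1.4e9},
	}

	var buf bytes.Buffer
	for _, s := range samples {
		// NaN and Inf have no representation in the plaintext protocol.
		if math.IsNaN(s.value) || math.IsInf(s.value, 0) {
			fmt.Printf("cannot send value %f to Graphite, skipping %s\n", s.value, s.path)
			continue
		}
		fmt.Fprintf(&buf, "%s %f %f\n", s.path, s.value, s.ts)
	}
	fmt.Print(buf.String())
}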
Example No. 7
// Store sends a batch of samples to OpenTSDB via its HTTP API.
func (c *Client) Store(samples model.Samples) error {
	reqs := make([]StoreSamplesRequest, 0, len(samples))
	for _, s := range samples {
		v := float64(s.Value)
		if math.IsNaN(v) || math.IsInf(v, 0) {
			log.Warnf("cannot send value %f to OpenTSDB, skipping sample %#v", v, s)
			continue
		}
		metric := TagValue(s.Metric[model.MetricNameLabel])
		reqs = append(reqs, StoreSamplesRequest{
			Metric:    metric,
			Timestamp: s.Timestamp.Unix(),
			Value:     v,
			Tags:      tagsFromMetric(s.Metric),
		})
	}

	u, err := url.Parse(c.url)
	if err != nil {
		return err
	}

	u.Path = putEndpoint

	buf, err := json.Marshal(reqs)
	if err != nil {
		return err
	}

	resp, err := c.httpClient.Post(
		u.String(),
		contentTypeJSON,
		bytes.NewBuffer(buf),
	)
	if err != nil {
		return err
	}
	defer resp.Body.Close()

	// API returns status code 204 for successful writes.
	// http://opentsdb.net/docs/build/html/api_http/put.html
	if resp.StatusCode == http.StatusNoContent {
		return nil
	}

	// API returns status code 400 on error, encoding error details in the
	// response content in JSON.
	buf, err = ioutil.ReadAll(resp.Body)
	if err != nil {
		return err
	}

	var r map[string]int
	if err := json.Unmarshal(buf, &r); err != nil {
		return err
	}
	return fmt.Errorf("failed to write %d samples to OpenTSDB, %d succeeded", r["failed"], r["success"])
}
Example No. 8
func (m *Manager) GetRuleAlertNotifications(rule *AlertingRule, timestamp model.Time) notification.NotificationReqs {
	activeAlerts := rule.ActiveAlerts()
	if len(activeAlerts) == 0 {
		return notification.NotificationReqs{}
	}

	notifications := make(notification.NotificationReqs, 0, len(activeAlerts))
	for _, aa := range activeAlerts {
		if aa.State != StateFiring {
			// BUG: In the future, make AlertManager support pending alerts?
			continue
		}

		// Provide the alert information to the template.
		l := map[string]string{}
		for k, v := range aa.Labels {
			l[string(k)] = string(v)
		}
		tmplData := struct {
			Labels map[string]string
			Value  float64
		}{
			Labels: l,
			Value:  float64(aa.Value),
		}
		// Inject some convenience variables that are easier to remember for users
		// who are not used to Go's templating system.
		defs := "{{$labels := .Labels}}{{$value := .Value}}"

		expand := func(text string) string {
			tmpl := template.NewTemplateExpander(defs+text, "__alert_"+rule.Name(), tmplData, timestamp, m.queryEngine, m.externalURL.Path)
			result, err := tmpl.Expand()
			if err != nil {
				result = err.Error()
				log.Warnf("Error expanding alert template %v with data '%v': %v", rule.Name(), tmplData, err)
			}
			return result
		}

		notifications = append(notifications, &notification.NotificationReq{
			Summary:     expand(rule.summary),
			Description: expand(rule.description),
			Runbook:     rule.runbook,
			Labels: aa.Labels.Merge(model.LabelSet{
				alertNameLabel: model.LabelValue(rule.Name()),
			}),
			Value:        aa.Value,
			ActiveSince:  aa.ActiveSince.Time(),
			RuleString:   rule.String(),
			GeneratorURL: m.externalURL.String() + strutil.GraphLinkForExpression(rule.vector.String()),
		})
	}
	return notifications
}
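
The defs prefix above pre-binds $labels and $value so that alert templates can reference them directly. A standard-library sketch of the same trick using text/template (the template text and data are made up; the real code goes through Prometheus' template.NewTemplateExpander):

package main

import (
	"fmt"
	"os"
	"text/template"
)

func main() {
	data := struct {
		Labels map[string]string
		Value  float64
	}{
		Labels: map[string]string{"instance": "host:9090"},
		Value:  0.95,
	}

	// Prepend variable definitions so the template body can use $labels and $value.
	defs := "{{$labels := .Labels}}{{$value := .Value}}"
	text := "Instance {{$labels.instance}} has error rate {{$value}}"

	tmpl, err := template.New("alert").Parse(defs + text)
	if err != nil {
		fmt.Fprintln(os.Stderr, err)
		return
	}
	if err := tmpl.Execute(os.Stdout, data); err != nil {
		fmt.Fprintln(os.Stderr, err)
	}
	fmt.Println()
}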
Example No. 9
// Send queues the given notification requests for processing.
// Panics if called on a handler that is not running.
func (n *Notifier) Send(alerts ...*model.Alert) {
	n.mtx.Lock()
	defer n.mtx.Unlock()

	// Attach external labels before relabelling and sending.
	for _, a := range alerts {
		for ln, lv := range n.opts.ExternalLabels {
			if _, ok := a.Labels[ln]; !ok {
				a.Labels[ln] = lv
			}
		}
	}

	alerts = n.relabelAlerts(alerts)

	// Queue capacity should be significantly larger than a single alert
	// batch could be.
	if d := len(alerts) - n.opts.QueueCapacity; d > 0 {
		alerts = alerts[d:]

		log.Warnf("Alert batch larger than queue capacity, dropping %d alerts", d)
		n.dropped.Add(float64(d))
	}

	// If the queue is full, remove the oldest alerts in favor
	// of newer ones.
	if d := (len(n.queue) + len(alerts)) - n.opts.QueueCapacity; d > 0 {
		n.queue = n.queue[d:]

		log.Warnf("Alert notification queue full, dropping %d alerts", d)
		n.dropped.Add(float64(d))
	}
	n.queue = append(n.queue, alerts...)

	// Notify sending goroutine that there are alerts to be processed.
	n.setMore()
}
Example No. 10
// eval runs a single evaluation cycle in which all rules are evaluated in parallel.
// In the future a single group will be evaluated sequentially to properly handle
// rule dependency.
func (g *Group) eval() {
	var (
		now = model.Now()
		wg  sync.WaitGroup
	)

	for _, rule := range g.rules {
		wg.Add(1)
		// BUG(julius): Look at fixing thundering herd.
		go func(rule Rule) {
			defer wg.Done()

			start := time.Now()
			evalTotal.Inc()

			vector, err := rule.eval(now, g.opts.QueryEngine)
			if err != nil {
				// Canceled queries are intentional termination of queries. This normally
				// happens on shutdown and thus we skip logging of any errors here.
				if _, ok := err.(promql.ErrQueryCanceled); !ok {
					log.Warnf("Error while evaluating rule %q: %s", rule, err)
				}
				evalFailures.Inc()
			}
			var rtyp ruleType

			switch r := rule.(type) {
			case *AlertingRule:
				rtyp = ruleTypeAlert
				g.sendAlerts(r, now)

			case *RecordingRule:
				rtyp = ruleTypeRecording

			default:
				panic(fmt.Errorf("unknown rule type: %T", rule))
			}

			evalDuration.WithLabelValues(string(rtyp)).Observe(
				float64(time.Since(start)) / float64(time.Second),
			)

			for _, s := range vector {
				g.opts.SampleAppender.Append(s)
			}
		}(rule)
	}
	wg.Wait()
}
Example No. 11
// isDegraded returns whether the storage is in "graceful degradation mode",
// which is the case if the number of chunks waiting for persistence has reached
// a percentage of maxChunksToPersist that exceeds
// percentChunksToPersistForDegradation. The method is not goroutine safe (but
// only ever called from the goroutine dealing with series maintenance).
// Changes of degradation mode are logged.
func (s *memorySeriesStorage) isDegraded() bool {
	nowDegraded := s.getNumChunksToPersist() > s.maxChunksToPersist*percentChunksToPersistForDegradation/100
	if s.degraded && !nowDegraded {
		log.Warn("Storage has left graceful degradation mode. Things are back to normal.")
	} else if !s.degraded && nowDegraded {
		log.Warnf(
			"%d chunks waiting for persistence (%d%% of the allowed maximum %d). Storage is now in graceful degradation mode. Series files are not synced anymore if following the adaptive strategy. Checkpoints are not performed more often than every %v. Series maintenance happens as frequently as possible.",
			s.getNumChunksToPersist(),
			s.getNumChunksToPersist()*100/s.maxChunksToPersist,
			s.maxChunksToPersist,
			s.checkpointInterval)
	}
	s.degraded = nowDegraded
	return s.degraded
}
Example No. 12
// maybeAddMapping adds a fingerprint mapping to fpm if the FastFingerprint of m is different from fp.
func maybeAddMapping(fp model.Fingerprint, m model.Metric, fpm fpMappings) {
	if rawFP := m.FastFingerprint(); rawFP != fp {
		log.Warnf(
			"Metric %v with fingerprint %v is mapped from raw fingerprint %v.",
			m, fp, rawFP,
		)
		if mappedFPs, ok := fpm[rawFP]; ok {
			mappedFPs[metricToUniqueString(m)] = fp
		} else {
			fpm[rawFP] = map[string]model.Fingerprint{
				metricToUniqueString(m): fp,
			}
		}
	}
}
Example No. 13
func (sl *scrapeLoop) append(samples model.Samples) {
	numOutOfOrder := 0

	for _, s := range samples {
		if err := sl.appender.Append(s); err != nil {
			if err == local.ErrOutOfOrderSample {
				numOutOfOrder++
			} else {
				log.Warnf("Error inserting sample: %s", err)
			}
		}
	}
	if numOutOfOrder > 0 {
		log.With("numDropped", numOutOfOrder).Warn("Error on ingesting out-of-order samples")
	}
}
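
Instead of logging every out-of-order sample, the loop above counts them and emits a single summary warning, which keeps scrape logs quiet under sustained clock skew. A minimal sketch of the same idea with the standard library logger (errOutOfOrder and appendSample are illustrative stand-ins):

package main

import (
	"errors"
	"log"
)

var errOutOfOrder = errors.New("out-of-order sample")

// appendSample is a stand-in for the real appender; it rejects odd values.
func appendSample(v int) error {
	if v%2 == 1 {
		return errOutOfOrder
	}
	return nil
}

func main() {
	numOutOfOrder := 0
	for _, v := range []int{1, 2, 3, 4, 5} {
		if err := appendSample(v); err != nil {
			if errors.Is(err, errOutOfOrder) {
				numOutOfOrder++ // count instead of logging each occurrence
				continue
			}
			log.Printf("error inserting sample: %s", err)
		}
	}
	if numOutOfOrder > 0 {
		log.Printf("dropped %d out-of-order samples", numOutOfOrder)
	}
}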
Example No. 14
func (t *StorageQueueManager) sendSamples(s model.Samples) {
	// Samples are sent to the remote storage on a best-effort basis. If a
	// sample isn't sent correctly the first time, it's simply dropped on the
	// floor.
	begin := time.Now()
	err := t.tsdb.Store(s)
	duration := time.Since(begin).Seconds()

	labelValue := success
	if err != nil {
		log.Warnf("error sending %d samples to remote storage: %s", len(s), err)
		labelValue = failure
	}
	t.sentSamplesTotal.WithLabelValues(labelValue).Add(float64(len(s)))
	t.sentBatchDuration.WithLabelValues(labelValue).Observe(duration)
}
Example No. 15
func (m *Manager) runIteration() {
	now := model.Now()
	wg := sync.WaitGroup{}

	m.Lock()
	rulesSnapshot := make([]Rule, len(m.rules))
	copy(rulesSnapshot, m.rules)
	m.Unlock()

	for _, rule := range rulesSnapshot {
		wg.Add(1)
		// BUG(julius): Look at fixing thundering herd.
		go func(rule Rule) {
			defer wg.Done()

			start := time.Now()
			vector, err := rule.eval(now, m.queryEngine)
			duration := time.Since(start)

			if err != nil {
				evalFailures.Inc()
				log.Warnf("Error while evaluating rule %q: %s", rule, err)
				return
			}

			switch r := rule.(type) {
			case *AlertingRule:
				m.queueAlertNotifications(r, now)
				evalDuration.WithLabelValues(ruleTypeAlerting).Observe(
					float64(duration / time.Millisecond),
				)
			case *RecordingRule:
				evalDuration.WithLabelValues(ruleTypeRecording).Observe(
					float64(duration / time.Millisecond),
				)
			default:
				panic(fmt.Errorf("unknown rule type: %T", rule))
			}

			for _, s := range vector {
				m.sampleAppender.Append(s)
			}
		}(rule)
	}
	wg.Wait()
}
Example No. 16
func (dd *Discovery) refresh(ctx context.Context, name string, ch chan<- []*config.TargetGroup) error {
	response, err := lookupAll(name, dd.qtype)
	dnsSDLookupsCount.Inc()
	if err != nil {
		dnsSDLookupFailuresCount.Inc()
		return err
	}

	tg := &config.TargetGroup{}
	hostPort := func(a string, p int) model.LabelValue {
		return model.LabelValue(net.JoinHostPort(a, fmt.Sprintf("%d", p)))
	}

	for _, record := range response.Answer {
		target := model.LabelValue("")
		switch addr := record.(type) {
		case *dns.SRV:
			// Remove the final dot from rooted DNS names to make them look more usual.
			addr.Target = strings.TrimRight(addr.Target, ".")

			target = hostPort(addr.Target, int(addr.Port))
		case *dns.A:
			target = hostPort(addr.A.String(), dd.port)
		case *dns.AAAA:
			target = hostPort(addr.AAAA.String(), dd.port)
		default:
			log.Warnf("%q is not a valid SRV record", record)
			continue

		}
		tg.Targets = append(tg.Targets, model.LabelSet{
			model.AddressLabel: target,
			dnsNameLabel:       model.LabelValue(name),
		})
	}

	tg.Source = name
	select {
	case <-ctx.Done():
		return ctx.Err()
	case ch <- []*config.TargetGroup{tg}:
	}

	return nil
}
Example No. 17
// eval runs a single evaluation cycle in which all rules are evaluated in parallel.
// In the future a single group will be evaluated sequentially to properly handle
// rule dependency.
func (g *Group) eval() {
	var (
		now = model.Now()
		wg  sync.WaitGroup
	)

	for _, rule := range g.rules {
		wg.Add(1)
		// BUG(julius): Look at fixing thundering herd.
		go func(rule Rule) {
			defer wg.Done()

			start := time.Now()
			evalTotal.Inc()

			vector, err := rule.eval(now, g.opts.QueryEngine)
			if err != nil {
				evalFailures.Inc()
				log.Warnf("Error while evaluating rule %q: %s", rule, err)
			}
			var rtyp ruleType

			switch r := rule.(type) {
			case *AlertingRule:
				rtyp = ruleTypeAlert
				g.sendAlerts(r, now)

			case *RecordingRule:
				rtyp = ruleTypeRecording

			default:
				panic(fmt.Errorf("unknown rule type: %T", rule))
			}

			evalDuration.WithLabelValues(string(rtyp)).Observe(
				float64(time.Since(start)) / float64(time.Second),
			)

			for _, s := range vector {
				g.opts.SampleAppender.Append(s)
			}
		}(rule)
	}
	wg.Wait()
}
Example No. 18
// GetMetricFamilies implements the MetricStore interface.
func (dms *DiskMetricStore) GetMetricFamilies() []*dto.MetricFamily {
	result := []*dto.MetricFamily{}
	mfStatByName := map[string]mfStat{}

	dms.lock.RLock()
	defer dms.lock.RUnlock()

	for _, group := range dms.metricGroups {
		for name, tmf := range group.Metrics {
			mf := tmf.MetricFamily
			stat, exists := mfStatByName[name]
			if exists {
				existingMF := result[stat.pos]
				if !stat.copied {
					mfStatByName[name] = mfStat{
						pos:    stat.pos,
						copied: true,
					}
					existingMF = copyMetricFamily(existingMF)
					result[stat.pos] = existingMF
				}
				if mf.GetHelp() != existingMF.GetHelp() || mf.GetType() != existingMF.GetType() {
					log.Warnf(
						"Metric families '%s' and '%s' are inconsistent, help and type of the latter will have priority. This is bad. Fix your pushed metrics!",
						mf, existingMF,
					)
				}
				for _, metric := range mf.Metric {
					existingMF.Metric = append(existingMF.Metric, metric)
				}
			} else {
				mfStatByName[name] = mfStat{
					pos:    len(result),
					copied: false,
				}
				result = append(result, mf)
			}
		}
	}
	return result
}
Example No. 19
// eval runs a single evaluation cycle in which all rules are evaluated in parallel.
// In the future a single group will be evaluated sequentially to properly handle
// rule dependency.
func (g *Group) eval() {
	var (
		now = model.Now()
		wg  sync.WaitGroup
	)

	for _, rule := range g.rules {
		rtyp := string(typeForRule(rule))

		wg.Add(1)
		// BUG(julius): Look at fixing thundering herd.
		go func(rule Rule) {
			defer wg.Done()

			defer func(t time.Time) {
				evalDuration.WithLabelValues(rtyp).Observe(float64(time.Since(t)) / float64(time.Second))
			}(time.Now())

			evalTotal.WithLabelValues(rtyp).Inc()

			vector, err := rule.eval(now, g.opts.QueryEngine)
			if err != nil {
				// Canceled queries are intentional termination of queries. This normally
				// happens on shutdown and thus we skip logging of any errors here.
				if _, ok := err.(promql.ErrQueryCanceled); !ok {
					log.Warnf("Error while evaluating rule %q: %s", rule, err)
				}
				evalFailures.WithLabelValues(rtyp).Inc()
				return
			}

			if ar, ok := rule.(*AlertingRule); ok {
				g.sendAlerts(ar, now)
			}
			for _, s := range vector {
				g.opts.SampleAppender.Append(s)
			}
		}(rule)
	}
	wg.Wait()
}
Example No. 20
// Run dispatches notifications continuously.
func (n *Handler) Run() {
	// Just warn once in the beginning to prevent noisy logs.
	if n.opts.AlertmanagerURL == "" {
		log.Warnf("No AlertManager configured, not dispatching any alerts")
	}

	for {
		select {
		case <-n.ctx.Done():
			return
		case <-n.more:
		}

		alerts := n.nextBatch()

		if len(alerts) == 0 {
			continue
		}
		if n.opts.AlertmanagerURL == "" {
			n.dropped.Add(float64(len(alerts)))
			continue
		}

		begin := time.Now()

		if err := n.send(alerts...); err != nil {
			log.Errorf("Error sending %d alerts: %s", len(alerts), err)
			n.errors.Inc()
			n.dropped.Add(float64(len(alerts)))
		}

		n.latency.Observe(float64(time.Since(begin)) / float64(time.Second))
		n.sent.Add(float64(len(alerts)))

		// If the queue still has items left, kick off the next iteration.
		if n.queueLen() > 0 {
			n.setMore()
		}
	}
}
Example No. 21
// UnmarshalYAML implements the yaml.Unmarshaler interface.
func (c *ScrapeConfig) UnmarshalYAML(unmarshal func(interface{}) error) error {
	*c = DefaultScrapeConfig
	type plain ScrapeConfig
	err := unmarshal((*plain)(c))
	if err != nil {
		return err
	}
	if err := checkOverflow(c.XXX, "scrape_config"); err != nil {
		return err
	}
	if !patJobName.MatchString(c.JobName) {
		return fmt.Errorf("%q is not a valid job name", c.JobName)
	}
	if len(c.BearerToken) > 0 && len(c.BearerTokenFile) > 0 {
		return fmt.Errorf("at most one of bearer_token & bearer_token_file must be configured")
	}
	if c.BasicAuth != nil && (len(c.BearerToken) > 0 || len(c.BearerTokenFile) > 0) {
		return fmt.Errorf("at most one of basic_auth, bearer_token & bearer_token_file must be configured")
	}
	// Check `target_groups` deprecation.
	if c.TargetGroups != nil && c.StaticConfigs != nil {
		return fmt.Errorf("'target_groups' is deprecated, configure static targets via 'static_configs' only")
	}
	if c.TargetGroups != nil {
		log.Warnf("The 'target_groups' option for scrape configurations is deprecated, use 'static_configs' instead")
		c.StaticConfigs = c.TargetGroups
	}
	// Check for users putting URLs in target groups.
	if len(c.RelabelConfigs) == 0 {
		for _, tg := range c.StaticConfigs {
			for _, t := range tg.Targets {
				if err = CheckTargetAddress(t[model.AddressLabel]); err != nil {
					return err
				}
			}
		}
	}
	return nil
}
Example No. 22
func (dd *DNSDiscovery) refresh(name string, ch chan<- []*config.TargetGroup) error {
	response, err := lookupAll(name, dd.qtype)
	dnsSDLookupsCount.Inc()
	if err != nil {
		dnsSDLookupFailuresCount.Inc()
		return err
	}

	tg := &config.TargetGroup{}

	for _, record := range response.Answer {
		target := model.LabelValue("")
		switch addr := record.(type) {
		case *dns.SRV:
			// Remove the final dot from rooted DNS names to make them look more usual.
			addr.Target = strings.TrimRight(addr.Target, ".")

			target = model.LabelValue(fmt.Sprintf("%s:%d", addr.Target, addr.Port))
		case *dns.A:
			target = model.LabelValue(fmt.Sprintf("%s:%d", addr.A, dd.port))
		case *dns.AAAA:
			target = model.LabelValue(fmt.Sprintf("%s:%d", addr.AAAA, dd.port))
		default:
			log.Warnf("%q is not a valid SRV record", record)
			continue

		}
		tg.Targets = append(tg.Targets, model.LabelSet{
			model.AddressLabel: target,
			dnsNameLabel:       model.LabelValue(name),
		})
	}

	tg.Source = name
	ch <- []*config.TargetGroup{tg}

	return nil
}
Example No. 23
// Notify calls the underlying notifier with exponential backoff until it succeeds.
// It aborts if the context is canceled or timed out.
func (n *RetryNotifier) Notify(ctx context.Context, alerts ...*types.Alert) error {
	var (
		i    = 0
		b    = backoff.NewExponentialBackOff()
		tick = backoff.NewTicker(b)
	)
	defer tick.Stop()

	for {
		i++

		select {
		case <-tick.C:
			if err := n.notifier.Notify(ctx, alerts...); err != nil {
				log.Warnf("Notify attempt %d failed: %s", i, err)
			} else {
				return nil
			}
		case <-ctx.Done():
			return ctx.Err()
		}
	}
}
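
The retry loop above re-invokes the notifier on every backoff tick until it succeeds or the context ends. A standard-library-only sketch of the same shape, using a doubling delay instead of the backoff package (notify is a placeholder that fails twice before succeeding):

package main

import (
	"context"
	"errors"
	"fmt"
	"time"
)

// notify is a placeholder that fails the first two attempts.
func notify(attempt int) error {
	if attempt < 3 {
		return errors.New("temporary failure")
	}
	return nil
}

func notifyWithRetry(ctx context.Context) error {
	delay := 10 * time.Millisecond
	for attempt := 1; ; attempt++ {
		select {
		case <-time.After(delay):
			if err := notify(attempt); err != nil {
				fmt.Printf("notify attempt %d failed: %s\n", attempt, err)
				delay *= 2 // exponential backoff
				continue
			}
			return nil
		case <-ctx.Done():
			return ctx.Err()
		}
	}
}

func main() {
	ctx, cancel := context.WithTimeout(context.Background(), time.Second)
	defer cancel()
	fmt.Println(notifyWithRetry(ctx))
}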
Example No. 24
// sanitizeSeries sanitizes a series based on its series file as defined by the
// provided directory and FileInfo.  The method returns the fingerprint as
// derived from the directory and file name, and whether the provided file has
// been sanitized. A file that failed to be sanitized is moved into the
// "orphaned" sub-directory, if possible.
//
// The following steps are performed:
//
// - A file whose name doesn't comply with the naming scheme of a series file is
//   simply moved into the orphaned directory.
//
// - If the size of the series file isn't a multiple of the chunk size,
//   extraneous bytes are truncated.  If the truncation fails, the file is
//   moved into the orphaned directory.
//
// - A file that is empty (after truncation) is deleted.
//
// - A series that is not archived (i.e. it is in the fingerprintToSeries map)
//   is checked for consistency of its various parameters (like persist
//   watermark, offset of chunkDescs etc.). In particular, overlap between an
//   in-memory head chunk with the most recent persisted chunk is
//   checked. Inconsistencies are rectified.
//
// - A series that is archived (i.e. it is not in the fingerprintToSeries map)
//   is checked for its presence in the index of archived series. If it cannot
//   be found there, it is moved into the orphaned directory.
func (p *persistence) sanitizeSeries(
	dirname string, fi os.FileInfo,
	fingerprintToSeries map[model.Fingerprint]*memorySeries,
	fpm fpMappings,
) (model.Fingerprint, bool) {
	var (
		fp       model.Fingerprint
		err      error
		filename = filepath.Join(dirname, fi.Name())
		s        *memorySeries
	)

	purge := func() {
		if fp != 0 {
			var metric model.Metric
			if s != nil {
				metric = s.metric
			}
			if err = p.quarantineSeriesFile(
				fp, errors.New("purge during crash recovery"), metric,
			); err == nil {
				return
			}
			log.
				With("file", filename).
				With("error", err).
				Error("Failed to move lost series file to orphaned directory.")
		}
		// If we are here, we are either purging an incorrectly named
		// file, or quarantining has failed. So simply delete the file.
		if err = os.Remove(filename); err != nil {
			log.
				With("file", filename).
				With("error", err).
				Error("Failed to delete lost series file.")
		}
	}

	if len(fi.Name()) != fpLen-seriesDirNameLen+len(seriesFileSuffix) ||
		!strings.HasSuffix(fi.Name(), seriesFileSuffix) {
		log.Warnf("Unexpected series file name %s.", filename)
		purge()
		return fp, false
	}
	if fp, err = model.FingerprintFromString(filepath.Base(dirname) + fi.Name()[:fpLen-seriesDirNameLen]); err != nil {
		log.Warnf("Error parsing file name %s: %s", filename, err)
		purge()
		return fp, false
	}

	bytesToTrim := fi.Size() % int64(chunkLenWithHeader)
	chunksInFile := int(fi.Size()) / chunkLenWithHeader
	modTime := fi.ModTime()
	if bytesToTrim != 0 {
		log.Warnf(
			"Truncating file %s to exactly %d chunks, trimming %d extraneous bytes.",
			filename, chunksInFile, bytesToTrim,
		)
		f, err := os.OpenFile(filename, os.O_WRONLY, 0640)
		if err != nil {
			log.Errorf("Could not open file %s: %s", filename, err)
			purge()
			return fp, false
		}
		if err := f.Truncate(fi.Size() - bytesToTrim); err != nil {
			log.Errorf("Failed to truncate file %s: %s", filename, err)
			purge()
			return fp, false
		}
	}
	if chunksInFile == 0 {
		log.Warnf("No chunks left in file %s.", filename)
		purge()
		return fp, false
	}

	s, ok := fingerprintToSeries[fp]
	if ok { // This series is supposed to not be archived.
		if s == nil {
			panic("fingerprint mapped to nil pointer")
		}
		maybeAddMapping(fp, s.metric, fpm)
		if !p.pedanticChecks &&
			bytesToTrim == 0 &&
			s.chunkDescsOffset != -1 &&
			chunksInFile == s.chunkDescsOffset+s.persistWatermark &&
			modTime.Equal(s.modTime) {
			// Everything is consistent. We are good.
			return fp, true
		}
		// If we are here, we cannot be sure the series file is
		// consistent with the checkpoint, so we have to take a closer
		// look.
		if s.headChunkClosed {
			// This is the easy case as we have all chunks on
			// disk. Treat this series as a freshly unarchived one
			// by loading the chunkDescs and setting all parameters
			// based on the loaded chunkDescs.
			cds, err := p.loadChunkDescs(fp, 0)
			if err != nil {
				log.Errorf(
					"Failed to load chunk descriptors for metric %v, fingerprint %v: %s",
					s.metric, fp, err,
				)
				purge()
				return fp, false
			}
			log.Warnf(
				"Treating recovered metric %v, fingerprint %v, as freshly unarchived, with %d chunks in series file.",
				s.metric, fp, len(cds),
			)
			s.chunkDescs = cds
			s.chunkDescsOffset = 0
			s.savedFirstTime = cds[0].FirstTime()
			s.lastTime, err = cds[len(cds)-1].LastTime()
			if err != nil {
				log.Errorf(
					"Failed to determine time of the last sample for metric %v, fingerprint %v: %s",
					s.metric, fp, err,
				)
				purge()
				return fp, false
			}
			s.persistWatermark = len(cds)
			s.modTime = modTime
			return fp, true
		}
		// This is the tricky one: We have chunks from heads.db, but
		// some of those chunks might already be in the series
		// file. Strategy: Take the last time of the most recent chunk
		// in the series file. Then find the oldest chunk among those
		// from heads.db that has a first time later or equal to the
		// last time from the series file. Throw away the older chunks
		// from heads.db and stitch the parts together.

		// First, throw away the chunkDescs without chunks.
		s.chunkDescs = s.chunkDescs[s.persistWatermark:]
		chunk.NumMemDescs.Sub(float64(s.persistWatermark))
		cds, err := p.loadChunkDescs(fp, 0)
		if err != nil {
			log.Errorf(
				"Failed to load chunk descriptors for metric %v, fingerprint %v: %s",
				s.metric, fp, err,
			)
			purge()
			return fp, false
		}
		s.persistWatermark = len(cds)
		s.chunkDescsOffset = 0
		s.savedFirstTime = cds[0].FirstTime()
		s.modTime = modTime

		lastTime, err := cds[len(cds)-1].LastTime()
		if err != nil {
			log.Errorf(
				"Failed to determine time of the last sample for metric %v, fingerprint %v: %s",
				s.metric, fp, err,
			)
			purge()
			return fp, false
		}
		keepIdx := -1
		for i, cd := range s.chunkDescs {
			if cd.FirstTime() >= lastTime {
				keepIdx = i
				break
			}
		}
		if keepIdx == -1 {
			log.Warnf(
				"Recovered metric %v, fingerprint %v: all %d chunks recovered from series file.",
				s.metric, fp, chunksInFile,
			)
			chunk.NumMemDescs.Sub(float64(len(s.chunkDescs)))
			atomic.AddInt64(&chunk.NumMemChunks, int64(-len(s.chunkDescs)))
			s.chunkDescs = cds
			s.headChunkClosed = true
			return fp, true
		}
		log.Warnf(
			"Recovered metric %v, fingerprint %v: recovered %d chunks from series file, recovered %d chunks from checkpoint.",
			s.metric, fp, chunksInFile, len(s.chunkDescs)-keepIdx,
		)
		chunk.NumMemDescs.Sub(float64(keepIdx))
		atomic.AddInt64(&chunk.NumMemChunks, int64(-keepIdx))
		if keepIdx == len(s.chunkDescs) {
			// No chunks from series file left, head chunk is evicted, so declare it closed.
			s.headChunkClosed = true
		}
		s.chunkDescs = append(cds, s.chunkDescs[keepIdx:]...)
		return fp, true
	}
	// This series is supposed to be archived.
	metric, err := p.archivedMetric(fp)
	if err != nil {
		log.Errorf(
			"Fingerprint %v assumed archived but couldn't be looked up in archived index: %s",
			fp, err,
		)
		purge()
		return fp, false
	}
	if metric == nil {
		log.Warnf(
			"Fingerprint %v assumed archived but couldn't be found in archived index.",
			fp,
		)
		purge()
		return fp, false
	}
	// This series looks like a properly archived one.
	maybeAddMapping(fp, metric, fpm)
	return fp, true
}
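
One of the sanitization steps above truncates a series file whose size is not a multiple of the chunk length. A minimal sketch of that truncation step on a throwaway file (the 1024-byte chunk length is made up; the real value comes from chunkLenWithHeader):

package main

import (
	"fmt"
	"log"
	"os"
)

const chunkLen = 1024 // illustrative chunk length

func main() {
	f, err := os.CreateTemp("", "series")
	if err != nil {
		log.Fatal(err)
	}
	defer os.Remove(f.Name())

	// Write two full chunks plus some trailing garbage.
	if _, err := f.Write(make([]byte, 2*chunkLen+100)); err != nil {
		log.Fatal(err)
	}

	fi, err := f.Stat()
	if err != nil {
		log.Fatal(err)
	}
	// Trim the file back to a whole number of chunks.
	if trim := fi.Size() % chunkLen; trim != 0 {
		fmt.Printf("trimming %d extraneous bytes\n", trim)
		if err := f.Truncate(fi.Size() - trim); err != nil {
			log.Fatal(err)
		}
	}
	f.Close()
}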
Example No. 25
// alertStmt parses an alert rule.
//
//		ALERT name IF expr [FOR duration]
//			[LABELS label_set]
//			[ANNOTATIONS label_set]
//
func (p *parser) alertStmt() *AlertStmt {
	const ctx = "alert statement"

	p.expect(itemAlert, ctx)
	name := p.expect(itemIdentifier, ctx)
	// Alerts require a vector typed expression.
	p.expect(itemIf, ctx)
	expr := p.expr()

	// Optional for clause.
	var (
		duration time.Duration
		err      error
	)
	if p.peek().typ == itemFor {
		p.next()
		dur := p.expect(itemDuration, ctx)
		duration, err = parseDuration(dur.val)
		if err != nil {
			p.error(err)
		}
	}

	// Accepting WITH instead of LABELS is temporary compatibility
	// with the old alerting syntax.
	var (
		hasLabels   bool
		oldSyntax   bool
		labels      = model.LabelSet{}
		annotations = model.LabelSet{}
	)
	if t := p.peek().typ; t == itemLabels {
		p.expect(itemLabels, ctx)
		labels = p.labelSet()
		hasLabels = true
	} else if t == itemWith {
		p.expect(itemWith, ctx)
		labels = p.labelSet()
		oldSyntax = true
	}

	// Only allow old annotation syntax if new label syntax isn't used.
	if !hasLabels {
	Loop:
		for {
			switch p.next().typ {
			case itemSummary:
				annotations["summary"] = model.LabelValue(p.unquoteString(p.expect(itemString, ctx).val))

			case itemDescription:
				annotations["description"] = model.LabelValue(p.unquoteString(p.expect(itemString, ctx).val))

			case itemRunbook:
				annotations["runbook"] = model.LabelValue(p.unquoteString(p.expect(itemString, ctx).val))

			default:
				p.backup()
				break Loop
			}
		}
		if len(annotations) > 0 {
			oldSyntax = true
		}
	}

	// Only allow new annotation syntax if WITH or old annotation
	// syntax weren't used.
	if !oldSyntax {
		if p.peek().typ == itemAnnotations {
			p.expect(itemAnnotations, ctx)
			annotations = p.labelSet()
		}
	} else {
		log.Warnf("Alerting rule with old syntax found. Support for this syntax will be removed with 0.18. Please update to the new syntax.")
	}

	return &AlertStmt{
		Name:        name.val,
		Expr:        expr,
		Duration:    duration,
		Labels:      labels,
		Annotations: annotations,
	}
}
Example No. 26
// eval runs a single evaluation cycle in which all rules are evaluated in parallel.
// In the future a single group will be evaluated sequentially to properly handle
// rule dependency.
func (g *Group) eval() {
	var (
		now = model.Now()
		wg  sync.WaitGroup
	)

	for _, rule := range g.rules {
		rtyp := string(typeForRule(rule))

		wg.Add(1)
		// BUG(julius): Look at fixing thundering herd.
		go func(rule Rule) {
			defer wg.Done()

			defer func(t time.Time) {
				evalDuration.WithLabelValues(rtyp).Observe(time.Since(t).Seconds())
			}(time.Now())

			evalTotal.WithLabelValues(rtyp).Inc()

			vector, err := rule.eval(g.opts.Context, now, g.opts.QueryEngine, g.opts.ExternalURL.Path)
			if err != nil {
				// Canceled queries are intentional termination of queries. This normally
				// happens on shutdown and thus we skip logging of any errors here.
				if _, ok := err.(promql.ErrQueryCanceled); !ok {
					log.Warnf("Error while evaluating rule %q: %s", rule, err)
				}
				evalFailures.WithLabelValues(rtyp).Inc()
				return
			}

			if ar, ok := rule.(*AlertingRule); ok {
				g.sendAlerts(ar, now)
			}
			var (
				numOutOfOrder = 0
				numDuplicates = 0
			)
			for _, s := range vector {
				if err := g.opts.SampleAppender.Append(s); err != nil {
					switch err {
					case local.ErrOutOfOrderSample:
						numOutOfOrder++
						log.With("sample", s).With("error", err).Debug("Rule evaluation result discarded")
					case local.ErrDuplicateSampleForTimestamp:
						numDuplicates++
						log.With("sample", s).With("error", err).Debug("Rule evaluation result discarded")
					default:
						log.With("sample", s).With("error", err).Warn("Rule evaluation result discarded")
					}
				}
			}
			if numOutOfOrder > 0 {
				log.With("numDropped", numOutOfOrder).Warn("Error on ingesting out-of-order result from rule evaluation")
			}
			if numDuplicates > 0 {
				log.With("numDropped", numDuplicates).Warn("Error on ingesting results from rule evaluation with different value but same timestamp")
			}
		}(rule)
	}
	wg.Wait()
}
Example No. 27
// eval evaluates the rule expression and then creates pending alerts and fires
// or removes previously pending alerts accordingly.
func (r *AlertingRule) eval(ts model.Time, engine *promql.Engine, externalURLPath string) (model.Vector, error) {
	query, err := engine.NewInstantQuery(r.vector.String(), ts)
	if err != nil {
		return nil, err
	}
	res, err := query.Exec().Vector()
	if err != nil {
		return nil, err
	}

	r.mtx.Lock()
	defer r.mtx.Unlock()

	// Create pending alerts for any new vector elements in the alert expression
	// or update the expression value for existing elements.
	resultFPs := map[model.Fingerprint]struct{}{}

	for _, smpl := range res {
		// Provide the alert information to the template.
		l := make(map[string]string, len(smpl.Metric))
		for k, v := range smpl.Metric {
			l[string(k)] = string(v)
		}

		tmplData := struct {
			Labels map[string]string
			Value  float64
		}{
			Labels: l,
			Value:  float64(smpl.Value),
		}
		// Inject some convenience variables that are easier to remember for users
		// who are not used to Go's templating system.
		defs := "{{$labels := .Labels}}{{$value := .Value}}"

		expand := func(text model.LabelValue) model.LabelValue {
			tmpl := template.NewTemplateExpander(
				defs+string(text),
				"__alert_"+r.Name(),
				tmplData,
				ts,
				engine,
				externalURLPath,
			)
			result, err := tmpl.Expand()
			if err != nil {
				result = fmt.Sprintf("<error expanding template: %s>", err)
				log.Warnf("Error expanding alert template %v with data '%v': %s", r.Name(), tmplData, err)
			}
			return model.LabelValue(result)
		}

		labels := make(model.LabelSet, len(smpl.Metric)+len(r.labels)+1)
		for ln, lv := range smpl.Metric {
			labels[ln] = lv
		}
		for ln, lv := range r.labels {
			labels[ln] = expand(lv)
		}
		labels[model.AlertNameLabel] = model.LabelValue(r.Name())

		annotations := make(model.LabelSet, len(r.annotations))
		for an, av := range r.annotations {
			annotations[an] = expand(av)
		}
		fp := smpl.Metric.Fingerprint()
		resultFPs[fp] = struct{}{}

		if alert, ok := r.active[fp]; ok && alert.State != StateInactive {
			alert.Value = smpl.Value
			continue
		}

		delete(smpl.Metric, model.MetricNameLabel)

		r.active[fp] = &Alert{
			Labels:      labels,
			Annotations: annotations,
			ActiveAt:    ts,
			State:       StatePending,
			Value:       smpl.Value,
		}
	}

	var vec model.Vector
	// Check if any pending alerts should be removed or fire now. Write out alert timeseries.
	for fp, a := range r.active {
		if _, ok := resultFPs[fp]; !ok {
			if a.State != StateInactive {
				vec = append(vec, r.sample(a, ts, false))
			}
			// If the alert was previously firing, keep it around for a given
			// retention time so it is reported as resolved to the AlertManager.
			if a.State == StatePending || (a.ResolvedAt != 0 && ts.Sub(a.ResolvedAt) > resolvedRetention) {
				delete(r.active, fp)
			}
			if a.State != StateInactive {
				a.State = StateInactive
				a.ResolvedAt = ts
			}
			continue
		}

		if a.State == StatePending && ts.Sub(a.ActiveAt) >= r.holdDuration {
			vec = append(vec, r.sample(a, ts, false))
			a.State = StateFiring
		}

		vec = append(vec, r.sample(a, ts, true))
	}

	return vec, nil
}
Example No. 28
// recoverFromCrash is called by loadSeriesMapAndHeads if the persistence
// appears to be dirty after the loading (either because the loading resulted in
// an error or because the persistence was dirty from the start). Not goroutine
// safe. Only call before anything else is running (except index processing
// queue as started by newPersistence).
func (p *persistence) recoverFromCrash(fingerprintToSeries map[model.Fingerprint]*memorySeries) error {
	// TODO(beorn): We need proper tests for the crash recovery.
	log.Warn("Starting crash recovery. Prometheus is inoperational until complete.")
	log.Warn("To avoid crash recovery in the future, shut down Prometheus with SIGTERM or a HTTP POST to /-/quit.")

	fpsSeen := map[model.Fingerprint]struct{}{}
	count := 0
	seriesDirNameFmt := fmt.Sprintf("%%0%dx", seriesDirNameLen)

	// Delete the fingerprint mapping file as it might be stale or
	// corrupt. We'll rebuild the mappings as we go.
	if err := os.RemoveAll(p.mappingsFileName()); err != nil {
		return fmt.Errorf("couldn't remove old fingerprint mapping file %s: %s", p.mappingsFileName(), err)
	}
	// The mappings to rebuild.
	fpm := fpMappings{}

	log.Info("Scanning files.")
	for i := 0; i < 1<<(seriesDirNameLen*4); i++ {
		dirname := filepath.Join(p.basePath, fmt.Sprintf(seriesDirNameFmt, i))
		dir, err := os.Open(dirname)
		if os.IsNotExist(err) {
			continue
		}
		if err != nil {
			return err
		}
		for fis := []os.FileInfo{}; err != io.EOF; fis, err = dir.Readdir(1024) {
			if err != nil {
				dir.Close()
				return err
			}
			for _, fi := range fis {
				fp, ok := p.sanitizeSeries(dirname, fi, fingerprintToSeries, fpm)
				if ok {
					fpsSeen[fp] = struct{}{}
				}
				count++
				if count%10000 == 0 {
					log.Infof("%d files scanned.", count)
				}
			}
		}
		dir.Close()
	}
	log.Infof("File scan complete. %d series found.", len(fpsSeen))

	log.Info("Checking for series without series file.")
	for fp, s := range fingerprintToSeries {
		if _, seen := fpsSeen[fp]; !seen {
			// fp exists in fingerprintToSeries, but has no representation on disk.
			if s.persistWatermark == len(s.chunkDescs) {
				// Oops, everything including the head chunk was
				// already persisted, but nothing on disk.
				// Thus, we lost that series completely. Clean
				// up the remnants.
				delete(fingerprintToSeries, fp)
				if err := p.purgeArchivedMetric(fp); err != nil {
					// Purging the archived metric didn't work, so try
					// to unindex it, just in case it's in the indexes.
					p.unindexMetric(fp, s.metric)
				}
				log.Warnf("Lost series detected: fingerprint %v, metric %v.", fp, s.metric)
				continue
			}
			// If we are here, the only chunks we have are the chunks in the checkpoint.
			// Adjust things accordingly.
			if s.persistWatermark > 0 || s.chunkDescsOffset != 0 {
				minLostChunks := s.persistWatermark + s.chunkDescsOffset
				if minLostChunks <= 0 {
					log.Warnf(
						"Possible loss of chunks for fingerprint %v, metric %v.",
						fp, s.metric,
					)
				} else {
					log.Warnf(
						"Lost at least %d chunks for fingerprint %v, metric %v.",
						minLostChunks, fp, s.metric,
					)
				}
				s.chunkDescs = append(
					make([]*chunk.Desc, 0, len(s.chunkDescs)-s.persistWatermark),
					s.chunkDescs[s.persistWatermark:]...,
				)
				chunk.NumMemDescs.Sub(float64(s.persistWatermark))
				s.persistWatermark = 0
				s.chunkDescsOffset = 0
			}
			maybeAddMapping(fp, s.metric, fpm)
			fpsSeen[fp] = struct{}{} // Add so that fpsSeen is complete.
		}
	}
	log.Info("Check for series without series file complete.")

	if err := p.cleanUpArchiveIndexes(fingerprintToSeries, fpsSeen, fpm); err != nil {
		return err
	}
	if err := p.rebuildLabelIndexes(fingerprintToSeries); err != nil {
		return err
	}
	// Finally rewrite the mappings file if there are any mappings.
	if len(fpm) > 0 {
		if err := p.checkpointFPMappings(fpm); err != nil {
			return err
		}
	}

	p.dirtyMtx.Lock()
	// Only declare storage clean if it didn't become dirty during crash recovery.
	if !p.becameDirty {
		p.dirty = false
	}
	p.dirtyMtx.Unlock()

	log.Warn("Crash recovery complete.")
	return nil
}
Example No. 29
func warnDeprecated(collector string) {
	log.Warnf("The %s collector is deprecated and will be removed in the future!", collector)
}
Example No. 30
func (p *persistence) cleanUpArchiveIndexes(
	fpToSeries map[model.Fingerprint]*memorySeries,
	fpsSeen map[model.Fingerprint]struct{},
	fpm fpMappings,
) error {
	log.Info("Cleaning up archive indexes.")
	var fp codable.Fingerprint
	var m codable.Metric
	count := 0
	if err := p.archivedFingerprintToMetrics.ForEach(func(kv index.KeyValueAccessor) error {
		count++
		if count%10000 == 0 {
			log.Infof("%d archived metrics checked.", count)
		}
		if err := kv.Key(&fp); err != nil {
			return err
		}
		_, fpSeen := fpsSeen[model.Fingerprint(fp)]
		inMemory := false
		if fpSeen {
			_, inMemory = fpToSeries[model.Fingerprint(fp)]
		}
		if !fpSeen || inMemory {
			if inMemory {
				log.Warnf("Archive clean-up: Fingerprint %v is not archived. Purging from archive indexes.", model.Fingerprint(fp))
			}
			if !fpSeen {
				log.Warnf("Archive clean-up: Fingerprint %v is unknown. Purging from archive indexes.", model.Fingerprint(fp))
			}
			// It's fine if the fp is not in the archive indexes.
			if _, err := p.archivedFingerprintToMetrics.Delete(fp); err != nil {
				return err
			}
			// Delete from timerange index, too.
			_, err := p.archivedFingerprintToTimeRange.Delete(fp)
			return err
		}
		// fp is legitimately archived. Now we need the metric to check for a mapped fingerprint.
		if err := kv.Value(&m); err != nil {
			return err
		}
		maybeAddMapping(model.Fingerprint(fp), model.Metric(m), fpm)
		// Make sure it is in timerange index, too.
		has, err := p.archivedFingerprintToTimeRange.Has(fp)
		if err != nil {
			return err
		}
		if has {
			return nil // All good.
		}
		log.Warnf("Archive clean-up: Fingerprint %v is not in time-range index. Unarchiving it for recovery.", model.Fingerprint(fp))
		// Again, it's fine if fp is not in the archive index.
		if _, err := p.archivedFingerprintToMetrics.Delete(fp); err != nil {
			return err
		}
		cds, err := p.loadChunkDescs(model.Fingerprint(fp), 0)
		if err != nil {
			return err
		}
		series, err := newMemorySeries(model.Metric(m), cds, p.seriesFileModTime(model.Fingerprint(fp)))
		if err != nil {
			return err
		}
		fpToSeries[model.Fingerprint(fp)] = series
		return nil
	}); err != nil {
		return err
	}
	count = 0
	if err := p.archivedFingerprintToTimeRange.ForEach(func(kv index.KeyValueAccessor) error {
		count++
		if count%10000 == 0 {
			log.Infof("%d archived time ranges checked.", count)
		}
		if err := kv.Key(&fp); err != nil {
			return err
		}
		has, err := p.archivedFingerprintToMetrics.Has(fp)
		if err != nil {
			return err
		}
		if has {
			return nil // All good.
		}
		log.Warnf("Archive clean-up: Purging unknown fingerprint %v in time-range index.", fp)
		deleted, err := p.archivedFingerprintToTimeRange.Delete(fp)
		if err != nil {
			return err
		}
		if !deleted {
			log.Errorf("Fingerprint %v to be deleted from archivedFingerprintToTimeRange not found. This should never happen.", fp)
		}
		return nil
	}); err != nil {
		return err
	}
	log.Info("Clean-up of archive indexes complete.")
	return nil
}