// Send queues the given notification requests for processing.
// Panics if called on a handler that is not running.
func (n *Notifier) Send(alerts ...*model.Alert) {
	n.mtx.Lock()
	defer n.mtx.Unlock()

	// Queue capacity should be significantly larger than a single alert
	// batch could be.
	if d := len(alerts) - n.opts.QueueCapacity; d > 0 {
		alerts = alerts[d:]

		log.Warnf("Alert batch larger than queue capacity, dropping %d alerts", d)
		n.dropped.Add(float64(d))
	}

	// If the queue is full, remove the oldest alerts in favor
	// of newer ones.
	if d := (len(n.queue) + len(alerts)) - n.opts.QueueCapacity; d > 0 {
		n.queue = n.queue[d:]

		log.Warnf("Alert notification queue full, dropping %d alerts", d)
		n.dropped.Add(float64(d))
	}
	n.queue = append(n.queue, alerts...)

	// Notify sending goroutine that there are alerts to be processed.
	n.setMore()
}
// Append implements Storage.
func (s *memorySeriesStorage) Append(sample *model.Sample) {
	for ln, lv := range sample.Metric {
		if len(lv) == 0 {
			delete(sample.Metric, ln)
		}
	}
	if s.getNumChunksToPersist() >= s.maxChunksToPersist {
		log.Warnf(
			"%d chunks waiting for persistence, sample ingestion suspended.",
			s.getNumChunksToPersist(),
		)
		for s.getNumChunksToPersist() >= s.maxChunksToPersist {
			time.Sleep(time.Second)
		}
		log.Warn("Sample ingestion resumed.")
	}
	rawFP := sample.Metric.FastFingerprint()
	s.fpLocker.Lock(rawFP)
	fp, err := s.mapper.mapFP(rawFP, sample.Metric)
	if err != nil {
		log.Errorf("Error while mapping fingerprint %v: %v", rawFP, err)
		s.persistence.setDirty(true)
	}
	if fp != rawFP {
		// Switch locks.
		s.fpLocker.Unlock(rawFP)
		s.fpLocker.Lock(fp)
	}
	series := s.getOrCreateSeries(fp, sample.Metric)

	if sample.Timestamp <= series.lastTime {
		// Don't log and track equal timestamps, as they are a common occurrence
		// when using client-side timestamps (e.g. Pushgateway or federation).
		// It would be even better to also compare the sample values here, but
		// we don't have efficient access to a series's last value.
		if sample.Timestamp != series.lastTime {
			log.Warnf("Ignoring sample with out-of-order timestamp for fingerprint %v (%v): %v is not after %v", fp, series.metric, sample.Timestamp, series.lastTime)
			s.outOfOrderSamplesCount.Inc()
		}
		s.fpLocker.Unlock(fp)
		return
	}
	completedChunksCount := series.add(&model.SamplePair{
		Value:     sample.Value,
		Timestamp: sample.Timestamp,
	})
	s.fpLocker.Unlock(fp)
	s.ingestedSamplesCount.Inc()
	s.incNumChunksToPersist(completedChunksCount)
}
func (t *StorageQueueManager) sendSamples(s model.Samples) {
	t.sendSemaphore <- true

	go func() {
		defer func() {
			<-t.sendSemaphore
		}()

		// Samples are sent to the remote storage on a best-effort basis. If a
		// sample isn't sent correctly the first time, it's simply dropped on the
		// floor.
		begin := time.Now()
		err := t.tsdb.Store(s)
		duration := time.Since(begin).Seconds()

		labelValue := success
		if err != nil {
			log.Warnf("error sending %d samples to remote storage: %s", len(s), err)
			labelValue = failure
			t.failedBatches.Inc()
			t.failedSamples.Add(float64(len(s)))
		}
		t.samplesCount.WithLabelValues(labelValue).Add(float64(len(s)))
		t.sendLatency.Observe(duration)
	}()
}
func lookupAll(name string, qtype uint16) (*dns.Msg, error) {
	conf, err := dns.ClientConfigFromFile(resolvConf)
	if err != nil {
		return nil, fmt.Errorf("could not load resolv.conf: %s", err)
	}

	client := &dns.Client{}
	response := &dns.Msg{}

	for _, server := range conf.Servers {
		servAddr := net.JoinHostPort(server, conf.Port)
		for _, suffix := range conf.Search {
			response, err = lookup(name, qtype, client, servAddr, suffix, false)
			if err != nil {
				log.Warnf("resolving %s.%s failed: %s", name, suffix, err)
				continue
			}
			if len(response.Answer) > 0 {
				return response, nil
			}
		}
		response, err = lookup(name, qtype, client, servAddr, "", false)
		if err == nil {
			return response, nil
		}
	}
	return response, fmt.Errorf("could not resolve %s: No server responded", name)
}
// Run dispatches notifications continuously.
func (n *Notifier) Run() {
	numAMs := len(n.opts.AlertmanagerURLs)
	// Just warn once in the beginning to prevent noisy logs.
	if numAMs == 0 {
		log.Warnf("No AlertManagers configured, not dispatching any alerts")
		return
	}

	for {
		select {
		case <-n.ctx.Done():
			return
		case <-n.more:
		}
		alerts := n.nextBatch()

		if numAMs > 0 {
			if len(alerts) > 0 {
				numErrors := n.sendAll(alerts...)
				// Increment the dropped counter if we could not send
				// successfully to a single AlertManager.
				if numErrors == numAMs {
					n.dropped.Add(float64(len(alerts)))
				}
			}
		} else {
			n.dropped.Add(float64(len(alerts)))
		}

		// If the queue still has items left, kick off the next iteration.
		if n.queueLen() > 0 {
			n.setMore()
		}
	}
}
// Store sends a batch of samples to Graphite.
func (c *Client) Store(samples model.Samples) error {
	conn, err := net.DialTimeout(c.transport, c.address, c.timeout)
	if err != nil {
		return err
	}
	defer conn.Close()

	var buf bytes.Buffer
	for _, s := range samples {
		k := pathFromMetric(s.Metric, c.prefix)
		t := float64(s.Timestamp.UnixNano()) / 1e9
		v := float64(s.Value)
		if math.IsNaN(v) || math.IsInf(v, 0) {
			log.Warnf("cannot send value %f to Graphite, "+
				"skipping sample %#v", v, s)
			continue
		}
		fmt.Fprintf(&buf, "%s %f %f\n", k, v, t)
	}

	_, err = conn.Write(buf.Bytes())
	if err != nil {
		return err
	}

	return nil
}
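// Illustrative only: given the format string above, each line written to the
// Graphite plaintext socket has the shape "<path> <value> <timestamp>", e.g.
//
//	some.metric.path 0.250000 1436813567.325000
//
// The exact path is a hypothetical example; it depends on pathFromMetric and
// the configured prefix, which are not shown here.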
// Store sends a batch of samples to OpenTSDB via its HTTP API.
func (c *Client) Store(samples model.Samples) error {
	reqs := make([]StoreSamplesRequest, 0, len(samples))
	for _, s := range samples {
		v := float64(s.Value)
		if math.IsNaN(v) || math.IsInf(v, 0) {
			log.Warnf("cannot send value %f to OpenTSDB, skipping sample %#v", v, s)
			continue
		}
		metric := TagValue(s.Metric[model.MetricNameLabel])
		reqs = append(reqs, StoreSamplesRequest{
			Metric:    metric,
			Timestamp: s.Timestamp.Unix(),
			Value:     v,
			Tags:      tagsFromMetric(s.Metric),
		})
	}

	u, err := url.Parse(c.url)
	if err != nil {
		return err
	}
	u.Path = putEndpoint

	buf, err := json.Marshal(reqs)
	if err != nil {
		return err
	}

	resp, err := c.httpClient.Post(
		u.String(),
		contentTypeJSON,
		bytes.NewBuffer(buf),
	)
	if err != nil {
		return err
	}
	defer resp.Body.Close()

	// API returns status code 204 for successful writes.
	// http://opentsdb.net/docs/build/html/api_http/put.html
	if resp.StatusCode == http.StatusNoContent {
		return nil
	}

	// API returns status code 400 on error, encoding error details in the
	// response content in JSON.
	buf, err = ioutil.ReadAll(resp.Body)
	if err != nil {
		return err
	}

	var r map[string]int
	if err := json.Unmarshal(buf, &r); err != nil {
		return err
	}
	return fmt.Errorf("failed to write %d samples to OpenTSDB, %d succeeded", r["failed"], r["success"])
}
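// Illustrative only: assuming StoreSamplesRequest carries JSON struct tags
// matching the OpenTSDB put API linked above, the marshaled request body sent
// to /api/put would look roughly like
//
//	[{"metric":"http_requests_total","timestamp":1436813567,"value":42,"tags":{"instance":"host1"}}]
//
// The metric and tag names here are hypothetical examples.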
func (m *Manager) GetRuleAlertNotifications(rule *AlertingRule, timestamp model.Time) notification.NotificationReqs {
	activeAlerts := rule.ActiveAlerts()
	if len(activeAlerts) == 0 {
		return notification.NotificationReqs{}
	}

	notifications := make(notification.NotificationReqs, 0, len(activeAlerts))
	for _, aa := range activeAlerts {
		if aa.State != StateFiring {
			// BUG: In the future, make AlertManager support pending alerts?
			continue
		}

		// Provide the alert information to the template.
		l := map[string]string{}
		for k, v := range aa.Labels {
			l[string(k)] = string(v)
		}
		tmplData := struct {
			Labels map[string]string
			Value  float64
		}{
			Labels: l,
			Value:  float64(aa.Value),
		}
		// Inject some convenience variables that are easier to remember for users
		// who are not used to Go's templating system.
		defs := "{{$labels := .Labels}}{{$value := .Value}}"

		expand := func(text string) string {
			tmpl := template.NewTemplateExpander(defs+text, "__alert_"+rule.Name(), tmplData, timestamp, m.queryEngine, m.externalURL.Path)
			result, err := tmpl.Expand()
			if err != nil {
				result = err.Error()
				log.Warnf("Error expanding alert template %v with data '%v': %v", rule.Name(), tmplData, err)
			}
			return result
		}

		notifications = append(notifications, &notification.NotificationReq{
			Summary:     expand(rule.summary),
			Description: expand(rule.description),
			Runbook:     rule.runbook,
			Labels: aa.Labels.Merge(model.LabelSet{
				alertNameLabel: model.LabelValue(rule.Name()),
			}),
			Value:        aa.Value,
			ActiveSince:  aa.ActiveSince.Time(),
			RuleString:   rule.String(),
			GeneratorURL: m.externalURL.String() + strutil.GraphLinkForExpression(rule.vector.String()),
		})
	}
	return notifications
}
// Send queues the given notification requests for processing.
// Panics if called on a handler that is not running.
func (n *Notifier) Send(alerts ...*model.Alert) {
	n.mtx.Lock()
	defer n.mtx.Unlock()

	// Attach external labels before relabelling and sending.
	for _, a := range alerts {
		for ln, lv := range n.opts.ExternalLabels {
			if _, ok := a.Labels[ln]; !ok {
				a.Labels[ln] = lv
			}
		}
	}

	alerts = n.relabelAlerts(alerts)

	// Queue capacity should be significantly larger than a single alert
	// batch could be.
	if d := len(alerts) - n.opts.QueueCapacity; d > 0 {
		alerts = alerts[d:]

		log.Warnf("Alert batch larger than queue capacity, dropping %d alerts", d)
		n.dropped.Add(float64(d))
	}

	// If the queue is full, remove the oldest alerts in favor
	// of newer ones.
	if d := (len(n.queue) + len(alerts)) - n.opts.QueueCapacity; d > 0 {
		n.queue = n.queue[d:]

		log.Warnf("Alert notification queue full, dropping %d alerts", d)
		n.dropped.Add(float64(d))
	}
	n.queue = append(n.queue, alerts...)

	// Notify sending goroutine that there are alerts to be processed.
	n.setMore()
}
// eval runs a single evaluation cycle in which all rules are evaluated in parallel.
// In the future a single group will be evaluated sequentially to properly handle
// rule dependency.
func (g *Group) eval() {
	var (
		now = model.Now()
		wg  sync.WaitGroup
	)

	for _, rule := range g.rules {
		wg.Add(1)
		// BUG(julius): Look at fixing thundering herd.
		go func(rule Rule) {
			defer wg.Done()

			start := time.Now()
			evalTotal.Inc()

			vector, err := rule.eval(now, g.opts.QueryEngine)
			if err != nil {
				// Canceled queries are intentional termination of queries. This normally
				// happens on shutdown and thus we skip logging of any errors here.
				if _, ok := err.(promql.ErrQueryCanceled); !ok {
					log.Warnf("Error while evaluating rule %q: %s", rule, err)
				}
				evalFailures.Inc()
			}
			var rtyp ruleType

			switch r := rule.(type) {
			case *AlertingRule:
				rtyp = ruleTypeAlert
				g.sendAlerts(r, now)
			case *RecordingRule:
				rtyp = ruleTypeRecording
			default:
				panic(fmt.Errorf("unknown rule type: %T", rule))
			}

			evalDuration.WithLabelValues(string(rtyp)).Observe(
				float64(time.Since(start)) / float64(time.Second),
			)

			for _, s := range vector {
				g.opts.SampleAppender.Append(s)
			}
		}(rule)
	}
	wg.Wait()
}
// isDegraded returns whether the storage is in "graceful degradation mode",
// which is the case if the number of chunks waiting for persistence has reached
// a percentage of maxChunksToPersist that exceeds
// percentChunksToPersistForDegradation. The method is not goroutine safe (but
// only ever called from the goroutine dealing with series maintenance).
// Changes of degradation mode are logged.
func (s *memorySeriesStorage) isDegraded() bool {
	nowDegraded := s.getNumChunksToPersist() > s.maxChunksToPersist*percentChunksToPersistForDegradation/100
	if s.degraded && !nowDegraded {
		log.Warn("Storage has left graceful degradation mode. Things are back to normal.")
	} else if !s.degraded && nowDegraded {
		log.Warnf(
			"%d chunks waiting for persistence (%d%% of the allowed maximum %d). Storage is now in graceful degradation mode. Series files are not synced anymore if following the adaptive strategy. Checkpoints are not performed more often than every %v. Series maintenance happens as frequently as possible.",
			s.getNumChunksToPersist(),
			s.getNumChunksToPersist()*100/s.maxChunksToPersist,
			s.maxChunksToPersist,
			s.checkpointInterval)
	}
	s.degraded = nowDegraded
	return s.degraded
}
// maybeAddMapping adds a fingerprint mapping to fpm if the FastFingerprint of m is different from fp.
func maybeAddMapping(fp model.Fingerprint, m model.Metric, fpm fpMappings) {
	if rawFP := m.FastFingerprint(); rawFP != fp {
		log.Warnf(
			"Metric %v with fingerprint %v is mapped from raw fingerprint %v.",
			m, fp, rawFP,
		)
		if mappedFPs, ok := fpm[rawFP]; ok {
			mappedFPs[metricToUniqueString(m)] = fp
		} else {
			fpm[rawFP] = map[string]model.Fingerprint{
				metricToUniqueString(m): fp,
			}
		}
	}
}
func (sl *scrapeLoop) append(samples model.Samples) {
	numOutOfOrder := 0

	for _, s := range samples {
		if err := sl.appender.Append(s); err != nil {
			if err == local.ErrOutOfOrderSample {
				numOutOfOrder++
			} else {
				log.Warnf("Error inserting sample: %s", err)
			}
		}
	}
	if numOutOfOrder > 0 {
		log.With("numDropped", numOutOfOrder).Warn("Error on ingesting out-of-order samples")
	}
}
func (t *StorageQueueManager) sendSamples(s model.Samples) {
	// Samples are sent to the remote storage on a best-effort basis. If a
	// sample isn't sent correctly the first time, it's simply dropped on the
	// floor.
	begin := time.Now()
	err := t.tsdb.Store(s)
	duration := time.Since(begin).Seconds()

	labelValue := success
	if err != nil {
		log.Warnf("error sending %d samples to remote storage: %s", len(s), err)
		labelValue = failure
	}
	t.sentSamplesTotal.WithLabelValues(labelValue).Add(float64(len(s)))
	t.sentBatchDuration.WithLabelValues(labelValue).Observe(duration)
}
func (m *Manager) runIteration() {
	now := model.Now()
	wg := sync.WaitGroup{}

	m.Lock()
	rulesSnapshot := make([]Rule, len(m.rules))
	copy(rulesSnapshot, m.rules)
	m.Unlock()

	for _, rule := range rulesSnapshot {
		wg.Add(1)
		// BUG(julius): Look at fixing thundering herd.
		go func(rule Rule) {
			defer wg.Done()

			start := time.Now()
			vector, err := rule.eval(now, m.queryEngine)
			duration := time.Since(start)

			if err != nil {
				evalFailures.Inc()
				log.Warnf("Error while evaluating rule %q: %s", rule, err)
				return
			}

			switch r := rule.(type) {
			case *AlertingRule:
				m.queueAlertNotifications(r, now)
				evalDuration.WithLabelValues(ruleTypeAlerting).Observe(
					float64(duration / time.Millisecond),
				)
			case *RecordingRule:
				evalDuration.WithLabelValues(ruleTypeRecording).Observe(
					float64(duration / time.Millisecond),
				)
			default:
				panic(fmt.Errorf("unknown rule type: %T", rule))
			}

			for _, s := range vector {
				m.sampleAppender.Append(s)
			}
		}(rule)
	}
	wg.Wait()
}
func (dd *Discovery) refresh(ctx context.Context, name string, ch chan<- []*config.TargetGroup) error {
	response, err := lookupAll(name, dd.qtype)
	dnsSDLookupsCount.Inc()
	if err != nil {
		dnsSDLookupFailuresCount.Inc()
		return err
	}

	tg := &config.TargetGroup{}
	hostPort := func(a string, p int) model.LabelValue {
		return model.LabelValue(net.JoinHostPort(a, fmt.Sprintf("%d", p)))
	}

	for _, record := range response.Answer {
		target := model.LabelValue("")
		switch addr := record.(type) {
		case *dns.SRV:
			// Remove the final dot from rooted DNS names to make them look more usual.
			addr.Target = strings.TrimRight(addr.Target, ".")

			target = hostPort(addr.Target, int(addr.Port))
		case *dns.A:
			target = hostPort(addr.A.String(), dd.port)
		case *dns.AAAA:
			target = hostPort(addr.AAAA.String(), dd.port)
		default:
			log.Warnf("%q is not a valid SRV record", record)
			continue
		}
		tg.Targets = append(tg.Targets, model.LabelSet{
			model.AddressLabel: target,
			dnsNameLabel:       model.LabelValue(name),
		})
	}

	tg.Source = name
	select {
	case <-ctx.Done():
		return ctx.Err()
	case ch <- []*config.TargetGroup{tg}:
	}

	return nil
}
// eval runs a single evaluation cycle in which all rules are evaluated in parallel.
// In the future a single group will be evaluated sequentially to properly handle
// rule dependency.
func (g *Group) eval() {
	var (
		now = model.Now()
		wg  sync.WaitGroup
	)

	for _, rule := range g.rules {
		wg.Add(1)
		// BUG(julius): Look at fixing thundering herd.
		go func(rule Rule) {
			defer wg.Done()

			start := time.Now()
			evalTotal.Inc()

			vector, err := rule.eval(now, g.opts.QueryEngine)
			if err != nil {
				evalFailures.Inc()
				log.Warnf("Error while evaluating rule %q: %s", rule, err)
			}
			var rtyp ruleType

			switch r := rule.(type) {
			case *AlertingRule:
				rtyp = ruleTypeAlert
				g.sendAlerts(r, now)
			case *RecordingRule:
				rtyp = ruleTypeRecording
			default:
				panic(fmt.Errorf("unknown rule type: %T", rule))
			}

			evalDuration.WithLabelValues(string(rtyp)).Observe(
				float64(time.Since(start)) / float64(time.Second),
			)

			for _, s := range vector {
				g.opts.SampleAppender.Append(s)
			}
		}(rule)
	}
	wg.Wait()
}
// GetMetricFamilies implements the MetricStore interface.
func (dms *DiskMetricStore) GetMetricFamilies() []*dto.MetricFamily {
	result := []*dto.MetricFamily{}
	mfStatByName := map[string]mfStat{}

	dms.lock.RLock()
	defer dms.lock.RUnlock()

	for _, group := range dms.metricGroups {
		for name, tmf := range group.Metrics {
			mf := tmf.MetricFamily
			stat, exists := mfStatByName[name]
			if exists {
				existingMF := result[stat.pos]
				if !stat.copied {
					mfStatByName[name] = mfStat{
						pos:    stat.pos,
						copied: true,
					}
					existingMF = copyMetricFamily(existingMF)
					result[stat.pos] = existingMF
				}
				if mf.GetHelp() != existingMF.GetHelp() || mf.GetType() != existingMF.GetType() {
					log.Warnf(
						"Metric families '%s' and '%s' are inconsistent, help and type of the latter will have priority. This is bad. Fix your pushed metrics!",
						mf, existingMF,
					)
				}
				for _, metric := range mf.Metric {
					existingMF.Metric = append(existingMF.Metric, metric)
				}
			} else {
				mfStatByName[name] = mfStat{
					pos:    len(result),
					copied: false,
				}
				result = append(result, mf)
			}
		}
	}
	return result
}
// eval runs a single evaluation cycle in which all rules are evaluated in parallel.
// In the future a single group will be evaluated sequentially to properly handle
// rule dependency.
func (g *Group) eval() {
	var (
		now = model.Now()
		wg  sync.WaitGroup
	)

	for _, rule := range g.rules {
		rtyp := string(typeForRule(rule))
		wg.Add(1)
		// BUG(julius): Look at fixing thundering herd.
		go func(rule Rule) {
			defer wg.Done()
			defer func(t time.Time) {
				evalDuration.WithLabelValues(rtyp).Observe(float64(time.Since(t)) / float64(time.Second))
			}(time.Now())

			evalTotal.WithLabelValues(rtyp).Inc()

			vector, err := rule.eval(now, g.opts.QueryEngine)
			if err != nil {
				// Canceled queries are intentional termination of queries. This normally
				// happens on shutdown and thus we skip logging of any errors here.
				if _, ok := err.(promql.ErrQueryCanceled); !ok {
					log.Warnf("Error while evaluating rule %q: %s", rule, err)
				}
				evalFailures.WithLabelValues(rtyp).Inc()
				return
			}

			if ar, ok := rule.(*AlertingRule); ok {
				g.sendAlerts(ar, now)
			}
			for _, s := range vector {
				g.opts.SampleAppender.Append(s)
			}
		}(rule)
	}
	wg.Wait()
}
// Run dispatches notifications continuously.
func (n *Handler) Run() {
	// Just warn once in the beginning to prevent noisy logs.
	if n.opts.AlertmanagerURL == "" {
		log.Warnf("No AlertManager configured, not dispatching any alerts")
	}

	for {
		select {
		case <-n.ctx.Done():
			return
		case <-n.more:
		}
		alerts := n.nextBatch()

		if len(alerts) == 0 {
			continue
		}

		if n.opts.AlertmanagerURL == "" {
			n.dropped.Add(float64(len(alerts)))
			continue
		}

		begin := time.Now()

		if err := n.send(alerts...); err != nil {
			log.Errorf("Error sending %d alerts: %s", len(alerts), err)
			n.errors.Inc()
			n.dropped.Add(float64(len(alerts)))
		}

		n.latency.Observe(float64(time.Since(begin)) / float64(time.Second))
		n.sent.Add(float64(len(alerts)))

		// If the queue still has items left, kick off the next iteration.
		if n.queueLen() > 0 {
			n.setMore()
		}
	}
}
// UnmarshalYAML implements the yaml.Unmarshaler interface.
func (c *ScrapeConfig) UnmarshalYAML(unmarshal func(interface{}) error) error {
	*c = DefaultScrapeConfig
	type plain ScrapeConfig
	err := unmarshal((*plain)(c))
	if err != nil {
		return err
	}
	if err := checkOverflow(c.XXX, "scrape_config"); err != nil {
		return err
	}
	if !patJobName.MatchString(c.JobName) {
		return fmt.Errorf("%q is not a valid job name", c.JobName)
	}
	if len(c.BearerToken) > 0 && len(c.BearerTokenFile) > 0 {
		return fmt.Errorf("at most one of bearer_token & bearer_token_file must be configured")
	}
	if c.BasicAuth != nil && (len(c.BearerToken) > 0 || len(c.BearerTokenFile) > 0) {
		return fmt.Errorf("at most one of basic_auth, bearer_token & bearer_token_file must be configured")
	}

	// Check `target_groups` deprecation.
	if c.TargetGroups != nil && c.StaticConfigs != nil {
		return fmt.Errorf("'target_groups' is deprecated, configure static targets via 'static_configs' only")
	}
	if c.TargetGroups != nil {
		log.Warnf("The 'target_groups' option for scrape configurations is deprecated, use 'static_configs' instead")
		c.StaticConfigs = c.TargetGroups
	}

	// Check for users putting URLs in target groups.
	if len(c.RelabelConfigs) == 0 {
		for _, tg := range c.StaticConfigs {
			for _, t := range tg.Targets {
				if err = CheckTargetAddress(t[model.AddressLabel]); err != nil {
					return err
				}
			}
		}
	}
	return nil
}
func (dd *DNSDiscovery) refresh(name string, ch chan<- []*config.TargetGroup) error {
	response, err := lookupAll(name, dd.qtype)
	dnsSDLookupsCount.Inc()
	if err != nil {
		dnsSDLookupFailuresCount.Inc()
		return err
	}

	tg := &config.TargetGroup{}
	for _, record := range response.Answer {
		target := model.LabelValue("")
		switch addr := record.(type) {
		case *dns.SRV:
			// Remove the final dot from rooted DNS names to make them look more usual.
			addr.Target = strings.TrimRight(addr.Target, ".")

			target = model.LabelValue(fmt.Sprintf("%s:%d", addr.Target, addr.Port))
		case *dns.A:
			target = model.LabelValue(fmt.Sprintf("%s:%d", addr.A, dd.port))
		case *dns.AAAA:
			target = model.LabelValue(fmt.Sprintf("%s:%d", addr.AAAA, dd.port))
		default:
			log.Warnf("%q is not a valid SRV record", record)
			continue
		}
		tg.Targets = append(tg.Targets, model.LabelSet{
			model.AddressLabel: target,
			dnsNameLabel:       model.LabelValue(name),
		})
	}

	tg.Source = name
	ch <- []*config.TargetGroup{tg}

	return nil
}
// Notify calls the underlying notifier with exponential backoff until it succeeds.
// It aborts if the context is canceled or timed out.
func (n *RetryNotifier) Notify(ctx context.Context, alerts ...*types.Alert) error {
	var (
		i    = 0
		b    = backoff.NewExponentialBackOff()
		tick = backoff.NewTicker(b)
	)
	defer tick.Stop()

	for {
		i++
		select {
		case <-tick.C:
			if err := n.notifier.Notify(ctx, alerts...); err != nil {
				log.Warnf("Notify attempt %d failed: %s", i, err)
			} else {
				return nil
			}
		case <-ctx.Done():
			return ctx.Err()
		}
	}
}
// sanitizeSeries sanitizes a series based on its series file as defined by the
// provided directory and FileInfo. The method returns the fingerprint as
// derived from the directory and file name, and whether the provided file has
// been sanitized. A file that failed to be sanitized is moved into the
// "orphaned" sub-directory, if possible.
//
// The following steps are performed:
//
// - A file whose name doesn't comply with the naming scheme of a series file is
//   simply moved into the orphaned directory.
//
// - If the size of the series file isn't a multiple of the chunk size,
//   extraneous bytes are truncated. If the truncation fails, the file is
//   moved into the orphaned directory.
//
// - A file that is empty (after truncation) is deleted.
//
// - A series that is not archived (i.e. it is in the fingerprintToSeries map)
//   is checked for consistency of its various parameters (like persist
//   watermark, offset of chunkDescs etc.). In particular, overlap between an
//   in-memory head chunk with the most recent persisted chunk is
//   checked. Inconsistencies are rectified.
//
// - A series that is archived (i.e. it is not in the fingerprintToSeries map)
//   is checked for its presence in the index of archived series. If it cannot
//   be found there, it is moved into the orphaned directory.
func (p *persistence) sanitizeSeries(
	dirname string, fi os.FileInfo,
	fingerprintToSeries map[model.Fingerprint]*memorySeries,
	fpm fpMappings,
) (model.Fingerprint, bool) {
	var (
		fp       model.Fingerprint
		err      error
		filename = filepath.Join(dirname, fi.Name())
		s        *memorySeries
	)

	purge := func() {
		if fp != 0 {
			var metric model.Metric
			if s != nil {
				metric = s.metric
			}
			if err = p.quarantineSeriesFile(
				fp, errors.New("purge during crash recovery"), metric,
			); err == nil {
				return
			}
			log.
				With("file", filename).
				With("error", err).
				Error("Failed to move lost series file to orphaned directory.")
		}
		// If we are here, we are either purging an incorrectly named
		// file, or quarantining has failed. So simply delete the file.
		if err = os.Remove(filename); err != nil {
			log.
				With("file", filename).
				With("error", err).
				Error("Failed to delete lost series file.")
		}
	}

	if len(fi.Name()) != fpLen-seriesDirNameLen+len(seriesFileSuffix) ||
		!strings.HasSuffix(fi.Name(), seriesFileSuffix) {
		log.Warnf("Unexpected series file name %s.", filename)
		purge()
		return fp, false
	}
	if fp, err = model.FingerprintFromString(filepath.Base(dirname) + fi.Name()[:fpLen-seriesDirNameLen]); err != nil {
		log.Warnf("Error parsing file name %s: %s", filename, err)
		purge()
		return fp, false
	}

	bytesToTrim := fi.Size() % int64(chunkLenWithHeader)
	chunksInFile := int(fi.Size()) / chunkLenWithHeader
	modTime := fi.ModTime()
	if bytesToTrim != 0 {
		log.Warnf(
			"Truncating file %s to exactly %d chunks, trimming %d extraneous bytes.",
			filename, chunksInFile, bytesToTrim,
		)
		f, err := os.OpenFile(filename, os.O_WRONLY, 0640)
		if err != nil {
			log.Errorf("Could not open file %s: %s", filename, err)
			purge()
			return fp, false
		}
		if err := f.Truncate(fi.Size() - bytesToTrim); err != nil {
			log.Errorf("Failed to truncate file %s: %s", filename, err)
			purge()
			return fp, false
		}
	}
	if chunksInFile == 0 {
		log.Warnf("No chunks left in file %s.", filename)
		purge()
		return fp, false
	}

	s, ok := fingerprintToSeries[fp]
	if ok { // This series is supposed to not be archived.
		if s == nil {
			panic("fingerprint mapped to nil pointer")
		}
		maybeAddMapping(fp, s.metric, fpm)
		if !p.pedanticChecks &&
			bytesToTrim == 0 &&
			s.chunkDescsOffset != -1 &&
			chunksInFile == s.chunkDescsOffset+s.persistWatermark &&
			modTime.Equal(s.modTime) {
			// Everything is consistent. We are good.
			return fp, true
		}
		// If we are here, we cannot be sure the series file is
		// consistent with the checkpoint, so we have to take a closer
		// look.
		if s.headChunkClosed {
			// This is the easy case as we have all chunks on
			// disk. Treat this series as a freshly unarchived one
			// by loading the chunkDescs and setting all parameters
			// based on the loaded chunkDescs.
			cds, err := p.loadChunkDescs(fp, 0)
			if err != nil {
				log.Errorf(
					"Failed to load chunk descriptors for metric %v, fingerprint %v: %s",
					s.metric, fp, err,
				)
				purge()
				return fp, false
			}
			log.Warnf(
				"Treating recovered metric %v, fingerprint %v, as freshly unarchived, with %d chunks in series file.",
				s.metric, fp, len(cds),
			)
			s.chunkDescs = cds
			s.chunkDescsOffset = 0
			s.savedFirstTime = cds[0].FirstTime()
			s.lastTime, err = cds[len(cds)-1].LastTime()
			if err != nil {
				log.Errorf(
					"Failed to determine time of the last sample for metric %v, fingerprint %v: %s",
					s.metric, fp, err,
				)
				purge()
				return fp, false
			}
			s.persistWatermark = len(cds)
			s.modTime = modTime
			return fp, true
		}
		// This is the tricky one: We have chunks from heads.db, but
		// some of those chunks might already be in the series
		// file. Strategy: Take the last time of the most recent chunk
		// in the series file. Then find the oldest chunk among those
		// from heads.db that has a first time later or equal to the
		// last time from the series file. Throw away the older chunks
		// from heads.db and stitch the parts together.

		// First, throw away the chunkDescs without chunks.
		s.chunkDescs = s.chunkDescs[s.persistWatermark:]
		chunk.NumMemDescs.Sub(float64(s.persistWatermark))
		cds, err := p.loadChunkDescs(fp, 0)
		if err != nil {
			log.Errorf(
				"Failed to load chunk descriptors for metric %v, fingerprint %v: %s",
				s.metric, fp, err,
			)
			purge()
			return fp, false
		}
		s.persistWatermark = len(cds)
		s.chunkDescsOffset = 0
		s.savedFirstTime = cds[0].FirstTime()
		s.modTime = modTime

		lastTime, err := cds[len(cds)-1].LastTime()
		if err != nil {
			log.Errorf(
				"Failed to determine time of the last sample for metric %v, fingerprint %v: %s",
				s.metric, fp, err,
			)
			purge()
			return fp, false
		}
		keepIdx := -1
		for i, cd := range s.chunkDescs {
			if cd.FirstTime() >= lastTime {
				keepIdx = i
				break
			}
		}
		if keepIdx == -1 {
			log.Warnf(
				"Recovered metric %v, fingerprint %v: all %d chunks recovered from series file.",
				s.metric, fp, chunksInFile,
			)
			chunk.NumMemDescs.Sub(float64(len(s.chunkDescs)))
			atomic.AddInt64(&chunk.NumMemChunks, int64(-len(s.chunkDescs)))
			s.chunkDescs = cds
			s.headChunkClosed = true
			return fp, true
		}
		log.Warnf(
			"Recovered metric %v, fingerprint %v: recovered %d chunks from series file, recovered %d chunks from checkpoint.",
			s.metric, fp, chunksInFile, len(s.chunkDescs)-keepIdx,
		)
		chunk.NumMemDescs.Sub(float64(keepIdx))
		atomic.AddInt64(&chunk.NumMemChunks, int64(-keepIdx))
		if keepIdx == len(s.chunkDescs) {
			// No chunks from series file left, head chunk is evicted, so declare it closed.
			s.headChunkClosed = true
		}
		s.chunkDescs = append(cds, s.chunkDescs[keepIdx:]...)
		return fp, true
	}
	// This series is supposed to be archived.
	metric, err := p.archivedMetric(fp)
	if err != nil {
		log.Errorf(
			"Fingerprint %v assumed archived but couldn't be looked up in archived index: %s",
			fp, err,
		)
		purge()
		return fp, false
	}
	if metric == nil {
		log.Warnf(
			"Fingerprint %v assumed archived but couldn't be found in archived index.",
			fp,
		)
		purge()
		return fp, false
	}
	// This series looks like a properly archived one.
	maybeAddMapping(fp, metric, fpm)
	return fp, true
}
// alertStmt parses an alert rule.
//
//	ALERT name IF expr [FOR duration]
//		[LABELS label_set]
//		[ANNOTATIONS label_set]
//
func (p *parser) alertStmt() *AlertStmt {
	const ctx = "alert statement"

	p.expect(itemAlert, ctx)
	name := p.expect(itemIdentifier, ctx)
	// Alerts require a vector typed expression.
	p.expect(itemIf, ctx)
	expr := p.expr()

	// Optional for clause.
	var (
		duration time.Duration
		err      error
	)
	if p.peek().typ == itemFor {
		p.next()
		dur := p.expect(itemDuration, ctx)
		duration, err = parseDuration(dur.val)
		if err != nil {
			p.error(err)
		}
	}

	// Accepting WITH instead of LABELS is temporary compatibility
	// with the old alerting syntax.
	var (
		hasLabels   bool
		oldSyntax   bool
		labels      = model.LabelSet{}
		annotations = model.LabelSet{}
	)
	if t := p.peek().typ; t == itemLabels {
		p.expect(itemLabels, ctx)
		labels = p.labelSet()
		hasLabels = true
	} else if t == itemWith {
		p.expect(itemWith, ctx)
		labels = p.labelSet()
		oldSyntax = true
	}

	// Only allow old annotation syntax if new label syntax isn't used.
	if !hasLabels {
	Loop:
		for {
			switch p.next().typ {
			case itemSummary:
				annotations["summary"] = model.LabelValue(p.unquoteString(p.expect(itemString, ctx).val))
			case itemDescription:
				annotations["description"] = model.LabelValue(p.unquoteString(p.expect(itemString, ctx).val))
			case itemRunbook:
				annotations["runbook"] = model.LabelValue(p.unquoteString(p.expect(itemString, ctx).val))
			default:
				p.backup()
				break Loop
			}
		}
		if len(annotations) > 0 {
			oldSyntax = true
		}
	}

	// Only allow new annotation syntax if WITH or old annotation
	// syntax weren't used.
	if !oldSyntax {
		if p.peek().typ == itemAnnotations {
			p.expect(itemAnnotations, ctx)
			annotations = p.labelSet()
		}
	} else {
		log.Warnf("Alerting rule with old syntax found. Support for this syntax will be removed with 0.18. Please update to the new syntax.")
	}

	return &AlertStmt{
		Name:        name.val,
		Expr:        expr,
		Duration:    duration,
		Labels:      labels,
		Annotations: annotations,
	}
}
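// Illustrative only (not from the original source): a statement in the new
// syntax described by the grammar above might look like
//
//	ALERT HighErrorRate
//	  IF rate(errors_total[5m]) > 0.5
//	  FOR 10m
//	  LABELS { severity = "page" }
//	  ANNOTATIONS { summary = "High error rate observed" }
//
// whereas the deprecated form handled above uses WITH for labels and the
// SUMMARY/DESCRIPTION/RUNBOOK string keywords instead of ANNOTATIONS.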
// eval runs a single evaluation cycle in which all rules are evaluated in parallel.
// In the future a single group will be evaluated sequentially to properly handle
// rule dependency.
func (g *Group) eval() {
	var (
		now = model.Now()
		wg  sync.WaitGroup
	)

	for _, rule := range g.rules {
		rtyp := string(typeForRule(rule))
		wg.Add(1)
		// BUG(julius): Look at fixing thundering herd.
		go func(rule Rule) {
			defer wg.Done()
			defer func(t time.Time) {
				evalDuration.WithLabelValues(rtyp).Observe(time.Since(t).Seconds())
			}(time.Now())

			evalTotal.WithLabelValues(rtyp).Inc()

			vector, err := rule.eval(g.opts.Context, now, g.opts.QueryEngine, g.opts.ExternalURL.Path)
			if err != nil {
				// Canceled queries are intentional termination of queries. This normally
				// happens on shutdown and thus we skip logging of any errors here.
				if _, ok := err.(promql.ErrQueryCanceled); !ok {
					log.Warnf("Error while evaluating rule %q: %s", rule, err)
				}
				evalFailures.WithLabelValues(rtyp).Inc()
				return
			}

			if ar, ok := rule.(*AlertingRule); ok {
				g.sendAlerts(ar, now)
			}
			var (
				numOutOfOrder = 0
				numDuplicates = 0
			)
			for _, s := range vector {
				if err := g.opts.SampleAppender.Append(s); err != nil {
					switch err {
					case local.ErrOutOfOrderSample:
						numOutOfOrder++
						log.With("sample", s).With("error", err).Debug("Rule evaluation result discarded")
					case local.ErrDuplicateSampleForTimestamp:
						numDuplicates++
						log.With("sample", s).With("error", err).Debug("Rule evaluation result discarded")
					default:
						log.With("sample", s).With("error", err).Warn("Rule evaluation result discarded")
					}
				}
			}
			if numOutOfOrder > 0 {
				log.With("numDropped", numOutOfOrder).Warn("Error on ingesting out-of-order result from rule evaluation")
			}
			if numDuplicates > 0 {
				log.With("numDropped", numDuplicates).Warn("Error on ingesting results from rule evaluation with different value but same timestamp")
			}
		}(rule)
	}
	wg.Wait()
}
// eval evaluates the rule expression and then creates pending alerts and fires
// or removes previously pending alerts accordingly.
func (r *AlertingRule) eval(ts model.Time, engine *promql.Engine, externalURLPath string) (model.Vector, error) {
	query, err := engine.NewInstantQuery(r.vector.String(), ts)
	if err != nil {
		return nil, err
	}
	res, err := query.Exec().Vector()
	if err != nil {
		return nil, err
	}

	r.mtx.Lock()
	defer r.mtx.Unlock()

	// Create pending alerts for any new vector elements in the alert expression
	// or update the expression value for existing elements.
	resultFPs := map[model.Fingerprint]struct{}{}

	for _, smpl := range res {
		// Provide the alert information to the template.
		l := make(map[string]string, len(smpl.Metric))
		for k, v := range smpl.Metric {
			l[string(k)] = string(v)
		}

		tmplData := struct {
			Labels map[string]string
			Value  float64
		}{
			Labels: l,
			Value:  float64(smpl.Value),
		}
		// Inject some convenience variables that are easier to remember for users
		// who are not used to Go's templating system.
		defs := "{{$labels := .Labels}}{{$value := .Value}}"

		expand := func(text model.LabelValue) model.LabelValue {
			tmpl := template.NewTemplateExpander(
				defs+string(text),
				"__alert_"+r.Name(),
				tmplData,
				ts,
				engine,
				externalURLPath,
			)
			result, err := tmpl.Expand()
			if err != nil {
				result = fmt.Sprintf("<error expanding template: %s>", err)
				log.Warnf("Error expanding alert template %v with data '%v': %s", r.Name(), tmplData, err)
			}
			return model.LabelValue(result)
		}

		labels := make(model.LabelSet, len(smpl.Metric)+len(r.labels)+1)
		for ln, lv := range smpl.Metric {
			labels[ln] = lv
		}
		for ln, lv := range r.labels {
			labels[ln] = expand(lv)
		}
		labels[model.AlertNameLabel] = model.LabelValue(r.Name())

		annotations := make(model.LabelSet, len(r.annotations))
		for an, av := range r.annotations {
			annotations[an] = expand(av)
		}
		fp := smpl.Metric.Fingerprint()
		resultFPs[fp] = struct{}{}

		if alert, ok := r.active[fp]; ok && alert.State != StateInactive {
			alert.Value = smpl.Value
			continue
		}

		delete(smpl.Metric, model.MetricNameLabel)

		r.active[fp] = &Alert{
			Labels:      labels,
			Annotations: annotations,
			ActiveAt:    ts,
			State:       StatePending,
			Value:       smpl.Value,
		}
	}

	var vec model.Vector
	// Check if any pending alerts should be removed or fire now. Write out alert timeseries.
	for fp, a := range r.active {
		if _, ok := resultFPs[fp]; !ok {
			if a.State != StateInactive {
				vec = append(vec, r.sample(a, ts, false))
			}
			// If the alert was previously firing, keep it around for a given
			// retention time so it is reported as resolved to the AlertManager.
			if a.State == StatePending || (a.ResolvedAt != 0 && ts.Sub(a.ResolvedAt) > resolvedRetention) {
				delete(r.active, fp)
			}
			if a.State != StateInactive {
				a.State = StateInactive
				a.ResolvedAt = ts
			}
			continue
		}

		if a.State == StatePending && ts.Sub(a.ActiveAt) >= r.holdDuration {
			vec = append(vec, r.sample(a, ts, false))
			a.State = StateFiring
		}

		vec = append(vec, r.sample(a, ts, true))
	}

	return vec, nil
}
// recoverFromCrash is called by loadSeriesMapAndHeads if the persistence
// appears to be dirty after the loading (either because the loading resulted in
// an error or because the persistence was dirty from the start). Not goroutine
// safe. Only call before anything else is running (except index processing
// queue as started by newPersistence).
func (p *persistence) recoverFromCrash(fingerprintToSeries map[model.Fingerprint]*memorySeries) error {
	// TODO(beorn): We need proper tests for the crash recovery.
	log.Warn("Starting crash recovery. Prometheus is inoperational until complete.")
	log.Warn("To avoid crash recovery in the future, shut down Prometheus with SIGTERM or a HTTP POST to /-/quit.")

	fpsSeen := map[model.Fingerprint]struct{}{}
	count := 0
	seriesDirNameFmt := fmt.Sprintf("%%0%dx", seriesDirNameLen)

	// Delete the fingerprint mapping file as it might be stale or
	// corrupt. We'll rebuild the mappings as we go.
	if err := os.RemoveAll(p.mappingsFileName()); err != nil {
		return fmt.Errorf("couldn't remove old fingerprint mapping file %s: %s", p.mappingsFileName(), err)
	}
	// The mappings to rebuild.
	fpm := fpMappings{}

	log.Info("Scanning files.")
	for i := 0; i < 1<<(seriesDirNameLen*4); i++ {
		dirname := filepath.Join(p.basePath, fmt.Sprintf(seriesDirNameFmt, i))
		dir, err := os.Open(dirname)
		if os.IsNotExist(err) {
			continue
		}
		if err != nil {
			return err
		}
		for fis := []os.FileInfo{}; err != io.EOF; fis, err = dir.Readdir(1024) {
			if err != nil {
				dir.Close()
				return err
			}
			for _, fi := range fis {
				fp, ok := p.sanitizeSeries(dirname, fi, fingerprintToSeries, fpm)
				if ok {
					fpsSeen[fp] = struct{}{}
				}
				count++
				if count%10000 == 0 {
					log.Infof("%d files scanned.", count)
				}
			}
		}
		dir.Close()
	}
	log.Infof("File scan complete. %d series found.", len(fpsSeen))

	log.Info("Checking for series without series file.")
	for fp, s := range fingerprintToSeries {
		if _, seen := fpsSeen[fp]; !seen {
			// fp exists in fingerprintToSeries, but has no representation on disk.
			if s.persistWatermark == len(s.chunkDescs) {
				// Oops, everything including the head chunk was
				// already persisted, but nothing on disk.
				// Thus, we lost that series completely. Clean
				// up the remnants.
				delete(fingerprintToSeries, fp)
				if err := p.purgeArchivedMetric(fp); err != nil {
					// Purging the archived metric didn't work, so try
					// to unindex it, just in case it's in the indexes.
					p.unindexMetric(fp, s.metric)
				}
				log.Warnf("Lost series detected: fingerprint %v, metric %v.", fp, s.metric)
				continue
			}
			// If we are here, the only chunks we have are the chunks in the checkpoint.
			// Adjust things accordingly.
			if s.persistWatermark > 0 || s.chunkDescsOffset != 0 {
				minLostChunks := s.persistWatermark + s.chunkDescsOffset
				if minLostChunks <= 0 {
					log.Warnf(
						"Possible loss of chunks for fingerprint %v, metric %v.",
						fp, s.metric,
					)
				} else {
					log.Warnf(
						"Lost at least %d chunks for fingerprint %v, metric %v.",
						minLostChunks, fp, s.metric,
					)
				}
				s.chunkDescs = append(
					make([]*chunk.Desc, 0, len(s.chunkDescs)-s.persistWatermark),
					s.chunkDescs[s.persistWatermark:]...,
				)
				chunk.NumMemDescs.Sub(float64(s.persistWatermark))
				s.persistWatermark = 0
				s.chunkDescsOffset = 0
			}
			maybeAddMapping(fp, s.metric, fpm)
			fpsSeen[fp] = struct{}{} // Add so that fpsSeen is complete.
		}
	}
	log.Info("Check for series without series file complete.")

	if err := p.cleanUpArchiveIndexes(fingerprintToSeries, fpsSeen, fpm); err != nil {
		return err
	}
	if err := p.rebuildLabelIndexes(fingerprintToSeries); err != nil {
		return err
	}
	// Finally rewrite the mappings file if there are any mappings.
	if len(fpm) > 0 {
		if err := p.checkpointFPMappings(fpm); err != nil {
			return err
		}
	}

	p.dirtyMtx.Lock()
	// Only declare storage clean if it didn't become dirty during crash recovery.
	if !p.becameDirty {
		p.dirty = false
	}
	p.dirtyMtx.Unlock()

	log.Warn("Crash recovery complete.")
	return nil
}
func warnDeprecated(collector string) {
	log.Warnf("The %s collector is deprecated and will be removed in the future!", collector)
}
func (p *persistence) cleanUpArchiveIndexes(
	fpToSeries map[model.Fingerprint]*memorySeries,
	fpsSeen map[model.Fingerprint]struct{},
	fpm fpMappings,
) error {
	log.Info("Cleaning up archive indexes.")
	var fp codable.Fingerprint
	var m codable.Metric
	count := 0
	if err := p.archivedFingerprintToMetrics.ForEach(func(kv index.KeyValueAccessor) error {
		count++
		if count%10000 == 0 {
			log.Infof("%d archived metrics checked.", count)
		}
		if err := kv.Key(&fp); err != nil {
			return err
		}
		_, fpSeen := fpsSeen[model.Fingerprint(fp)]
		inMemory := false
		if fpSeen {
			_, inMemory = fpToSeries[model.Fingerprint(fp)]
		}
		if !fpSeen || inMemory {
			if inMemory {
				log.Warnf("Archive clean-up: Fingerprint %v is not archived. Purging from archive indexes.", model.Fingerprint(fp))
			}
			if !fpSeen {
				log.Warnf("Archive clean-up: Fingerprint %v is unknown. Purging from archive indexes.", model.Fingerprint(fp))
			}
			// It's fine if the fp is not in the archive indexes.
			if _, err := p.archivedFingerprintToMetrics.Delete(fp); err != nil {
				return err
			}
			// Delete from timerange index, too.
			_, err := p.archivedFingerprintToTimeRange.Delete(fp)
			return err
		}
		// fp is legitimately archived. Now we need the metric to check for a mapped fingerprint.
		if err := kv.Value(&m); err != nil {
			return err
		}
		maybeAddMapping(model.Fingerprint(fp), model.Metric(m), fpm)
		// Make sure it is in timerange index, too.
		has, err := p.archivedFingerprintToTimeRange.Has(fp)
		if err != nil {
			return err
		}
		if has {
			return nil // All good.
		}
		log.Warnf("Archive clean-up: Fingerprint %v is not in time-range index. Unarchiving it for recovery.", model.Fingerprint(fp))
		// Again, it's fine if fp is not in the archive index.
		if _, err := p.archivedFingerprintToMetrics.Delete(fp); err != nil {
			return err
		}
		cds, err := p.loadChunkDescs(model.Fingerprint(fp), 0)
		if err != nil {
			return err
		}
		series, err := newMemorySeries(model.Metric(m), cds, p.seriesFileModTime(model.Fingerprint(fp)))
		if err != nil {
			return err
		}
		fpToSeries[model.Fingerprint(fp)] = series
		return nil
	}); err != nil {
		return err
	}

	count = 0
	if err := p.archivedFingerprintToTimeRange.ForEach(func(kv index.KeyValueAccessor) error {
		count++
		if count%10000 == 0 {
			log.Infof("%d archived time ranges checked.", count)
		}
		if err := kv.Key(&fp); err != nil {
			return err
		}
		has, err := p.archivedFingerprintToMetrics.Has(fp)
		if err != nil {
			return err
		}
		if has {
			return nil // All good.
		}
		log.Warnf("Archive clean-up: Purging unknown fingerprint %v in time-range index.", fp)
		deleted, err := p.archivedFingerprintToTimeRange.Delete(fp)
		if err != nil {
			return err
		}
		if !deleted {
			log.Errorf("Fingerprint %v to be deleted from archivedFingerprintToTimeRange not found. This should never happen.", fp)
		}
		return nil
	}); err != nil {
		return err
	}
	log.Info("Clean-up of archive indexes complete.")
	return nil
}