func (sl *scrapeLoop) append(samples model.Samples) {
	var (
		numOutOfOrder = 0
		numDuplicates = 0
	)
	for _, s := range samples {
		if err := sl.appender.Append(s); err != nil {
			switch err {
			case local.ErrOutOfOrderSample:
				numOutOfOrder++
				log.With("sample", s).With("error", err).Debug("Sample discarded")
			case local.ErrDuplicateSampleForTimestamp:
				numDuplicates++
				log.With("sample", s).With("error", err).Debug("Sample discarded")
			default:
				log.With("sample", s).With("error", err).Warn("Sample discarded")
			}
		}
	}
	if numOutOfOrder > 0 {
		log.With("numDropped", numOutOfOrder).Warn("Error on ingesting out-of-order samples")
	}
	if numDuplicates > 0 {
		log.With("numDropped", numDuplicates).Warn("Error on ingesting samples with different value but same timestamp")
	}
}
func (sl *scrapeLoop) report(start time.Time, duration time.Duration, err error) {
	sl.scraper.report(start, duration, err)

	ts := model.TimeFromUnixNano(start.UnixNano())

	var health model.SampleValue
	if err == nil {
		health = 1
	}

	healthSample := &model.Sample{
		Metric: model.Metric{
			model.MetricNameLabel: scrapeHealthMetricName,
		},
		Timestamp: ts,
		Value:     health,
	}
	durationSample := &model.Sample{
		Metric: model.Metric{
			model.MetricNameLabel: scrapeDurationMetricName,
		},
		Timestamp: ts,
		Value:     model.SampleValue(float64(duration) / float64(time.Second)),
	}

	if err := sl.reportAppender.Append(healthSample); err != nil {
		log.With("sample", healthSample).With("error", err).Warn("Scrape health sample discarded")
	}
	if err := sl.reportAppender.Append(durationSample); err != nil {
		log.With("sample", durationSample).With("error", err).Warn("Scrape duration sample discarded")
	}
}
// logThrottling handles logging of throttled events and has to be started as a
// goroutine. It stops once s.loopStopping is closed.
//
// Logging strategy: Whenever Throttle() is called and returns true, a signal
// is sent to s.throttled. If that happens for the first time, an Error is
// logged that the storage is now throttled. As long as signals continue to be
// sent via s.throttled at least once per minute, nothing else is logged. Once
// no signal has arrived for a minute, an Info is logged that the storage is not
// throttled anymore. This resets things to the initial state, i.e. once a
// signal arrives again, the Error will be logged again.
func (s *memorySeriesStorage) logThrottling() {
	timer := time.NewTimer(time.Minute)
	timer.Stop()

	// Signal exit of the goroutine. Currently only needed by test code.
	defer close(s.logThrottlingStopped)

	for {
		select {
		case <-s.throttled:
			if !timer.Reset(time.Minute) {
				log.
					With("chunksToPersist", s.getNumChunksToPersist()).
					With("maxChunksToPersist", s.maxChunksToPersist).
					With("memoryChunks", atomic.LoadInt64(&numMemChunks)).
					With("maxToleratedMemChunks", int(float64(s.maxMemoryChunks)*toleranceFactorMemChunks)).
					Error("Storage needs throttling. Scrapes and rule evaluations will be skipped.")
			}
		case <-timer.C:
			log.
				With("chunksToPersist", s.getNumChunksToPersist()).
				With("maxChunksToPersist", s.maxChunksToPersist).
				With("memoryChunks", atomic.LoadInt64(&numMemChunks)).
				With("maxToleratedMemChunks", int(float64(s.maxMemoryChunks)*toleranceFactorMemChunks)).
				Info("Storage does not need throttling anymore.")
		case <-s.loopStopping:
			return
		}
	}
}
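// The Reset-return-value trick above is easy to miss, so here is a minimal,
// self-contained sketch of the same pattern in isolation (hypothetical names,
// plain fmt output instead of the structured logger; an illustration, not code
// from the storage package): log an error once when throttling starts, stay
// silent while signals keep arriving, and log recovery after a quiet minute.
package main

import (
	"fmt"
	"time"
)

func watchThrottling(throttled, stop <-chan struct{}) {
	timer := time.NewTimer(time.Minute)
	timer.Stop() // Start disarmed; it is armed by the first throttle signal.

	for {
		select {
		case <-throttled:
			// Reset returns false if the timer was stopped or had expired,
			// i.e. this is the first signal after a quiet period: log once.
			if !timer.Reset(time.Minute) {
				fmt.Println("ERROR: storage needs throttling")
			}
		case <-timer.C:
			// No throttle signal for a whole minute: log recovery and fall
			// back to the initial, disarmed state.
			fmt.Println("INFO: storage no longer needs throttling")
		case <-stop:
			return
		}
	}
}

func main() {
	throttled := make(chan struct{})
	stop := make(chan struct{})
	go watchThrottling(throttled, stop)

	throttled <- struct{}{} // First signal: the error is logged once.
	throttled <- struct{}{} // Timer still armed: nothing is logged.
	time.Sleep(10 * time.Millisecond)
	close(stop)
}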
// Notify implements the Notifier interface.
func (n *Pushover) Notify(ctx context.Context, as ...*types.Alert) error {
	key, ok := GroupKey(ctx)
	if !ok {
		return fmt.Errorf("group key missing")
	}
	data := n.tmpl.Data(receiver(ctx), groupLabels(ctx), as...)

	log.With("incident", key).Debugln("notifying Pushover")

	var err error
	tmpl := tmplText(n.tmpl, data, &err)

	parameters := url.Values{}
	parameters.Add("token", tmpl(string(n.conf.Token)))
	parameters.Add("user", tmpl(string(n.conf.UserKey)))

	title := tmpl(n.conf.Title)
	message := tmpl(n.conf.Message)

	if len(title) > 512 {
		title = title[:512]
		log.With("incident", key).Debugf("Truncated title to %q due to Pushover message limit", title)
	}
	// Add the title only after truncating it so that the truncated version is
	// what actually gets sent.
	parameters.Add("title", title)

	if len(title)+len(message) > 512 {
		message = message[:512-len(title)]
		log.With("incident", key).Debugf("Truncated message to %q due to Pushover message limit", message)
	}
	message = strings.TrimSpace(message)
	if message == "" {
		// Pushover rejects empty messages.
		message = "(no details)"
	}
	parameters.Add("message", message)

	parameters.Add("url", tmpl(n.conf.URL))
	parameters.Add("priority", tmpl(n.conf.Priority))
	parameters.Add("retry", fmt.Sprintf("%d", int64(time.Duration(n.conf.Retry).Seconds())))
	parameters.Add("expire", fmt.Sprintf("%d", int64(time.Duration(n.conf.Expire).Seconds())))

	apiURL := "https://api.pushover.net/1/messages.json"
	u, err := url.Parse(apiURL)
	if err != nil {
		return err
	}
	u.RawQuery = parameters.Encode()
	log.With("incident", key).Debugf("Pushover URL = %q", u.String())

	resp, err := ctxhttp.Post(ctx, http.DefaultClient, u.String(), "text/plain", nil)
	if err != nil {
		return err
	}
	defer resp.Body.Close()

	if resp.StatusCode/100 != 2 {
		body, err := ioutil.ReadAll(resp.Body)
		if err != nil {
			return err
		}
		return fmt.Errorf("unexpected status code %v (body: %s)", resp.StatusCode, string(body))
	}
	return nil
}
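// A minimal sketch of the 512-character budget used above (hypothetical helper
// name, not part of the notifier): the title is capped at 512 characters, the
// message may only use whatever remains of the same 512-character budget, and
// an emptied message is replaced because Pushover rejects empty messages.
package main

import (
	"fmt"
	"strings"
)

func truncateForPushover(title, message string) (string, string) {
	if len(title) > 512 {
		title = title[:512]
	}
	if len(title)+len(message) > 512 {
		message = message[:512-len(title)]
	}
	message = strings.TrimSpace(message)
	if message == "" {
		message = "(no details)" // Pushover rejects empty messages.
	}
	return title, message
}

func main() {
	title, message := truncateForPushover(strings.Repeat("t", 600), "details")
	// The oversized title is cut to 512 characters and uses up the whole
	// budget, so the message falls back to the placeholder.
	fmt.Println(len(title), message) // 512 (no details)
}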
// calculatePersistenceUrgencyScore calculates and returns an urgency score for
// the speed of persisting chunks. The score is between 0 and 1, where 0 means
// no urgency at all and 1 means highest urgency.
//
// The score is the maximum of the two following sub-scores:
//
// (1) The first sub-score is the number of chunks waiting for persistence
// divided by the maximum number of chunks allowed to be waiting for
// persistence.
//
// (2) If there are more chunks in memory than allowed AND there are more chunks
// waiting for persistence than factorMinChunksToPersist times
// -storage.local.max-chunks-to-persist, then the second sub-score is the
// fraction the number of memory chunks has reached between
// -storage.local.memory-chunks and toleranceFactorMemChunks times
// -storage.local.memory-chunks.
//
// Should the score ever hit persintenceUrgencyScoreForEnteringRushedMode, the
// storage locks into "rushed mode", in which the returned score is always
// bumped up to 1 until the non-bumped score is below
// persintenceUrgencyScoreForLeavingRushedMode.
//
// This method is not goroutine-safe, but it is only ever called by the single
// goroutine that is in charge of series maintenance. According to the returned
// score, series maintenance should be sped up. If a score of 1 is returned,
// checkpointing based on dirty-series count should be disabled, and series
// files should not be synced anymore provided the user has specified the
// adaptive sync strategy.
func (s *memorySeriesStorage) calculatePersistenceUrgencyScore() float64 {
	s.rushedMtx.Lock()
	defer s.rushedMtx.Unlock()

	var (
		chunksToPersist    = float64(s.getNumChunksToPersist())
		maxChunksToPersist = float64(s.maxChunksToPersist)
		memChunks          = float64(atomic.LoadInt64(&numMemChunks))
		maxMemChunks       = float64(s.maxMemoryChunks)
	)
	score := chunksToPersist / maxChunksToPersist
	if chunksToPersist > maxChunksToPersist*factorMinChunksToPersist {
		score = math.Max(
			score,
			(memChunks/maxMemChunks-1)/(toleranceFactorMemChunks-1),
		)
	}
	if score > 1 {
		score = 1
	}
	s.persistenceUrgencyScore.Set(score)

	if s.rushed {
		// We are already in rushed mode. If the score is still above
		// persintenceUrgencyScoreForLeavingRushedMode, return 1 and
		// leave things as they are.
		if score > persintenceUrgencyScoreForLeavingRushedMode {
			return 1
		}
		// We are out of rushed mode!
		s.rushed = false
		s.rushedMode.Set(0)
		log.
			With("urgencyScore", score).
			With("chunksToPersist", int(chunksToPersist)).
			With("maxChunksToPersist", int(maxChunksToPersist)).
			With("memoryChunks", int(memChunks)).
			With("maxMemoryChunks", int(maxMemChunks)).
			Info("Storage has left rushed mode.")
		return score
	}
	if score > persintenceUrgencyScoreForEnteringRushedMode {
		// Enter rushed mode.
		s.rushed = true
		s.rushedMode.Set(1)
		log.
			With("urgencyScore", score).
			With("chunksToPersist", int(chunksToPersist)).
			With("maxChunksToPersist", int(maxChunksToPersist)).
			With("memoryChunks", int(memChunks)).
			With("maxMemoryChunks", int(maxMemChunks)).
			Warn("Storage has entered rushed mode.")
		return 1
	}
	return score
}
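// A rough worked example of the scoring formula above, with hypothetical
// numbers and the two tuning constants assumed here to be
// factorMinChunksToPersist = 0.2 and toleranceFactorMemChunks = 1.1 (an
// illustration, not code from the storage package): 800k of 1M allowed waiting
// chunks gives sub-score (1) = 0.8; 1.2M memory chunks against a 1M limit
// gives sub-score (2) = (1.2-1)/(1.1-1) = 2, so the capped result is 1.
package main

import (
	"fmt"
	"math"
)

func urgencyScore(chunksToPersist, maxChunksToPersist, memChunks, maxMemChunks, factorMin, tolerance float64) float64 {
	score := chunksToPersist / maxChunksToPersist // Sub-score (1).
	if chunksToPersist > maxChunksToPersist*factorMin {
		// Sub-score (2): how far the memory-chunk count has pushed into the
		// tolerated range above the configured limit.
		score = math.Max(score, (memChunks/maxMemChunks-1)/(tolerance-1))
	}
	return math.Min(score, 1)
}

func main() {
	fmt.Println(urgencyScore(800000, 1000000, 1200000, 1000000, 0.2, 1.1)) // 1
	fmt.Println(urgencyScore(100000, 1000000, 900000, 1000000, 0.2, 1.1))  // 0.1
}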
// loadSeriesMapAndHeads loads the fingerprint to memory-series mapping and all
// the chunks contained in the checkpoint (and thus not yet persisted to series
// files). The method is capable of loading the checkpoint format v1 and v2. If
// recoverable corruption is detected, or if the dirty flag was set from the
// beginning, crash recovery is run, which might take a while. If an
// unrecoverable error is encountered, it is returned. Call this method during
// start-up while nothing else is running in storage land. This method is
// utterly goroutine-unsafe.
func (p *persistence) loadSeriesMapAndHeads() (sm *seriesMap, chunksToPersist int64, err error) {
	fingerprintToSeries := make(map[model.Fingerprint]*memorySeries)
	sm = &seriesMap{m: fingerprintToSeries}

	defer func() {
		if p.dirty {
			log.Warn("Persistence layer appears dirty.")
			p.startedDirty.Set(1)
			err = p.recoverFromCrash(fingerprintToSeries)
			if err != nil {
				sm = nil
			}
		} else {
			p.startedDirty.Set(0)
		}
	}()

	hs := newHeadsScanner(p.headsFileName())
	defer hs.close()
	for hs.scan() {
		fingerprintToSeries[hs.fp] = hs.series
	}
	if os.IsNotExist(hs.err) {
		return sm, 0, nil
	}
	if hs.err != nil {
		p.dirty = true
		log.
			With("file", p.headsFileName()).
			With("error", hs.err).
			Error("Error reading heads file.")
		return sm, 0, hs.err
	}
	return sm, hs.chunksToPersistTotal, nil
}
// Notify implements the Notifier interface.
func (n *OpsGenie) Notify(ctx context.Context, as ...*types.Alert) error {
	key, ok := GroupKey(ctx)
	if !ok {
		return fmt.Errorf("group key missing")
	}
	data := n.tmpl.Data(receiver(ctx), groupLabels(ctx), as...)

	log.With("incident", key).Debugln("notifying OpsGenie")

	var err error
	tmpl := tmplText(n.tmpl, data, &err)

	details := make(map[string]string, len(n.conf.Details))
	for k, v := range n.conf.Details {
		details[k] = tmpl(v)
	}

	var (
		msg    interface{}
		apiURL string

		apiMsg = opsGenieMessage{
			APIKey: string(n.conf.APIKey),
			Alias:  key,
		}
		alerts = types.Alerts(as...)
	)
	switch alerts.Status() {
	case model.AlertResolved:
		apiURL = n.conf.APIHost + "v1/json/alert/close"
		msg = &opsGenieCloseMessage{&apiMsg}
	default:
		apiURL = n.conf.APIHost + "v1/json/alert"
		msg = &opsGenieCreateMessage{
			opsGenieMessage: &apiMsg,
			Message:         tmpl(n.conf.Description),
			Details:         details,
			Source:          tmpl(n.conf.Source),
		}
	}
	if err != nil {
		return fmt.Errorf("templating error: %s", err)
	}

	var buf bytes.Buffer
	if err := json.NewEncoder(&buf).Encode(msg); err != nil {
		return err
	}

	resp, err := ctxhttp.Post(ctx, http.DefaultClient, apiURL, contentTypeJSON, &buf)
	if err != nil {
		return err
	}
	resp.Body.Close()

	if resp.StatusCode/100 != 2 {
		return fmt.Errorf("unexpected status code %v", resp.StatusCode)
	}
	return nil
}
// Sync extracts a deduplicated set of Alertmanager endpoints from a list
// of target group definitions.
func (s *alertmanagerSet) Sync(tgs []*config.TargetGroup) {
	all := []alertmanager{}

	for _, tg := range tgs {
		ams, err := alertmanagerFromGroup(tg, s.cfg)
		if err != nil {
			log.With("err", err).Error("generating discovered Alertmanagers failed")
			continue
		}
		all = append(all, ams...)
	}

	s.mtx.Lock()
	defer s.mtx.Unlock()

	// Set new Alertmanagers and deduplicate them along their unique URL.
	s.ams = []alertmanager{}
	seen := map[string]struct{}{}

	for _, am := range all {
		us := am.url()
		if _, ok := seen[us]; ok {
			continue
		}
		seen[us] = struct{}{}
		s.ams = append(s.ams, am)
	}
}
func init() {
	runtime.ErrorHandlers = []func(error){
		func(err error) {
			log.With("component", "kube_client_runtime").Errorln(err)
		},
	}
}
// NewDispatcher returns a new Dispatcher.
func NewDispatcher(ap provider.Alerts, r *Route, n notify.Notifier, mk types.Marker) *Dispatcher {
	disp := &Dispatcher{
		alerts:   ap,
		notifier: n,
		route:    r,
		marker:   mk,
		log:      log.With("component", "dispatcher"),
	}
	return disp
}
// Notify implements the Notifier interface.
//
// http://developer.pagerduty.com/documentation/integration/events/trigger
func (n *PagerDuty) Notify(ctx context.Context, as ...*types.Alert) error {
	key, ok := GroupKey(ctx)
	if !ok {
		return fmt.Errorf("group key missing")
	}

	var err error
	var (
		alerts    = types.Alerts(as...)
		data      = n.tmpl.Data(receiver(ctx), groupLabels(ctx), as...)
		tmpl      = tmplText(n.tmpl, data, &err)
		eventType = pagerDutyEventTrigger
	)
	if alerts.Status() == model.AlertResolved {
		eventType = pagerDutyEventResolve
	}

	log.With("incident", key).With("eventType", eventType).Debugln("notifying PagerDuty")

	details := make(map[string]string, len(n.conf.Details))
	for k, v := range n.conf.Details {
		details[k] = tmpl(v)
	}

	msg := &pagerDutyMessage{
		ServiceKey:  tmpl(string(n.conf.ServiceKey)),
		EventType:   eventType,
		IncidentKey: key,
		Description: tmpl(n.conf.Description),
		Details:     details,
	}
	if eventType == pagerDutyEventTrigger {
		msg.Client = tmpl(n.conf.Client)
		msg.ClientURL = tmpl(n.conf.ClientURL)
	}
	if err != nil {
		return err
	}

	var buf bytes.Buffer
	if err := json.NewEncoder(&buf).Encode(msg); err != nil {
		return err
	}

	resp, err := ctxhttp.Post(ctx, http.DefaultClient, n.conf.URL, contentTypeJSON, &buf)
	if err != nil {
		return err
	}
	resp.Body.Close()

	if resp.StatusCode/100 != 2 {
		return fmt.Errorf("unexpected status code %v", resp.StatusCode)
	}
	return nil
}
// sendAll sends the alerts to all configured Alertmanagers concurrently.
// It returns the number of sends that have failed.
func (n *Notifier) sendAll(alerts ...*model.Alert) int {
	begin := time.Now()

	// Attach external labels before sending alerts.
	for _, a := range alerts {
		for ln, lv := range n.opts.ExternalLabels {
			if _, ok := a.Labels[ln]; !ok {
				a.Labels[ln] = lv
			}
		}
	}

	b, err := json.Marshal(alerts)
	if err != nil {
		log.Errorf("Encoding alerts failed: %s", err)
		return len(n.opts.AlertmanagerURLs)
	}

	ctx, cancel := context.WithTimeout(context.Background(), n.opts.Timeout)
	defer cancel()

	send := func(u string) error {
		resp, err := ctxhttp.Post(ctx, http.DefaultClient, postURL(u), contentTypeJSON, bytes.NewReader(b))
		if err != nil {
			return err
		}
		defer resp.Body.Close()

		if resp.StatusCode/100 != 2 {
			return fmt.Errorf("bad response status %v", resp.Status)
		}
		return err
	}

	var (
		wg        sync.WaitGroup
		numErrors uint64
	)
	for _, u := range n.opts.AlertmanagerURLs {
		wg.Add(1)
		go func(u string) {
			if err := send(u); err != nil {
				log.With("alertmanager", u).With("count", fmt.Sprintf("%d", len(alerts))).Errorf("Error sending alerts: %s", err)
				n.errors.WithLabelValues(u).Inc()
				atomic.AddUint64(&numErrors, 1)
			}
			n.latency.WithLabelValues(u).Observe(float64(time.Since(begin)) / float64(time.Second))
			n.sent.WithLabelValues(u).Add(float64(len(alerts)))

			wg.Done()
		}(u)
	}
	wg.Wait()

	return int(numErrors)
}
// setDirty flags the storage as dirty in a goroutine-safe way. The provided
// error will be logged as a reason the first time the storage is flagged as dirty.
func (p *persistence) setDirty(err error) {
	p.dirtyCounter.Inc()
	p.dirtyMtx.Lock()
	defer p.dirtyMtx.Unlock()
	if p.becameDirty {
		return
	}
	p.dirty = true
	p.becameDirty = true
	log.With("error", err).Error("The storage is now inconsistent. Restart Prometheus ASAP to initiate recovery.")
}
func serveStaticAsset(w http.ResponseWriter, req *http.Request) {
	fp := route.Param(route.Context(req), "filepath")
	fp = filepath.Join("web/ui/static", fp)

	info, err := ui.AssetInfo(fp)
	if err != nil {
		log.With("file", fp).Warn("Could not get file info: ", err)
		w.WriteHeader(http.StatusNotFound)
		return
	}
	file, err := ui.Asset(fp)
	if err != nil {
		if err != io.EOF {
			log.With("file", fp).Warn("Could not get file: ", err)
		}
		w.WriteHeader(http.StatusNotFound)
		return
	}

	http.ServeContent(w, req, info.Name(), info.ModTime(), bytes.NewReader(file))
}
func lookupAll(name string, qtype uint16) (*dns.Msg, error) {
	conf, err := dns.ClientConfigFromFile(resolvConf)
	if err != nil {
		return nil, fmt.Errorf("could not load resolv.conf: %s", err)
	}

	client := &dns.Client{}
	response := &dns.Msg{}

	for _, server := range conf.Servers {
		servAddr := net.JoinHostPort(server, conf.Port)
		for _, suffix := range conf.Search {
			response, err = lookup(name, qtype, client, servAddr, suffix, false)
			if err != nil {
				log.
					With("server", server).
					With("name", name).
					With("suffix", suffix).
					With("reason", err).
					Warn("DNS resolution failed.")
				continue
			}
			if len(response.Answer) > 0 {
				return response, nil
			}
		}

		response, err = lookup(name, qtype, client, servAddr, "", false)
		if err == nil {
			return response, nil
		}
		log.
			With("server", server).
			With("name", name).
			With("reason", err).
			Warn("DNS resolution failed.")
	}
	return response, fmt.Errorf("could not resolve %s: no server responded", name)
}
// quarantineSeries registers the provided fingerprint for quarantining. It
// always returns immediately. Quarantine requests are processed
// asynchronously. If there are too many requests queued, they are simply
// dropped.
//
// Quarantining means that the series file is moved to the orphaned directory,
// and all its traces are removed from indices. Call this method if an
// unrecoverable error is detected while dealing with a series, and pass in the
// encountered error. It will be saved as a hint in the orphaned directory.
func (s *memorySeriesStorage) quarantineSeries(fp model.Fingerprint, metric model.Metric, err error) {
	req := quarantineRequest{fp: fp, metric: metric, reason: err}
	select {
	case s.quarantineRequests <- req:
		// Request submitted.
	default:
		log.
			With("fingerprint", fp).
			With("metric", metric).
			With("reason", err).
			Warn("Quarantine queue full. Dropped quarantine request.")
		s.seriesOps.WithLabelValues(droppedQuarantine).Inc()
	}
}
// newAggrGroup returns a new aggregation group.
func newAggrGroup(ctx context.Context, labels model.LabelSet, opts *RouteOpts) *aggrGroup {
	ag := &aggrGroup{
		labels: labels,
		opts:   opts,
		alerts: map[model.Fingerprint]*types.Alert{},
	}
	ag.ctx, ag.cancel = context.WithCancel(ctx)

	ag.log = log.With("aggrGroup", ag)

	// Set an initial one-time wait before flushing
	// the first batch of notifications.
	ag.next = time.NewTimer(ag.opts.GroupWait)

	return ag
}
func (sl *scrapeLoop) append(samples model.Samples) {
	numOutOfOrder := 0

	for _, s := range samples {
		if err := sl.appender.Append(s); err != nil {
			if err == local.ErrOutOfOrderSample {
				numOutOfOrder++
			} else {
				log.Warnf("Error inserting sample: %s", err)
			}
		}
	}
	if numOutOfOrder > 0 {
		log.With("numDropped", numOutOfOrder).Warn("Error on ingesting out-of-order samples")
	}
}
// sendAll sends the alerts to all configured Alertmanagers concurrently.
// It returns true if the alerts could be sent successfully to at least one Alertmanager.
func (n *Notifier) sendAll(alerts ...*model.Alert) bool {
	begin := time.Now()

	b, err := json.Marshal(alerts)
	if err != nil {
		log.Errorf("Encoding alerts failed: %s", err)
		return false
	}

	n.mtx.RLock()
	amSets := n.alertmanagers
	n.mtx.RUnlock()

	var (
		wg         sync.WaitGroup
		numSuccess uint64
	)
	for _, ams := range amSets {
		ams.mtx.RLock()

		for _, am := range ams.ams {
			wg.Add(1)

			ctx, cancel := context.WithTimeout(n.ctx, ams.cfg.Timeout)
			defer cancel()

			go func(am alertmanager) {
				u := am.url()

				if err := n.sendOne(ctx, ams.client, u, b); err != nil {
					log.With("alertmanager", u).With("count", len(alerts)).Errorf("Error sending alerts: %s", err)
					n.errors.WithLabelValues(u).Inc()
				} else {
					atomic.AddUint64(&numSuccess, 1)
				}
				n.latency.WithLabelValues(u).Observe(time.Since(begin).Seconds())
				n.sent.WithLabelValues(u).Add(float64(len(alerts)))

				wg.Done()
			}(am)
		}

		ams.mtx.RUnlock()
	}
	wg.Wait()

	return numSuccess > 0
}
// NewDispatcher returns a new Dispatcher.
func NewDispatcher(
	ap provider.Alerts,
	r *Route,
	s notify.Stage,
	mk types.Marker,
	to func(time.Duration) time.Duration,
) *Dispatcher {
	disp := &Dispatcher{
		alerts:  ap,
		stage:   s,
		route:   r,
		marker:  mk,
		timeout: to,
		log:     log.With("component", "dispatcher"),
	}
	return disp
}
func (s *memorySeriesStorage) handleQuarantine() {
	for {
		select {
		case req := <-s.quarantineRequests:
			s.purgeSeries(req.fp, req.metric, req.reason)
			log.
				With("fingerprint", req.fp).
				With("metric", req.metric).
				With("reason", req.reason).
				Warn("Series quarantined.")
		case <-s.quarantineStopping:
			log.Info("Series quarantining stopped.")
			close(s.quarantineStopped)
			return
		}
	}
}
func serveAsset(w http.ResponseWriter, req *http.Request, fp string) {
	info, err := ui.AssetInfo(fp)
	if err != nil {
		log.With("file", fp).Warn("Could not get file info: ", err)
		w.WriteHeader(http.StatusNotFound)
		return
	}
	file, err := ui.Asset(fp)
	if err != nil {
		if err != io.EOF {
			log.With("file", fp).Warn("Could not get file: ", err)
		}
		w.WriteHeader(http.StatusNotFound)
		return
	}

	http.ServeContent(w, req, info.Name(), info.ModTime(), bytes.NewReader(file))
}
// Sync converts target groups into actual scrape targets and synchronizes
// the currently running scraper with the resulting set.
func (sp *scrapePool) Sync(tgs []*config.TargetGroup) {
	start := time.Now()

	var all []*Target
	for _, tg := range tgs {
		targets, err := targetsFromGroup(tg, sp.config)
		if err != nil {
			log.With("err", err).Error("creating targets failed")
			continue
		}
		all = append(all, targets...)
	}
	sp.sync(all)

	targetSyncIntervalLength.WithLabelValues(sp.config.JobName).Observe(
		time.Since(start).Seconds(),
	)
	targetScrapePoolSyncsCounter.WithLabelValues(sp.config.JobName).Inc()
}
// purgeSeries removes all traces of a series. If a non-nil quarantine reason is
// provided, the series file will not be deleted completely, but moved to the
// orphaned directory with the reason and the metric in a hint file. The
// provided metric might be nil if unknown.
func (s *memorySeriesStorage) purgeSeries(fp model.Fingerprint, m model.Metric, quarantineReason error) {
	s.fpLocker.Lock(fp)

	var (
		series *memorySeries
		ok     bool
	)

	if series, ok = s.fpToSeries.get(fp); ok {
		s.fpToSeries.del(fp)
		s.numSeries.Dec()
		m = series.metric

		// Adjust s.numChunksToPersist and numMemChunks down by
		// the number of chunks in this series that are not
		// persisted yet. Persisted chunks will be deducted from
		// numMemChunks upon eviction.
		numChunksNotYetPersisted := len(series.chunkDescs) - series.persistWatermark
		atomic.AddInt64(&numMemChunks, int64(-numChunksNotYetPersisted))
		if !series.headChunkClosed {
			// Head chunk wasn't counted as waiting for persistence yet.
			// (But it was counted as a chunk in memory.)
			numChunksNotYetPersisted--
		}
		s.incNumChunksToPersist(-numChunksNotYetPersisted)

	} else {
		s.persistence.purgeArchivedMetric(fp) // Ignoring error. There is nothing we can do.
	}
	if m != nil {
		// If we know a metric now, unindex it in any case.
		// purgeArchivedMetric might have done so already, but we cannot
		// be sure. Unindexing is idempotent, though.
		s.persistence.unindexMetric(fp, m)
	}
	// Attempt to delete/quarantine the series file in any case.
	if quarantineReason == nil {
		// No reason stated, simply delete the file.
		if _, err := s.persistence.deleteSeriesFile(fp); err != nil {
			log.
				With("fingerprint", fp).
				With("metric", m).
				With("error", err).
				Error("Error deleting series file.")
		}
		s.seriesOps.WithLabelValues(requestedPurge).Inc()
	} else {
		if err := s.persistence.quarantineSeriesFile(fp, quarantineReason, m); err == nil {
			s.seriesOps.WithLabelValues(completedQurantine).Inc()
		} else {
			s.seriesOps.WithLabelValues(failedQuarantine).Inc()
			log.
				With("fingerprint", fp).
				With("metric", m).
				With("reason", quarantineReason).
				With("error", err).
				Error("Error quarantining series file.")
		}
	}

	s.fpLocker.Unlock(fp)
}
func main() {
	flag.Parse()

	if *showVersion {
		fmt.Fprintln(os.Stdout, version.Print("alertmanager"))
		os.Exit(0)
	}

	log.Infoln("Starting alertmanager", version.Info())
	log.Infoln("Build context", version.BuildContext())

	err := os.MkdirAll(*dataDir, 0777)
	if err != nil {
		log.Fatal(err)
	}

	marker := types.NewMarker()

	alerts, err := boltmem.NewAlerts(*dataDir)
	if err != nil {
		log.Fatal(err)
	}
	defer alerts.Close()

	notifies, err := boltmem.NewNotificationInfo(*dataDir)
	if err != nil {
		log.Fatal(err)
	}
	defer notifies.Close()

	silences, err := boltmem.NewSilences(*dataDir, marker)
	if err != nil {
		log.Fatal(err)
	}
	defer silences.Close()

	var (
		inhibitor *Inhibitor
		tmpl      *template.Template
		disp      *Dispatcher
	)
	defer disp.Stop()

	api := NewAPI(alerts, silences, func() AlertOverview {
		return disp.Groups()
	})

	build := func(rcvs []*config.Receiver) notify.Notifier {
		var (
			router  = notify.Router{}
			fanouts = notify.Build(rcvs, tmpl)
		)
		for name, fo := range fanouts {
			for i, n := range fo {
				n = notify.Retry(n)
				n = notify.Log(n, log.With("step", "retry"))
				n = notify.Dedup(notifies, n)
				n = notify.Log(n, log.With("step", "dedup"))

				fo[i] = n
			}
			router[name] = fo
		}
		n := notify.Notifier(router)

		n = notify.Log(n, log.With("step", "route"))
		n = notify.Silence(silences, n, marker)
		n = notify.Log(n, log.With("step", "silence"))
		n = notify.Inhibit(inhibitor, n, marker)
		n = notify.Log(n, log.With("step", "inhibit"))

		return n
	}

	amURL, err := extURL(*externalURL)
	if err != nil {
		log.Fatal(err)
	}

	reload := func() (err error) {
		log.With("file", *configFile).Infof("Loading configuration file")
		defer func() {
			if err != nil {
				log.With("file", *configFile).Errorf("Loading configuration file failed: %s", err)
				configSuccess.Set(0)
			} else {
				configSuccess.Set(1)
				configSuccessTime.Set(float64(time.Now().Unix()))
			}
		}()

		conf, err := config.LoadFile(*configFile)
		if err != nil {
			return err
		}

		api.Update(conf.String(), time.Duration(conf.Global.ResolveTimeout))

		tmpl, err = template.FromGlobs(conf.Templates...)
		if err != nil {
			return err
		}
		tmpl.ExternalURL = amURL

		inhibitor.Stop()
		disp.Stop()

		inhibitor = NewInhibitor(alerts, conf.InhibitRules, marker)
		disp = NewDispatcher(alerts, NewRoute(conf.Route, nil), build(conf.Receivers), marker)

		go disp.Run()
		go inhibitor.Run()

		return nil
	}

	if err := reload(); err != nil {
		os.Exit(1)
	}

	router := route.New()

	webReload := make(chan struct{})
	RegisterWeb(router.WithPrefix(amURL.Path), webReload)

	api.Register(router.WithPrefix(path.Join(amURL.Path, "/api")))

	log.Infoln("Listening on", *listenAddress)
	go listen(router)

	var (
		hup      = make(chan os.Signal)
		hupReady = make(chan bool)
		term     = make(chan os.Signal)
	)
	signal.Notify(hup, syscall.SIGHUP)
	signal.Notify(term, os.Interrupt, syscall.SIGTERM)

	go func() {
		<-hupReady
		for {
			select {
			case <-hup:
			case <-webReload:
			}
			reload()
		}
	}()

	// Wait for reload or termination signals.
	close(hupReady) // Unblock SIGHUP handler.

	<-term

	log.Infoln("Received SIGTERM, exiting gracefully...")
}
// eval runs a single evaluation cycle in which all rules are evaluated in parallel.
// In the future a single group will be evaluated sequentially to properly handle
// rule dependency.
func (g *Group) eval() {
	var (
		now = model.Now()
		wg  sync.WaitGroup
	)

	for _, rule := range g.rules {
		rtyp := string(typeForRule(rule))

		wg.Add(1)
		// BUG(julius): Look at fixing thundering herd.
		go func(rule Rule) {
			defer wg.Done()

			defer func(t time.Time) {
				evalDuration.WithLabelValues(rtyp).Observe(time.Since(t).Seconds())
			}(time.Now())

			evalTotal.WithLabelValues(rtyp).Inc()

			vector, err := rule.eval(g.opts.Context, now, g.opts.QueryEngine, g.opts.ExternalURL.Path)
			if err != nil {
				// Canceled queries are intentional termination of queries. This normally
				// happens on shutdown and thus we skip logging of any errors here.
				if _, ok := err.(promql.ErrQueryCanceled); !ok {
					log.Warnf("Error while evaluating rule %q: %s", rule, err)
				}
				evalFailures.WithLabelValues(rtyp).Inc()
				return
			}

			if ar, ok := rule.(*AlertingRule); ok {
				g.sendAlerts(ar, now)
			}

			var (
				numOutOfOrder = 0
				numDuplicates = 0
			)
			for _, s := range vector {
				if err := g.opts.SampleAppender.Append(s); err != nil {
					switch err {
					case local.ErrOutOfOrderSample:
						numOutOfOrder++
						log.With("sample", s).With("error", err).Debug("Rule evaluation result discarded")
					case local.ErrDuplicateSampleForTimestamp:
						numDuplicates++
						log.With("sample", s).With("error", err).Debug("Rule evaluation result discarded")
					default:
						log.With("sample", s).With("error", err).Warn("Rule evaluation result discarded")
					}
				}
			}
			if numOutOfOrder > 0 {
				log.With("numDropped", numOutOfOrder).Warn("Error on ingesting out-of-order result from rule evaluation")
			}
			if numDuplicates > 0 {
				log.With("numDropped", numDuplicates).Warn("Error on ingesting results from rule evaluation with different value but same timestamp")
			}
		}(rule)
	}
	wg.Wait()
}
// Notify implements the Notifier interface.
func (n *OpsGenie) Notify(ctx context.Context, as ...*types.Alert) error {
	key, ok := GroupKey(ctx)
	if !ok {
		return fmt.Errorf("group key missing")
	}
	data := n.tmpl.Data(receiver(ctx), groupLabels(ctx), as...)

	log.With("incident", key).Debugln("notifying OpsGenie")

	var err error
	tmpl := tmplText(n.tmpl, data, &err)

	details := make(map[string]string, len(n.conf.Details))
	for k, v := range n.conf.Details {
		details[k] = tmpl(v)
	}

	var (
		msg    interface{}
		apiURL string

		apiMsg = opsGenieMessage{
			APIKey: string(n.conf.APIKey),
			Alias:  key,
		}
		alerts = types.Alerts(as...)
	)
	switch alerts.Status() {
	case model.AlertResolved:
		apiURL = n.conf.APIHost + "v1/json/alert/close"
		msg = &opsGenieCloseMessage{&apiMsg}
	default:
		apiURL = n.conf.APIHost + "v1/json/alert"
		msg = &opsGenieCreateMessage{
			opsGenieMessage: &apiMsg,
			Message:         tmpl(n.conf.Description),
			Details:         details,
			Source:          tmpl(n.conf.Source),
			Teams:           tmpl(n.conf.Teams),
			Tags:            tmpl(n.conf.Tags),
		}
	}
	if err != nil {
		return fmt.Errorf("templating error: %s", err)
	}

	var buf bytes.Buffer
	if err := json.NewEncoder(&buf).Encode(msg); err != nil {
		return err
	}

	resp, err := ctxhttp.Post(ctx, http.DefaultClient, apiURL, contentTypeJSON, &buf)
	if err != nil {
		return err
	}
	defer resp.Body.Close()

	if resp.StatusCode == 400 && alerts.Status() == model.AlertResolved {
		body, _ := ioutil.ReadAll(resp.Body)
		var responseMessage opsGenieErrorResponse
		if err := json.Unmarshal(body, &responseMessage); err != nil {
			return fmt.Errorf("could not parse error response %q", body)
		}
		const alreadyClosedError = 5
		if responseMessage.Code == alreadyClosedError {
			return nil
		}
		return fmt.Errorf("error when closing alert: code %d, error %q", responseMessage.Code, responseMessage.Error)
	} else if resp.StatusCode/100 != 2 {
		body, _ := ioutil.ReadAll(resp.Body)
		log.With("incident", key).Debugf("unexpected OpsGenie response from %s (POSTed %s), %s: %s", apiURL, msg, resp.Status, body)
		return fmt.Errorf("unexpected status code %v", resp.StatusCode)
	}
	return nil
}
func (s *BasicService) log() log.Logger {
	if s.host != nil {
		return s.host.log()
	}
	return log.With("logger_note", "no host")
}
func (ts *targetSet) runProviders(ctx context.Context, providers map[string]TargetProvider) {
	// Lock for the entire time. This may mean up to 5 seconds until the full initial set
	// is retrieved and applied.
	// We could release earlier with some tweaks, but this is easier to reason about.
	ts.mtx.Lock()
	defer ts.mtx.Unlock()

	var wg sync.WaitGroup

	if ts.cancelProviders != nil {
		ts.cancelProviders()
	}
	ctx, ts.cancelProviders = context.WithCancel(ctx)

	// (Re-)create a fresh tgroups map to not keep stale targets around. We
	// will retrieve all targets below anyway, so cleaning up everything is
	// safe and doesn't inflict any additional cost.
	ts.tgroups = map[string][]*Target{}

	for name, prov := range providers {
		wg.Add(1)

		updates := make(chan []*config.TargetGroup)

		go func(name string, prov TargetProvider) {
			select {
			case <-ctx.Done():
			case initial, ok := <-updates:
				// Handle the case that a target provider exits and closes the channel
				// before the context is done.
				if !ok {
					break
				}
				// First set of all targets the provider knows.
				for _, tgroup := range initial {
					if tgroup == nil {
						continue
					}
					targets, err := targetsFromGroup(tgroup, ts.config)
					if err != nil {
						log.With("target_group", tgroup).Errorf("Target update failed: %s", err)
						continue
					}
					ts.tgroups[name+"/"+tgroup.Source] = targets
				}
			case <-time.After(5 * time.Second):
				// Initial set didn't arrive. Act as if it was empty
				// and wait for updates later on.
			}

			wg.Done()

			// Start listening for further updates.
			for {
				select {
				case <-ctx.Done():
					return
				case tgs, ok := <-updates:
					// Handle the case that a target provider exits and closes the channel
					// before the context is done.
					if !ok {
						return
					}
					for _, tg := range tgs {
						if err := ts.update(name, tg); err != nil {
							log.With("target_group", tg).Errorf("Target update failed: %s", err)
						}
					}
				}
			}
		}(name, prov)

		go prov.Run(ctx, updates)
	}

	// We wait for a full initial set of target groups before releasing the mutex
	// to ensure the initial sync is complete and there are no races with subsequent updates.
	wg.Wait()

	// Just signal that there are initial sets to sync now. Actual syncing must only
	// happen in the runScraping loop.
	select {
	case ts.syncCh <- struct{}{}:
	default:
	}
}
func (h *Handler) federation(w http.ResponseWriter, req *http.Request) {
	h.mtx.RLock()
	defer h.mtx.RUnlock()

	req.ParseForm()

	var matcherSets []metric.LabelMatchers
	for _, s := range req.Form["match[]"] {
		matchers, err := promql.ParseMetricSelector(s)
		if err != nil {
			http.Error(w, err.Error(), http.StatusBadRequest)
			return
		}
		matcherSets = append(matcherSets, matchers)
	}

	var (
		minTimestamp = h.now().Add(-promql.StalenessDelta)
		format       = expfmt.Negotiate(req.Header)
		enc          = expfmt.NewEncoder(w, format)
	)
	w.Header().Set("Content-Type", string(format))

	q, err := h.storage.Querier()
	if err != nil {
		federationErrors.Inc()
		http.Error(w, err.Error(), http.StatusInternalServerError)
		return
	}
	defer q.Close()

	vector, err := q.LastSampleForLabelMatchers(h.context, minTimestamp, matcherSets...)
	if err != nil {
		federationErrors.Inc()
		http.Error(w, err.Error(), http.StatusInternalServerError)
		return
	}
	sort.Sort(byName(vector))

	var (
		lastMetricName model.LabelValue
		protMetricFam  *dto.MetricFamily
	)
	for _, s := range vector {
		nameSeen := false
		globalUsed := map[model.LabelName]struct{}{}
		protMetric := &dto.Metric{
			Untyped: &dto.Untyped{},
		}

		for ln, lv := range s.Metric {
			if lv == "" {
				// No value means unset. Never consider those labels.
				// This is also important to protect against nameless metrics.
				continue
			}
			if ln == model.MetricNameLabel {
				nameSeen = true
				if lv == lastMetricName {
					// We already have the name in the current MetricFamily,
					// and we ignore nameless metrics.
					continue
				}
				// Need to start a new MetricFamily. Ship off the old one (if any) before
				// creating the new one.
				if protMetricFam != nil {
					if err := enc.Encode(protMetricFam); err != nil {
						federationErrors.Inc()
						log.With("err", err).Error("federation failed")
						return
					}
				}
				protMetricFam = &dto.MetricFamily{
					Type: dto.MetricType_UNTYPED.Enum(),
					Name: proto.String(string(lv)),
				}
				lastMetricName = lv
				continue
			}
			protMetric.Label = append(protMetric.Label, &dto.LabelPair{
				Name:  proto.String(string(ln)),
				Value: proto.String(string(lv)),
			})
			if _, ok := h.externalLabels[ln]; ok {
				globalUsed[ln] = struct{}{}
			}
		}
		if !nameSeen {
			log.With("metric", s.Metric).Warn("Ignoring nameless metric during federation.")
			continue
		}
		// Attach global labels if they do not exist yet.
		for ln, lv := range h.externalLabels {
			if _, ok := globalUsed[ln]; !ok {
				protMetric.Label = append(protMetric.Label, &dto.LabelPair{
					Name:  proto.String(string(ln)),
					Value: proto.String(string(lv)),
				})
			}
		}

		protMetric.TimestampMs = proto.Int64(int64(s.Timestamp))
		protMetric.Untyped.Value = proto.Float64(float64(s.Value))

		protMetricFam.Metric = append(protMetricFam.Metric, protMetric)
	}
	// Still have to ship off the last MetricFamily, if any.
	if protMetricFam != nil {
		if err := enc.Encode(protMetricFam); err != nil {
			federationErrors.Inc()
			log.With("err", err).Error("federation failed")
		}
	}
}