Example #1
func (sl *scrapeLoop) append(samples model.Samples) {
	var (
		numOutOfOrder = 0
		numDuplicates = 0
	)

	for _, s := range samples {
		if err := sl.appender.Append(s); err != nil {
			switch err {
			case local.ErrOutOfOrderSample:
				numOutOfOrder++
				log.With("sample", s).With("error", err).Debug("Sample discarded")
			case local.ErrDuplicateSampleForTimestamp:
				numDuplicates++
				log.With("sample", s).With("error", err).Debug("Sample discarded")
			default:
				log.With("sample", s).With("error", err).Warn("Sample discarded")
			}
		}
	}
	if numOutOfOrder > 0 {
		log.With("numDropped", numOutOfOrder).Warn("Error on ingesting out-of-order samples")
	}
	if numDuplicates > 0 {
		log.With("numDropped", numDuplicates).Warn("Error on ingesting samples with different value but same timestamp")
	}
}
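The switch on sentinel error values is what keeps this loop quiet under load: expected storage errors are counted and logged per-sample only at Debug, then summarized once at Warn, while unexpected errors surface immediately. A minimal, self-contained sketch of the same counting pattern, using hypothetical sentinel errors in place of the local storage package's:

package main

import (
	"errors"
	"fmt"
)

// Hypothetical sentinels standing in for local.ErrOutOfOrderSample and
// local.ErrDuplicateSampleForTimestamp.
var (
	errOutOfOrder = errors.New("out of order sample")
	errDuplicate  = errors.New("duplicate sample for timestamp")
)

// ingest mirrors the append loop: count expected errors, warn once per
// class at the end, and surface unexpected errors immediately.
func ingest(results []error) {
	numOutOfOrder, numDuplicates := 0, 0
	for _, err := range results {
		switch err {
		case nil:
			// Sample accepted.
		case errOutOfOrder:
			numOutOfOrder++
		case errDuplicate:
			numDuplicates++
		default:
			fmt.Println("warn: sample discarded:", err)
		}
	}
	if numOutOfOrder > 0 {
		fmt.Println("warn: dropped out-of-order samples:", numOutOfOrder)
	}
	if numDuplicates > 0 {
		fmt.Println("warn: dropped duplicate-timestamp samples:", numDuplicates)
	}
}

func main() {
	ingest([]error{nil, errOutOfOrder, errDuplicate, errors.New("disk full")})
}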
Example #2
func (sl *scrapeLoop) report(start time.Time, duration time.Duration, err error) {
	sl.scraper.report(start, duration, err)

	ts := model.TimeFromUnixNano(start.UnixNano())

	var health model.SampleValue
	if err == nil {
		health = 1
	}

	healthSample := &model.Sample{
		Metric: model.Metric{
			model.MetricNameLabel: scrapeHealthMetricName,
		},
		Timestamp: ts,
		Value:     health,
	}
	durationSample := &model.Sample{
		Metric: model.Metric{
			model.MetricNameLabel: scrapeDurationMetricName,
		},
		Timestamp: ts,
		Value:     model.SampleValue(float64(duration) / float64(time.Second)),
	}

	if err := sl.reportAppender.Append(healthSample); err != nil {
		log.With("sample", healthSample).With("error", err).Warn("Scrape health sample discarded")
	}
	if err := sl.reportAppender.Append(durationSample); err != nil {
		log.With("sample", durationSample).With("error", err).Warn("Scrape duration sample discarded")
	}
}
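The report path synthesizes two samples per scrape: a 0/1 health value (1 only when the scrape returned no error) and the scrape duration converted to seconds. A small sketch of those conversions, assuming the conventional metric names up and scrape_duration_seconds and a stand-in struct rather than model.Sample:

package main

import (
	"errors"
	"fmt"
	"time"
)

// sample is a stand-in for model.Sample with only what the sketch needs.
type sample struct {
	name  string
	ts    int64 // milliseconds, approximating model.Time
	value float64
}

// report builds the two synthetic samples the scrape loop appends:
// a 0/1 health value and the scrape duration in seconds.
func report(start time.Time, duration time.Duration, err error) []sample {
	var health float64
	if err == nil {
		health = 1
	}
	ts := start.UnixNano() / int64(time.Millisecond)
	return []sample{
		{name: "up", ts: ts, value: health},
		{name: "scrape_duration_seconds", ts: ts, value: duration.Seconds()},
	}
}

func main() {
	fmt.Println(report(time.Now(), 250*time.Millisecond, nil))
	fmt.Println(report(time.Now(), 50*time.Millisecond, errors.New("connection refused")))
}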
Example #3
// logThrottling handles logging of throttled events and has to be started as a
// goroutine. It stops once s.loopStopping is closed.
//
// Logging strategy: Whenever Throttle() is called and returns true, a signal
// is sent to s.throttled. If that happens for the first time, an Error is
// logged that the storage is now throttled. As long as signals continue to be
// sent via s.throttled at least once per minute, nothing else is logged. Once
// no signal has arrived for a minute, an Info is logged that the storage is not
// throttled anymore. This resets things to the initial state, i.e. once a
// signal arrives again, the Error will be logged again.
func (s *memorySeriesStorage) logThrottling() {
	timer := time.NewTimer(time.Minute)
	timer.Stop()

	// Signal exit of the goroutine. Currently only needed by test code.
	defer close(s.logThrottlingStopped)

	for {
		select {
		case <-s.throttled:
			if !timer.Reset(time.Minute) {
				log.
					With("chunksToPersist", s.getNumChunksToPersist()).
					With("maxChunksToPersist", s.maxChunksToPersist).
					With("memoryChunks", atomic.LoadInt64(&numMemChunks)).
					With("maxToleratedMemChunks", int(float64(s.maxMemoryChunks)*toleranceFactorMemChunks)).
					Error("Storage needs throttling. Scrapes and rule evaluations will be skipped.")
			}
		case <-timer.C:
			log.
				With("chunksToPersist", s.getNumChunksToPersist()).
				With("maxChunksToPersist", s.maxChunksToPersist).
				With("memoryChunks", atomic.LoadInt64(&numMemChunks)).
				With("maxToleratedMemChunks", int(float64(s.maxMemoryChunks)*toleranceFactorMemChunks)).
				Info("Storage does not need throttling anymore.")
		case <-s.loopStopping:
			return
		}
	}
}
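The load-bearing detail above is that timer.Reset reports whether the timer was still running: false means the quiet period had already expired (or the timer was never started), so the signal counts as the first one and is logged loudly, while signals arriving inside the window stay silent until the timer finally fires and announces recovery. A condensed, runnable sketch of the same pattern with the storage specifics stripped out:

package main

import (
	"fmt"
	"time"
)

// logFirstAndRecovery logs loudly on the first event after a quiet period,
// stays silent while events keep arriving, and logs recovery once no event
// has been seen for the window. It mirrors the logThrottling pattern above.
func logFirstAndRecovery(events <-chan struct{}, stop <-chan struct{}, window time.Duration) {
	timer := time.NewTimer(window)
	timer.Stop() // start idle so the first event logs

	for {
		select {
		case <-events:
			// Reset reports whether the timer was still running. If it
			// wasn't, this is the first event after a quiet period.
			if !timer.Reset(window) {
				fmt.Println("ERROR: throttling engaged")
			}
		case <-timer.C:
			fmt.Println("INFO: throttling over")
		case <-stop:
			return
		}
	}
}

func main() {
	events := make(chan struct{})
	stop := make(chan struct{})
	go logFirstAndRecovery(events, stop, 100*time.Millisecond)

	events <- struct{}{}               // logs ERROR
	events <- struct{}{}               // silent: timer still running
	time.Sleep(200 * time.Millisecond) // quiet period elapses: logs INFO
	events <- struct{}{}               // logs ERROR again
	close(stop)
	time.Sleep(50 * time.Millisecond) // let the goroutine observe stop
}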
Example #4
// Notify implements the Notifier interface.
func (n *Pushover) Notify(ctx context.Context, as ...*types.Alert) error {
	key, ok := GroupKey(ctx)
	if !ok {
		return fmt.Errorf("group key missing")
	}
	data := n.tmpl.Data(receiver(ctx), groupLabels(ctx), as...)

	log.With("incident", key).Debugln("notifying Pushover")

	var err error
	tmpl := tmplText(n.tmpl, data, &err)

	parameters := url.Values{}
	parameters.Add("token", tmpl(string(n.conf.Token)))
	parameters.Add("user", tmpl(string(n.conf.UserKey)))
	title := tmpl(n.conf.Title)
	message := tmpl(n.conf.Message)
	parameters.Add("title", title)
	if len(title) > 512 {
		title = title[:512]
		log.With("incident", key).Debugf("Truncated title to %q due to Pushover message limit", title)
	}
	if len(title)+len(message) > 512 {
		message = message[:512-len(title)]
		log.With("incident", key).Debugf("Truncated message to %q due to Pushover message limit", message)
	}
	message = strings.TrimSpace(message)
	if message == "" {
		// Pushover rejects empty messages.
		message = "(no details)"
	}
	parameters.Add("message", message)
	parameters.Add("url", tmpl(n.conf.URL))
	parameters.Add("priority", tmpl(n.conf.Priority))
	parameters.Add("retry", fmt.Sprintf("%d", int64(time.Duration(n.conf.Retry).Seconds())))
	parameters.Add("expire", fmt.Sprintf("%d", int64(time.Duration(n.conf.Expire).Seconds())))

	apiURL := "https://api.pushover.net/1/messages.json"
	u, err := url.Parse(apiURL)
	if err != nil {
		return err
	}
	u.RawQuery = parameters.Encode()
	log.With("incident", key).Debugf("Pushover URL = %q", u.String())

	resp, err := ctxhttp.Post(ctx, http.DefaultClient, u.String(), "text/plain", nil)
	if err != nil {
		return err
	}
	defer resp.Body.Close()

	if resp.StatusCode/100 != 2 {
		body, err := ioutil.ReadAll(resp.Body)
		if err != nil {
			return err
		}
		return fmt.Errorf("unexpected status code %v (body: %s)", resp.StatusCode, string(body))
	}
	return nil
}
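The truncation logic gives title and message a shared character budget: the title is clamped first, then the message is cut to whatever budget remains, and an empty result is replaced with a placeholder because Pushover rejects empty messages. A sketch of the same clamp as a standalone helper; the limit is a parameter here (the notifier above uses 512), and note that byte slicing like this can split multi-byte UTF-8 characters:

package main

import (
	"fmt"
	"strings"
)

// clampTitleAndMessage enforces a shared character budget the way the
// Pushover notifier above does: truncate the title to the limit first,
// then cut the message to whatever budget remains.
func clampTitleAndMessage(title, message string, limit int) (string, string) {
	if len(title) > limit {
		title = title[:limit]
	}
	if len(title)+len(message) > limit {
		message = message[:limit-len(title)]
	}
	message = strings.TrimSpace(message)
	if message == "" {
		message = "(no details)" // empty messages are rejected upstream
	}
	return title, message
}

func main() {
	t, m := clampTitleAndMessage(strings.Repeat("T", 500), strings.Repeat("m", 100), 512)
	fmt.Println(len(t), len(m)) // 500 12
}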
Example #5
// calculatePersistenceUrgencyScore calculates and returns an urgency score for
// the speed of persisting chunks. The score is between 0 and 1, where 0 means
// no urgency at all and 1 means highest urgency.
//
// The score is the maximum of the two following sub-scores:
//
// (1) The first sub-score is the number of chunks waiting for persistence
// divided by the maximum number of chunks allowed to be waiting for
// persistence.
//
// (2) If there are more chunks in memory than allowed AND there are more chunks
// waiting for persistence than factorMinChunksToPersist times
// -storage.local.max-chunks-to-persist, then the second sub-score is the
// fraction the number of memory chunks has reached between
// -storage.local.memory-chunks and toleranceFactorForMemChunks times
// -storage.local.memory-chunks.
//
// Should the score ever hit persintenceUrgencyScoreForEnteringRushedMode, the
// storage locks into "rushed mode", in which the returned score is always
// bumped up to 1 until the non-bumped score is below
// persintenceUrgencyScoreForLeavingRushedMode.
//
// This method is not goroutine-safe, but it is only ever called by the single
// goroutine that is in charge of series maintenance. According to the returned
// score, series maintenance should be sped up. If a score of 1 is returned,
// checkpointing based on dirty-series count should be disabled, and series
// files should not be synced anymore, provided the user has specified the
// adaptive sync strategy.
func (s *memorySeriesStorage) calculatePersistenceUrgencyScore() float64 {
	s.rushedMtx.Lock()
	defer s.rushedMtx.Unlock()

	var (
		chunksToPersist    = float64(s.getNumChunksToPersist())
		maxChunksToPersist = float64(s.maxChunksToPersist)
		memChunks          = float64(atomic.LoadInt64(&numMemChunks))
		maxMemChunks       = float64(s.maxMemoryChunks)
	)
	score := chunksToPersist / maxChunksToPersist
	if chunksToPersist > maxChunksToPersist*factorMinChunksToPersist {
		score = math.Max(
			score,
			(memChunks/maxMemChunks-1)/(toleranceFactorMemChunks-1),
		)
	}
	if score > 1 {
		score = 1
	}
	s.persistenceUrgencyScore.Set(score)

	if s.rushed {
		// We are already in rushed mode. If the score is still above
		// persintenceUrgencyScoreForLeavingRushedMode, return 1 and
		// leave things as they are.
		if score > persintenceUrgencyScoreForLeavingRushedMode {
			return 1
		}
		// We are out of rushed mode!
		s.rushed = false
		s.rushedMode.Set(0)
		log.
			With("urgencyScore", score).
			With("chunksToPersist", int(chunksToPersist)).
			With("maxChunksToPersist", int(maxChunksToPersist)).
			With("memoryChunks", int(memChunks)).
			With("maxMemoryChunks", int(maxMemChunks)).
			Info("Storage has left rushed mode.")
		return score
	}
	if score > persintenceUrgencyScoreForEnteringRushedMode {
		// Enter rushed mode.
		s.rushed = true
		s.rushedMode.Set(1)
		log.
			With("urgencyScore", score).
			With("chunksToPersist", int(chunksToPersist)).
			With("maxChunksToPersist", int(maxChunksToPersist)).
			With("memoryChunks", int(memChunks)).
			With("maxMemoryChunks", int(maxMemChunks)).
			Warn("Storage has entered rushed mode.")
		return 1
	}
	return score
}
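The rushed-mode handling is a hysteresis band: the storage engages above one threshold and disengages only below a lower one, with the reported score pinned to 1 while engaged, so a score oscillating between the two thresholds cannot flap the mode. A pure-function sketch of just that band, with illustrative threshold values rather than the storage's actual constants:

package main

import "fmt"

// hysteresis keeps a boolean state that flips on only above hi and flips
// off only below lo, mirroring the rushed-mode band above. While engaged,
// the reported score is pinned to 1.
type hysteresis struct {
	lo, hi  float64
	engaged bool
}

func (h *hysteresis) apply(score float64) float64 {
	if score > 1 {
		score = 1
	}
	if h.engaged {
		if score > h.lo {
			return 1 // still rushed: pin the score
		}
		h.engaged = false
		return score
	}
	if score > h.hi {
		h.engaged = true
		return 1
	}
	return score
}

func main() {
	// Threshold values are illustrative only.
	h := &hysteresis{lo: 0.7, hi: 0.8}
	for _, s := range []float64{0.5, 0.85, 0.75, 0.75, 0.6, 0.75} {
		fmt.Printf("raw=%.2f -> reported=%.2f engaged=%v\n", s, h.apply(s), h.engaged)
	}
}

Note how 0.75 reports as 1 while the band is engaged but passes through unchanged once it has disengaged; that asymmetry is the whole point of the two thresholds.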
Example #6
// loadSeriesMapAndHeads loads the fingerprint to memory-series mapping and all
// the chunks contained in the checkpoint (and thus not yet persisted to series
// files). The method is capable of loading the checkpoint format v1 and v2. If
// recoverable corruption is detected, or if the dirty flag was set from the
// beginning, crash recovery is run, which might take a while. If an
// unrecoverable error is encountered, it is returned. Call this method during
// start-up while nothing else is running in storage land. This method is
// utterly goroutine-unsafe.
func (p *persistence) loadSeriesMapAndHeads() (sm *seriesMap, chunksToPersist int64, err error) {
	fingerprintToSeries := make(map[model.Fingerprint]*memorySeries)
	sm = &seriesMap{m: fingerprintToSeries}

	defer func() {
		if p.dirty {
			log.Warn("Persistence layer appears dirty.")
			p.startedDirty.Set(1)
			err = p.recoverFromCrash(fingerprintToSeries)
			if err != nil {
				sm = nil
			}
		} else {
			p.startedDirty.Set(0)
		}
	}()

	hs := newHeadsScanner(p.headsFileName())
	defer hs.close()
	for hs.scan() {
		fingerprintToSeries[hs.fp] = hs.series
	}
	if os.IsNotExist(hs.err) {
		return sm, 0, nil
	}
	if hs.err != nil {
		p.dirty = true
		log.
			With("file", p.headsFileName()).
			With("error", hs.err).
			Error("Error reading heads file.")
		return sm, 0, hs.err
	}
	return sm, hs.chunksToPersistTotal, nil
}
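Note how the deferred closure cooperates with the named return values: because sm and err are named, the defer can inspect the dirty flag after the scan finishes and rewrite both results, so crash recovery runs exactly once no matter which path flagged dirtiness, and a successful recovery clears the earlier error. A stripped-down sketch of that shape with a hypothetical loader type:

package main

import (
	"errors"
	"fmt"
)

type loader struct {
	dirty bool
}

// load demonstrates the named-return/defer shape used by
// loadSeriesMapAndHeads: whichever path marks the loader dirty, the
// deferred recovery runs once and may overwrite both return values.
func (l *loader) load(corrupt bool) (data map[string]int, err error) {
	data = map[string]int{}

	defer func() {
		if l.dirty {
			fmt.Println("warn: state dirty, running recovery")
			err = l.recoverFromCrash(data) // recovery result replaces any earlier error
			if err != nil {
				data = nil
			}
		}
	}()

	if corrupt {
		l.dirty = true
		return data, errors.New("corrupt heads file")
	}
	data["series"] = 42
	return data, nil
}

// recoverFromCrash is assumed to succeed in this sketch.
func (l *loader) recoverFromCrash(map[string]int) error { return nil }

func main() {
	l := &loader{}
	fmt.Println(l.load(false)) // map[series:42] <nil>
	fmt.Println(l.load(true))  // recovery runs, error cleared
}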
Example #7
// Notify implements the Notifier interface.
func (n *OpsGenie) Notify(ctx context.Context, as ...*types.Alert) error {
	key, ok := GroupKey(ctx)
	if !ok {
		return fmt.Errorf("group key missing")
	}
	data := n.tmpl.Data(receiver(ctx), groupLabels(ctx), as...)

	log.With("incident", key).Debugln("notifying OpsGenie")

	var err error
	tmpl := tmplText(n.tmpl, data, &err)

	details := make(map[string]string, len(n.conf.Details))
	for k, v := range n.conf.Details {
		details[k] = tmpl(v)
	}

	var (
		msg    interface{}
		apiURL string

		apiMsg = opsGenieMessage{
			APIKey: string(n.conf.APIKey),
			Alias:  key,
		}
		alerts = types.Alerts(as...)
	)
	switch alerts.Status() {
	case model.AlertResolved:
		apiURL = n.conf.APIHost + "v1/json/alert/close"
		msg = &opsGenieCloseMessage{&apiMsg}
	default:
		apiURL = n.conf.APIHost + "v1/json/alert"
		msg = &opsGenieCreateMessage{
			opsGenieMessage: &apiMsg,
			Message:         tmpl(n.conf.Description),
			Details:         details,
			Source:          tmpl(n.conf.Source),
		}
	}
	if err != nil {
		return fmt.Errorf("templating error: %s", err)
	}

	var buf bytes.Buffer
	if err := json.NewEncoder(&buf).Encode(msg); err != nil {
		return err
	}

	resp, err := ctxhttp.Post(ctx, http.DefaultClient, apiURL, contentTypeJSON, &buf)
	if err != nil {
		return err
	}
	resp.Body.Close()

	if resp.StatusCode/100 != 2 {
		return fmt.Errorf("unexpected status code %v", resp.StatusCode)
	}
	return nil
}
Example #8
// Sync extracts a deduplicated set of Alertmanager endpoints from a list
// of target group definitions.
func (s *alertmanagerSet) Sync(tgs []*config.TargetGroup) {
	all := []alertmanager{}

	for _, tg := range tgs {
		ams, err := alertmanagerFromGroup(tg, s.cfg)
		if err != nil {
			log.With("err", err).Error("generating discovered Alertmanagers failed")
			continue
		}
		all = append(all, ams...)
	}

	s.mtx.Lock()
	defer s.mtx.Unlock()
	// Set new Alertmanagers and deduplicate them along their unique URL.
	s.ams = []alertmanager{}
	seen := map[string]struct{}{}

	for _, am := range all {
		us := am.url()
		if _, ok := seen[us]; ok {
			continue
		}

		seen[us] = struct{}{}
		s.ams = append(s.ams, am)
	}
}
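The seen-map loop is the standard Go idiom for deduplicating by a derived key while preserving input order. A generic version for illustration (uses a Go 1.18+ type parameter):

package main

import "fmt"

// dedupBy keeps the first occurrence of each key, preserving input order,
// the same way Sync deduplicates Alertmanagers by URL.
func dedupBy[T any](items []T, key func(T) string) []T {
	seen := make(map[string]struct{}, len(items))
	out := items[:0:0] // fresh backing array, same element type
	for _, it := range items {
		k := key(it)
		if _, ok := seen[k]; ok {
			continue
		}
		seen[k] = struct{}{}
		out = append(out, it)
	}
	return out
}

func main() {
	urls := []string{"http://am1:9093", "http://am2:9093", "http://am1:9093"}
	fmt.Println(dedupBy(urls, func(u string) string { return u }))
}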
Example #9
func init() {
	runtime.ErrorHandlers = []func(error){
		func(err error) {
			log.With("component", "kube_client_runtime").Errorln(err)
		},
	}
}
Example #10
// NewDispatcher returns a new Dispatcher.
func NewDispatcher(ap provider.Alerts, r *Route, n notify.Notifier, mk types.Marker) *Dispatcher {
	disp := &Dispatcher{
		alerts:   ap,
		notifier: n,
		route:    r,
		marker:   mk,
		log:      log.With("component", "dispatcher"),
	}
	return disp
}
Example #11
// Notify implements the Notifier interface.
//
// http://developer.pagerduty.com/documentation/integration/events/trigger
func (n *PagerDuty) Notify(ctx context.Context, as ...*types.Alert) error {
	key, ok := GroupKey(ctx)
	if !ok {
		return fmt.Errorf("group key missing")
	}

	var err error
	var (
		alerts    = types.Alerts(as...)
		data      = n.tmpl.Data(receiver(ctx), groupLabels(ctx), as...)
		tmpl      = tmplText(n.tmpl, data, &err)
		eventType = pagerDutyEventTrigger
	)
	if alerts.Status() == model.AlertResolved {
		eventType = pagerDutyEventResolve
	}

	log.With("incident", key).With("eventType", eventType).Debugln("notifying PagerDuty")

	details := make(map[string]string, len(n.conf.Details))
	for k, v := range n.conf.Details {
		details[k] = tmpl(v)
	}

	msg := &pagerDutyMessage{
		ServiceKey:  tmpl(string(n.conf.ServiceKey)),
		EventType:   eventType,
		IncidentKey: key,
		Description: tmpl(n.conf.Description),
		Details:     details,
	}
	if eventType == pagerDutyEventTrigger {
		msg.Client = tmpl(n.conf.Client)
		msg.ClientURL = tmpl(n.conf.ClientURL)
	}
	if err != nil {
		return err
	}

	var buf bytes.Buffer
	if err := json.NewEncoder(&buf).Encode(msg); err != nil {
		return err
	}

	resp, err := ctxhttp.Post(ctx, http.DefaultClient, n.conf.URL, contentTypeJSON, &buf)
	if err != nil {
		return err
	}
	resp.Body.Close()

	if resp.StatusCode/100 != 2 {
		return fmt.Errorf("unexpected status code %v", resp.StatusCode)
	}
	return nil
}
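The tmplText helper used throughout these notifiers is worth spelling out: it returns a closure that expands templates against the data and records the first failure through a shared error pointer, so the caller can template many fields in a row and check err once at the end. A sketch of such a helper built on text/template (the real one renders against the notifier's template set, which is not reproduced here):

package main

import (
	"bytes"
	"fmt"
	"text/template"
)

// tmplFunc returns a closure that expands templates against data and
// records the first failure in *err, so callers can template many fields
// and check the error once at the end.
func tmplFunc(data interface{}, err *error) func(string) string {
	return func(text string) string {
		if *err != nil {
			return "" // a previous expansion already failed
		}
		t, e := template.New("").Parse(text)
		if e != nil {
			*err = e
			return ""
		}
		var buf bytes.Buffer
		if e := t.Execute(&buf, data); e != nil {
			*err = e
			return ""
		}
		return buf.String()
	}
}

func main() {
	data := map[string]string{"Service": "db", "Env": "prod"}

	var err error
	tmpl := tmplFunc(data, &err)

	desc := tmpl("{{.Service}} down in {{.Env}}")
	client := tmpl("{{.Bad") // malformed: records the error
	if err != nil {
		fmt.Println("templating error:", err)
	}
	fmt.Printf("desc=%q client=%q\n", desc, client)
}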
Example #12
// sendAll sends the alerts to all configured Alertmanagers concurrently.
// It returns the number of sends that have failed.
func (n *Notifier) sendAll(alerts ...*model.Alert) int {
	begin := time.Now()

	// Attach external labels before sending alerts.
	for _, a := range alerts {
		for ln, lv := range n.opts.ExternalLabels {
			if _, ok := a.Labels[ln]; !ok {
				a.Labels[ln] = lv
			}
		}
	}

	b, err := json.Marshal(alerts)
	if err != nil {
		log.Errorf("Encoding alerts failed: %s", err)
		return len(n.opts.AlertmanagerURLs)
	}
	ctx, cancel := context.WithTimeout(context.Background(), n.opts.Timeout)
	defer cancel()

	send := func(u string) error {
		resp, err := ctxhttp.Post(ctx, http.DefaultClient, postURL(u), contentTypeJSON, bytes.NewReader(b))
		if err != nil {
			return err
		}
		defer resp.Body.Close()

		if resp.StatusCode/100 != 2 {
			return fmt.Errorf("bad response status %v", resp.Status)
		}
		return err
	}

	var (
		wg        sync.WaitGroup
		numErrors uint64
	)
	for _, u := range n.opts.AlertmanagerURLs {
		wg.Add(1)

		go func(u string) {
			if err := send(u); err != nil {
				log.With("alertmanager", u).With("count", fmt.Sprintf("%d", len(alerts))).Errorf("Error sending alerts: %s", err)
				n.errors.WithLabelValues(u).Inc()
				atomic.AddUint64(&numErrors, 1)
			}
			n.latency.WithLabelValues(u).Observe(float64(time.Since(begin)) / float64(time.Second))
			n.sent.WithLabelValues(u).Add(float64(len(alerts)))

			wg.Done()
		}(u)
	}
	wg.Wait()

	return int(numErrors)
}
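The fan-out shape here is one goroutine per endpoint, a WaitGroup to join them, and an atomic counter because the failure count is incremented concurrently. Passing the URL as a parameter to the goroutine also sidesteps the classic loop-variable capture bug in Go versions before 1.22. A stripped-down sketch:

package main

import (
	"errors"
	"fmt"
	"sync"
	"sync/atomic"
)

// fanOut calls send for every URL concurrently and returns how many failed,
// the same join-and-count shape as sendAll.
func fanOut(urls []string, send func(string) error) int {
	var (
		wg        sync.WaitGroup
		numErrors uint64
	)
	for _, u := range urls {
		wg.Add(1)
		go func(u string) {
			defer wg.Done()
			if err := send(u); err != nil {
				atomic.AddUint64(&numErrors, 1)
			}
		}(u)
	}
	wg.Wait()
	return int(numErrors)
}

func main() {
	failed := fanOut([]string{"a", "b", "c"}, func(u string) error {
		if u == "b" {
			return errors.New("unreachable")
		}
		return nil
	})
	fmt.Println("failed:", failed) // 1
}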
Example #13
// setDirty flags the storage as dirty in a goroutine-safe way. The provided
// error will be logged as a reason the first time the storage is flagged as dirty.
func (p *persistence) setDirty(err error) {
	p.dirtyCounter.Inc()
	p.dirtyMtx.Lock()
	defer p.dirtyMtx.Unlock()
	if p.becameDirty {
		return
	}
	p.dirty = true
	p.becameDirty = true
	log.With("error", err).Error("The storage is now inconsistent. Restart Prometheus ASAP to initiate recovery.")
}
Example #14
func serveStaticAsset(w http.ResponseWriter, req *http.Request) {
	fp := route.Param(route.Context(req), "filepath")
	fp = filepath.Join("web/ui/static", fp)

	info, err := ui.AssetInfo(fp)
	if err != nil {
		log.With("file", fp).Warn("Could not get file info: ", err)
		w.WriteHeader(http.StatusNotFound)
		return
	}
	file, err := ui.Asset(fp)
	if err != nil {
		if err != io.EOF {
			log.With("file", fp).Warn("Could not get file: ", err)
		}
		w.WriteHeader(http.StatusNotFound)
		return
	}

	http.ServeContent(w, req, info.Name(), info.ModTime(), bytes.NewReader(file))
}
Example #15
func lookupAll(name string, qtype uint16) (*dns.Msg, error) {
	conf, err := dns.ClientConfigFromFile(resolvConf)
	if err != nil {
		return nil, fmt.Errorf("could not load resolv.conf: %s", err)
	}

	client := &dns.Client{}
	response := &dns.Msg{}

	for _, server := range conf.Servers {
		servAddr := net.JoinHostPort(server, conf.Port)
		for _, suffix := range conf.Search {
			response, err = lookup(name, qtype, client, servAddr, suffix, false)
			if err != nil {
				log.
					With("server", server).
					With("name", name).
					With("suffix", suffix).
					With("reason", err).
					Warn("DNS resolution failed.")
				continue
			}
			if len(response.Answer) > 0 {
				return response, nil
			}
		}
		response, err = lookup(name, qtype, client, servAddr, "", false)
		if err == nil {
			return response, nil
		}
		log.
			With("server", server).
			With("name", name).
			With("reason", err).
			Warn("DNS resolution failed.")
	}
	return response, fmt.Errorf("could not resolve %s: no server responded", name)
}
Example #16
// quarantineSeries registers the provided fingerprint for quarantining. It
// always returns immediately. Quarantine requests are processed
// asynchronously. If there are too many requests queued, they are simply
// dropped.
//
// Quarantining means that the series file is moved to the orphaned directory,
// and all its traces are removed from indices. Call this method if an
// unrecoverable error is detected while dealing with a series, and pass in the
// encountered error. It will be saved as a hint in the orphaned directory.
func (s *memorySeriesStorage) quarantineSeries(fp model.Fingerprint, metric model.Metric, err error) {
	req := quarantineRequest{fp: fp, metric: metric, reason: err}
	select {
	case s.quarantineRequests <- req:
		// Request submitted.
	default:
		log.
			With("fingerprint", fp).
			With("metric", metric).
			With("reason", err).
			Warn("Quarantine queue full. Dropped quarantine request.")
		s.seriesOps.WithLabelValues(droppedQuarantine).Inc()
	}
}
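The select with a default branch is what makes this call non-blocking: if the buffered channel is full, the request is dropped and accounted for instead of stalling the caller. The pattern in miniature:

package main

import "fmt"

// trySubmit enqueues a request without ever blocking: if the queue is full
// the item is dropped, mirroring the quarantine-request pattern above.
func trySubmit(queue chan int, req int) bool {
	select {
	case queue <- req:
		return true
	default:
		return false // queue full: drop and let the caller count it
	}
}

func main() {
	queue := make(chan int, 2)
	dropped := 0
	for i := 0; i < 4; i++ {
		if !trySubmit(queue, i) {
			dropped++
		}
	}
	fmt.Println("queued:", len(queue), "dropped:", dropped) // queued: 2 dropped: 2
}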
Example #17
// newAggrGroup returns a new aggregation group.
func newAggrGroup(ctx context.Context, labels model.LabelSet, opts *RouteOpts) *aggrGroup {
	ag := &aggrGroup{
		labels: labels,
		opts:   opts,
		alerts: map[model.Fingerprint]*types.Alert{},
	}
	ag.ctx, ag.cancel = context.WithCancel(ctx)

	ag.log = log.With("aggrGroup", ag)

	// Set an initial one-time wait before flushing
	// the first batch of notifications.
	ag.next = time.NewTimer(ag.opts.GroupWait)

	return ag
}
Example #18
func (sl *scrapeLoop) append(samples model.Samples) {
	numOutOfOrder := 0

	for _, s := range samples {
		if err := sl.appender.Append(s); err != nil {
			if err == local.ErrOutOfOrderSample {
				numOutOfOrder++
			} else {
				log.Warnf("Error inserting sample: %s", err)
			}
		}
	}
	if numOutOfOrder > 0 {
		log.With("numDropped", numOutOfOrder).Warn("Error on ingesting out-of-order samples")
	}
}
Example #19
// sendAll sends the alerts to all configured Alertmanagers concurrently.
// It returns true if the alerts could be sent successfully to at least one Alertmanager.
func (n *Notifier) sendAll(alerts ...*model.Alert) bool {
	begin := time.Now()

	b, err := json.Marshal(alerts)
	if err != nil {
		log.Errorf("Encoding alerts failed: %s", err)
		return false
	}

	n.mtx.RLock()
	amSets := n.alertmanagers
	n.mtx.RUnlock()

	var (
		wg         sync.WaitGroup
		numSuccess uint64
	)
	for _, ams := range amSets {
		ams.mtx.RLock()

		for _, am := range ams.ams {
			wg.Add(1)

			ctx, cancel := context.WithTimeout(n.ctx, ams.cfg.Timeout)
			defer cancel()

			go func(am alertmanager) {
				u := am.url()

				if err := n.sendOne(ctx, ams.client, u, b); err != nil {
					log.With("alertmanager", u).With("count", len(alerts)).Errorf("Error sending alerts: %s", err)
					n.errors.WithLabelValues(u).Inc()
				} else {
					atomic.AddUint64(&numSuccess, 1)
				}
				n.latency.WithLabelValues(u).Observe(time.Since(begin).Seconds())
				n.sent.WithLabelValues(u).Add(float64(len(alerts)))

				wg.Done()
			}(am)
		}
		ams.mtx.RUnlock()
	}
	wg.Wait()

	return numSuccess > 0
}
Example #20
// NewDispatcher returns a new Dispatcher.
func NewDispatcher(
	ap provider.Alerts,
	r *Route,
	s notify.Stage,
	mk types.Marker,
	to func(time.Duration) time.Duration,
) *Dispatcher {
	disp := &Dispatcher{
		alerts:  ap,
		stage:   s,
		route:   r,
		marker:  mk,
		timeout: to,
		log:     log.With("component", "dispatcher"),
	}
	return disp
}
Example #21
func (s *memorySeriesStorage) handleQuarantine() {
	for {
		select {
		case req := <-s.quarantineRequests:
			s.purgeSeries(req.fp, req.metric, req.reason)
			log.
				With("fingerprint", req.fp).
				With("metric", req.metric).
				With("reason", req.reason).
				Warn("Series quarantined.")
		case <-s.quarantineStopping:
			log.Info("Series quarantining stopped.")
			close(s.quarantineStopped)
			return
		}
	}
}
Example #22
func serveAsset(w http.ResponseWriter, req *http.Request, fp string) {
	info, err := ui.AssetInfo(fp)
	if err != nil {
		log.With("file", fp).Warn("Could not get file info: ", err)
		w.WriteHeader(http.StatusNotFound)
		return
	}
	file, err := ui.Asset(fp)
	if err != nil {
		if err != io.EOF {
			log.With("file", fp).Warn("Could not get file: ", err)
		}
		w.WriteHeader(http.StatusNotFound)
		return
	}

	http.ServeContent(w, req, info.Name(), info.ModTime(), bytes.NewReader(file))
}
Example #23
// Sync converts target groups into actual scrape targets and synchronizes
// the currently running scraper with the resulting set.
func (sp *scrapePool) Sync(tgs []*config.TargetGroup) {
	start := time.Now()

	var all []*Target
	for _, tg := range tgs {
		targets, err := targetsFromGroup(tg, sp.config)
		if err != nil {
			log.With("err", err).Error("creating targets failed")
			continue
		}
		all = append(all, targets...)
	}
	sp.sync(all)

	targetSyncIntervalLength.WithLabelValues(sp.config.JobName).Observe(
		time.Since(start).Seconds(),
	)
	targetScrapePoolSyncsCounter.WithLabelValues(sp.config.JobName).Inc()
}
Example #24
// purgeSeries removes all traces of a series. If a non-nil quarantine reason is
// provided, the series file will not be deleted completely, but moved to the
// orphaned directory with the reason and the metric in a hint file. The
// provided metric might be nil if unknown.
func (s *memorySeriesStorage) purgeSeries(fp model.Fingerprint, m model.Metric, quarantineReason error) {
	s.fpLocker.Lock(fp)

	var (
		series *memorySeries
		ok     bool
	)

	if series, ok = s.fpToSeries.get(fp); ok {
		s.fpToSeries.del(fp)
		s.numSeries.Dec()
		m = series.metric

		// Adjust s.numChunksToPersist and numMemChunks down by
		// the number of chunks in this series that are not
		// persisted yet. Persisted chunks will be deducted from
		// numMemChunks upon eviction.
		numChunksNotYetPersisted := len(series.chunkDescs) - series.persistWatermark
		atomic.AddInt64(&numMemChunks, int64(-numChunksNotYetPersisted))
		if !series.headChunkClosed {
			// Head chunk wasn't counted as waiting for persistence yet.
			// (But it was counted as a chunk in memory.)
			numChunksNotYetPersisted--
		}
		s.incNumChunksToPersist(-numChunksNotYetPersisted)

	} else {
		s.persistence.purgeArchivedMetric(fp) // Ignoring error. There is nothing we can do.
	}
	if m != nil {
		// If we know a metric now, unindex it in any case.
		// purgeArchivedMetric might have done so already, but we cannot
		// be sure. Unindexing is idempotent, though.
		s.persistence.unindexMetric(fp, m)
	}
	// Attempt to delete/quarantine the series file in any case.
	if quarantineReason == nil {
		// No reason stated, simply delete the file.
		if _, err := s.persistence.deleteSeriesFile(fp); err != nil {
			log.
				With("fingerprint", fp).
				With("metric", m).
				With("error", err).
				Error("Error deleting series file.")
		}
		s.seriesOps.WithLabelValues(requestedPurge).Inc()
	} else {
		if err := s.persistence.quarantineSeriesFile(fp, quarantineReason, m); err == nil {
			s.seriesOps.WithLabelValues(completedQurantine).Inc()
		} else {
			s.seriesOps.WithLabelValues(failedQuarantine).Inc()
			log.
				With("fingerprint", fp).
				With("metric", m).
				With("reason", quarantineReason).
				With("error", err).
				Error("Error quarantining series file.")
		}
	}

	s.fpLocker.Unlock(fp)
}
Example #25
func main() {
	flag.Parse()

	if *showVersion {
		fmt.Fprintln(os.Stdout, version.Print("alertmanager"))
		os.Exit(0)
	}

	log.Infoln("Starting alertmanager", version.Info())
	log.Infoln("Build context", version.BuildContext())

	err := os.MkdirAll(*dataDir, 0777)
	if err != nil {
		log.Fatal(err)
	}

	marker := types.NewMarker()

	alerts, err := boltmem.NewAlerts(*dataDir)
	if err != nil {
		log.Fatal(err)
	}
	defer alerts.Close()

	notifies, err := boltmem.NewNotificationInfo(*dataDir)
	if err != nil {
		log.Fatal(err)
	}
	defer notifies.Close()

	silences, err := boltmem.NewSilences(*dataDir, marker)
	if err != nil {
		log.Fatal(err)
	}
	defer silences.Close()

	var (
		inhibitor *Inhibitor
		tmpl      *template.Template
		disp      *Dispatcher
	)
	defer disp.Stop()

	api := NewAPI(alerts, silences, func() AlertOverview {
		return disp.Groups()
	})

	build := func(rcvs []*config.Receiver) notify.Notifier {
		var (
			router  = notify.Router{}
			fanouts = notify.Build(rcvs, tmpl)
		)
		for name, fo := range fanouts {
			for i, n := range fo {
				n = notify.Retry(n)
				n = notify.Log(n, log.With("step", "retry"))
				n = notify.Dedup(notifies, n)
				n = notify.Log(n, log.With("step", "dedup"))

				fo[i] = n
			}
			router[name] = fo
		}
		n := notify.Notifier(router)

		n = notify.Log(n, log.With("step", "route"))
		n = notify.Silence(silences, n, marker)
		n = notify.Log(n, log.With("step", "silence"))
		n = notify.Inhibit(inhibitor, n, marker)
		n = notify.Log(n, log.With("step", "inhibit"))

		return n
	}

	amURL, err := extURL(*externalURL)
	if err != nil {
		log.Fatal(err)
	}

	reload := func() (err error) {
		log.With("file", *configFile).Infof("Loading configuration file")
		defer func() {
			if err != nil {
				log.With("file", *configFile).Errorf("Loading configuration file failed: %s", err)
				configSuccess.Set(0)
			} else {
				configSuccess.Set(1)
				configSuccessTime.Set(float64(time.Now().Unix()))
			}
		}()

		conf, err := config.LoadFile(*configFile)
		if err != nil {
			return err
		}

		api.Update(conf.String(), time.Duration(conf.Global.ResolveTimeout))

		tmpl, err = template.FromGlobs(conf.Templates...)
		if err != nil {
			return err
		}
		tmpl.ExternalURL = amURL

		inhibitor.Stop()
		disp.Stop()

		inhibitor = NewInhibitor(alerts, conf.InhibitRules, marker)
		disp = NewDispatcher(alerts, NewRoute(conf.Route, nil), build(conf.Receivers), marker)

		go disp.Run()
		go inhibitor.Run()

		return nil
	}

	if err := reload(); err != nil {
		os.Exit(1)
	}

	router := route.New()

	webReload := make(chan struct{})
	RegisterWeb(router.WithPrefix(amURL.Path), webReload)
	api.Register(router.WithPrefix(path.Join(amURL.Path, "/api")))

	log.Infoln("Listening on", *listenAddress)
	go listen(router)

	var (
		hup      = make(chan os.Signal, 1) // signal.Notify requires a buffered channel
		hupReady = make(chan bool)
		term     = make(chan os.Signal, 1)
	)
	signal.Notify(hup, syscall.SIGHUP)
	signal.Notify(term, os.Interrupt, syscall.SIGTERM)

	go func() {
		<-hupReady
		for {
			select {
			case <-hup:
			case <-webReload:
			}
			reload()
		}
	}()

	// Wait for reload or termination signals.
	close(hupReady) // Unblock SIGHUP handler.

	<-term

	log.Infoln("Received SIGTERM, exiting gracefully...")
}
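The reload wiring at the bottom of main is a reusable shape: one goroutine multiplexes SIGHUP and a web-triggered channel into the same reload call, gated on hupReady so nothing fires before initial setup completes. A condensed, runnable sketch (the program waits for Ctrl-C at the end):

package main

import (
	"fmt"
	"os"
	"os/signal"
	"syscall"
)

func main() {
	reload := func() { fmt.Println("reloading configuration") }

	var (
		hup       = make(chan os.Signal, 1) // signal.Notify needs a buffered channel
		webReload = make(chan struct{})
		hupReady  = make(chan bool)
		term      = make(chan os.Signal, 1)
	)
	signal.Notify(hup, syscall.SIGHUP)
	signal.Notify(term, os.Interrupt, syscall.SIGTERM)

	go func() {
		<-hupReady // block until initial setup is done
		for {
			select {
			case <-hup:
			case <-webReload:
			}
			reload()
		}
	}()

	close(hupReady) // initial setup complete; reloads may fire now

	webReload <- struct{}{} // e.g. triggered from a /-/reload handler
	<-term                  // blocks until SIGINT/SIGTERM
	fmt.Println("exiting gracefully")
}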
Example #26
// eval runs a single evaluation cycle in which all rules are evaluated in parallel.
// In the future a single group will be evaluated sequentially to properly handle
// rule dependency.
func (g *Group) eval() {
	var (
		now = model.Now()
		wg  sync.WaitGroup
	)

	for _, rule := range g.rules {
		rtyp := string(typeForRule(rule))

		wg.Add(1)
		// BUG(julius): Look at fixing thundering herd.
		go func(rule Rule) {
			defer wg.Done()

			defer func(t time.Time) {
				evalDuration.WithLabelValues(rtyp).Observe(time.Since(t).Seconds())
			}(time.Now())

			evalTotal.WithLabelValues(rtyp).Inc()

			vector, err := rule.eval(g.opts.Context, now, g.opts.QueryEngine, g.opts.ExternalURL.Path)
			if err != nil {
				// Canceled queries are intentional termination of queries. This normally
				// happens on shutdown and thus we skip logging of any errors here.
				if _, ok := err.(promql.ErrQueryCanceled); !ok {
					log.Warnf("Error while evaluating rule %q: %s", rule, err)
				}
				evalFailures.WithLabelValues(rtyp).Inc()
				return
			}

			if ar, ok := rule.(*AlertingRule); ok {
				g.sendAlerts(ar, now)
			}
			var (
				numOutOfOrder = 0
				numDuplicates = 0
			)
			for _, s := range vector {
				if err := g.opts.SampleAppender.Append(s); err != nil {
					switch err {
					case local.ErrOutOfOrderSample:
						numOutOfOrder++
						log.With("sample", s).With("error", err).Debug("Rule evaluation result discarded")
					case local.ErrDuplicateSampleForTimestamp:
						numDuplicates++
						log.With("sample", s).With("error", err).Debug("Rule evaluation result discarded")
					default:
						log.With("sample", s).With("error", err).Warn("Rule evaluation result discarded")
					}
				}
			}
			if numOutOfOrder > 0 {
				log.With("numDropped", numOutOfOrder).Warn("Error on ingesting out-of-order result from rule evaluation")
			}
			if numDuplicates > 0 {
				log.With("numDropped", numDuplicates).Warn("Error on ingesting results from rule evaluation with different value but same timestamp")
			}
		}(rule)
	}
	wg.Wait()
}
Example #27
// Notify implements the Notifier interface.
func (n *OpsGenie) Notify(ctx context.Context, as ...*types.Alert) error {
	key, ok := GroupKey(ctx)
	if !ok {
		return fmt.Errorf("group key missing")
	}
	data := n.tmpl.Data(receiver(ctx), groupLabels(ctx), as...)

	log.With("incident", key).Debugln("notifying OpsGenie")

	var err error
	tmpl := tmplText(n.tmpl, data, &err)

	details := make(map[string]string, len(n.conf.Details))
	for k, v := range n.conf.Details {
		details[k] = tmpl(v)
	}

	var (
		msg    interface{}
		apiURL string

		apiMsg = opsGenieMessage{
			APIKey: string(n.conf.APIKey),
			Alias:  key,
		}
		alerts = types.Alerts(as...)
	)
	switch alerts.Status() {
	case model.AlertResolved:
		apiURL = n.conf.APIHost + "v1/json/alert/close"
		msg = &opsGenieCloseMessage{&apiMsg}
	default:
		apiURL = n.conf.APIHost + "v1/json/alert"
		msg = &opsGenieCreateMessage{
			opsGenieMessage: &apiMsg,
			Message:         tmpl(n.conf.Description),
			Details:         details,
			Source:          tmpl(n.conf.Source),
			Teams:           tmpl(n.conf.Teams),
			Tags:            tmpl(n.conf.Tags),
		}
	}
	if err != nil {
		return fmt.Errorf("templating error: %s", err)
	}

	var buf bytes.Buffer
	if err := json.NewEncoder(&buf).Encode(msg); err != nil {
		return err
	}

	resp, err := ctxhttp.Post(ctx, http.DefaultClient, apiURL, contentTypeJSON, &buf)
	if err != nil {
		return err
	}
	defer resp.Body.Close()

	if resp.StatusCode == 400 && alerts.Status() == model.AlertResolved {
		body, _ := ioutil.ReadAll(resp.Body)

		var responseMessage opsGenieErrorResponse
		if err := json.Unmarshal(body, &responseMessage); err != nil {
			return fmt.Errorf("could not parse error response %q", body)
		}
		const alreadyClosedError = 5
		if responseMessage.Code == alreadyClosedError {
			return nil
		}
		return fmt.Errorf("error when closing alert: code %d, error %q",
			responseMessage.Code, responseMessage.Error)
	} else if resp.StatusCode/100 != 2 {
		body, _ := ioutil.ReadAll(resp.Body)
		log.With("incident", key).Debugf("unexpected OpsGenie response from %s (POSTed %s), %s: %s",
			apiURL, msg, resp.Status, body)
		return fmt.Errorf("unexpected status code %v", resp.StatusCode)
	}
	return nil
}
Example #28
func (s *BasicService) log() log.Logger {
	if s.host != nil {
		return s.host.log()
	}
	return log.With("logger_note", "no host")
}
Example #29
func (ts *targetSet) runProviders(ctx context.Context, providers map[string]TargetProvider) {
	// Lock for the entire time. This may mean up to 5 seconds until the full initial set
	// is retrieved and applied.
	// We could release earlier with some tweaks, but this is easier to reason about.
	ts.mtx.Lock()
	defer ts.mtx.Unlock()

	var wg sync.WaitGroup

	if ts.cancelProviders != nil {
		ts.cancelProviders()
	}
	ctx, ts.cancelProviders = context.WithCancel(ctx)

	// (Re-)create a fresh tgroups map to not keep stale targets around. We
	// will retrieve all targets below anyway, so cleaning up everything is
	// safe and doesn't inflict any additional cost.
	ts.tgroups = map[string][]*Target{}

	for name, prov := range providers {
		wg.Add(1)

		updates := make(chan []*config.TargetGroup)

		go func(name string, prov TargetProvider) {
			select {
			case <-ctx.Done():
			case initial, ok := <-updates:
				// Handle the case that a target provider exits and closes the channel
				// before the context is done.
				if !ok {
					break
				}
				// First set of all targets the provider knows.
				for _, tgroup := range initial {
					if tgroup == nil {
						continue
					}
					targets, err := targetsFromGroup(tgroup, ts.config)
					if err != nil {
						log.With("target_group", tgroup).Errorf("Target update failed: %s", err)
						continue
					}
					ts.tgroups[name+"/"+tgroup.Source] = targets
				}
			case <-time.After(5 * time.Second):
				// Initial set didn't arrive. Act as if it was empty
				// and wait for updates later on.
			}

			wg.Done()

			// Start listening for further updates.
			for {
				select {
				case <-ctx.Done():
					return
				case tgs, ok := <-updates:
					// Handle the case that a target provider exits and closes the channel
					// before the context is done.
					if !ok {
						return
					}
					for _, tg := range tgs {
						if err := ts.update(name, tg); err != nil {
							log.With("target_group", tg).Errorf("Target update failed: %s", err)
						}
					}
				}
			}
		}(name, prov)

		go prov.Run(ctx, updates)
	}

	// We wait for a full initial set of target groups before releasing the mutex
	// to ensure the initial sync is complete and there are no races with subsequent updates.
	wg.Wait()
	// Just signal that there are initial sets to sync now. Actual syncing must only
	// happen in the runScraping loop.
	select {
	case ts.syncCh <- struct{}{}:
	default:
	}
}
Example #30
func (h *Handler) federation(w http.ResponseWriter, req *http.Request) {
	h.mtx.RLock()
	defer h.mtx.RUnlock()

	req.ParseForm()

	var matcherSets []metric.LabelMatchers
	for _, s := range req.Form["match[]"] {
		matchers, err := promql.ParseMetricSelector(s)
		if err != nil {
			http.Error(w, err.Error(), http.StatusBadRequest)
			return
		}
		matcherSets = append(matcherSets, matchers)
	}

	var (
		minTimestamp = h.now().Add(-promql.StalenessDelta)
		format       = expfmt.Negotiate(req.Header)
		enc          = expfmt.NewEncoder(w, format)
	)
	w.Header().Set("Content-Type", string(format))

	q, err := h.storage.Querier()
	if err != nil {
		federationErrors.Inc()
		http.Error(w, err.Error(), http.StatusInternalServerError)
		return
	}
	defer q.Close()

	vector, err := q.LastSampleForLabelMatchers(h.context, minTimestamp, matcherSets...)
	if err != nil {
		federationErrors.Inc()
		http.Error(w, err.Error(), http.StatusInternalServerError)
		return
	}
	sort.Sort(byName(vector))

	var (
		lastMetricName model.LabelValue
		protMetricFam  *dto.MetricFamily
	)
	for _, s := range vector {
		nameSeen := false
		globalUsed := map[model.LabelName]struct{}{}
		protMetric := &dto.Metric{
			Untyped: &dto.Untyped{},
		}

		for ln, lv := range s.Metric {
			if lv == "" {
				// No value means unset. Never consider those labels.
				// This is also important to protect against nameless metrics.
				continue
			}
			if ln == model.MetricNameLabel {
				nameSeen = true
				if lv == lastMetricName {
					// We already have the name in the current MetricFamily,
					// and we ignore nameless metrics.
					continue
				}
				// Need to start a new MetricFamily. Ship off the old one (if any) before
				// creating the new one.
				if protMetricFam != nil {
					if err := enc.Encode(protMetricFam); err != nil {
						federationErrors.Inc()
						log.With("err", err).Error("federation failed")
						return
					}
				}
				protMetricFam = &dto.MetricFamily{
					Type: dto.MetricType_UNTYPED.Enum(),
					Name: proto.String(string(lv)),
				}
				lastMetricName = lv
				continue
			}
			protMetric.Label = append(protMetric.Label, &dto.LabelPair{
				Name:  proto.String(string(ln)),
				Value: proto.String(string(lv)),
			})
			if _, ok := h.externalLabels[ln]; ok {
				globalUsed[ln] = struct{}{}
			}
		}
		if !nameSeen {
			log.With("metric", s.Metric).Warn("Ignoring nameless metric during federation.")
			continue
		}
		// Attach global labels if they do not exist yet.
		for ln, lv := range h.externalLabels {
			if _, ok := globalUsed[ln]; !ok {
				protMetric.Label = append(protMetric.Label, &dto.LabelPair{
					Name:  proto.String(string(ln)),
					Value: proto.String(string(lv)),
				})
			}
		}

		protMetric.TimestampMs = proto.Int64(int64(s.Timestamp))
		protMetric.Untyped.Value = proto.Float64(float64(s.Value))

		protMetricFam.Metric = append(protMetricFam.Metric, protMetric)
	}
	// Still have to ship off the last MetricFamily, if any.
	if protMetricFam != nil {
		if err := enc.Encode(protMetricFam); err != nil {
			federationErrors.Inc()
			log.With("err", err).Error("federation failed")
		}
	}
}
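The encoder loop depends on the vector being sorted by metric name: a name change means the previous MetricFamily is complete and can be shipped, with one final flush after the loop for the last family. The same group-by-sorted-key shape in miniature:

package main

import (
	"fmt"
	"sort"
)

// flushGroups walks a sorted list and emits one group per key, flushing
// the previous group whenever the key changes and once more at the end,
// the same shape as the federation MetricFamily loop.
func flushGroups(pairs [][2]string, emit func(key string, vals []string)) {
	sort.Slice(pairs, func(i, j int) bool { return pairs[i][0] < pairs[j][0] })

	var (
		lastKey string
		group   []string
	)
	for _, p := range pairs {
		if p[0] != lastKey {
			if group != nil {
				emit(lastKey, group) // ship the completed group
			}
			lastKey, group = p[0], nil
		}
		group = append(group, p[1])
	}
	if group != nil {
		emit(lastKey, group) // don't forget the final group
	}
}

func main() {
	flushGroups([][2]string{
		{"http_requests_total", "a"},
		{"up", "x"},
		{"http_requests_total", "b"},
	}, func(k string, vs []string) { fmt.Println(k, vs) })
}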