Example #1
// processHistograms derives rich metrics from histograms, currently
// percentiles, sum, count, and mean.
func (ms *MetricSystem) processHistograms(name string,
	valuesToCounts map[int16]*uint64) map[string]float64 {
	output := make(map[string]float64)
	totalSum := float64(0)
	totalCount := uint64(0)
	proportions := make([]proportion, 0, len(valuesToCounts))
	for compressedValue, count := range valuesToCounts {
		value := decompress(compressedValue)
		totalSum += value * float64(*count)
		totalCount += *count
		proportions = append(proportions, proportion{Value: value, Count: *count})
	}

	sumName := fmt.Sprintf("%s_sum", name)
	countName := fmt.Sprintf("%s_count", name)
	avgName := fmt.Sprintf("%s_avg", name)

	// record the interval count, sum, and mean
	output[countName] = float64(totalCount)
	output[sumName] = totalSum
	if totalCount > 0 {
		// guard against a NaN mean for an empty histogram
		output[avgName] = totalSum / float64(totalCount)
	}

	// increment aggregate sum and count
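	// double-checked locking: take the read lock for the fast path, and only
	// upgrade to the write lock (re-checking once it is held) when the
	// aggregate counters for this histogram do not exist yet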
	ms.histogramCountMu.RLock()
	_, present := ms.histogramCountStore[sumName]
	if !present {
		ms.histogramCountMu.RUnlock()
		ms.histogramCountMu.Lock()
		_, syncPresent := ms.histogramCountStore[sumName]
		if !syncPresent {
			var x uint64
			ms.histogramCountStore[sumName] = &x
			var z uint64
			ms.histogramCountStore[countName] = &z
		}
		ms.histogramCountMu.Unlock()
		ms.histogramCountMu.RLock()
	}
	atomic.AddUint64(ms.histogramCountStore[sumName], uint64(totalSum))
	atomic.AddUint64(ms.histogramCountStore[countName], totalCount)
	ms.histogramCountMu.RUnlock()

	for label, p := range ms.percentiles {
		value, err := percentile(totalCount, proportions, p)
		if err != nil {
			glog.Errorf("unable to calculate percentile: %s", err)
		} else {
			output[fmt.Sprintf(label, name)] = value
		}
	}
	return output
}
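processHistograms leans on a few helpers defined elsewhere in the package: decompress maps the compressed int16 key back to a float64 observation, proportion pairs a value with its count, and percentile derives a quantile from those pairs. The stand-ins below sketch plausible shapes for them so the function above can be read end to end; the log-scale encoding and the cumulative-count walk are illustrative assumptions, not the package's actual definitions.

// illustrative stand-ins for the helpers assumed by processHistograms;
// the real package defines its own compression and percentile logic
package sketch

import (
	"fmt"
	"math"
	"sort"
)

// proportion pairs a decompressed histogram value with its observation count.
type proportion struct {
	Value float64
	Count uint64
}

// decompress maps a compressed int16 bucket back to an approximate value,
// assuming a simple log-scale encoding for illustration.
func decompress(compressed int16) float64 {
	return math.Exp(float64(compressed) / 100)
}

// percentile returns the smallest recorded value v such that at least
// p*totalCount of the observations are <= v, for p in [0, 1].
func percentile(totalCount uint64, proportions []proportion,
	p float64) (float64, error) {
	if totalCount == 0 || len(proportions) == 0 {
		return 0, fmt.Errorf("no observations to take a percentile of")
	}
	sort.Slice(proportions, func(i, j int) bool {
		return proportions[i].Value < proportions[j].Value
	})
	target := uint64(math.Ceil(p * float64(totalCount)))
	var seen uint64
	for _, pr := range proportions {
		seen += pr.Count
		if seen >= target {
			return pr.Value, nil
		}
	}
	return proportions[len(proportions)-1].Value, nil
}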
Example #2
// reaper wakes up at each interval boundary,
// collects and processes metrics, and pushes
// them to the corresponding subscriber channels.
func (ms *MetricSystem) reaper() {
	ms.reaping = true

	// create goroutine pool to handle multiple processing tasks at once
	processChan := make(chan func(), 16)
	for i := 0; i < int(math.Max(float64(runtime.NumCPU()/4), 4)); i++ {
		go func() {
			for {
				c, ok := <-processChan
				if !ok {
					return
				}
				c()
			}
		}()
	}

	// begin reaper main loop
	for {
		// sleep until the next interval, or die if shutdownChan is closed
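		// align wakeups to wall-clock interval boundaries so reap times
		// do not drift with processing latency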
		tts := ms.interval.Nanoseconds() -
			(time.Now().UnixNano() % ms.interval.Nanoseconds())
		select {
		case <-time.After(time.Duration(tts)):
		case <-ms.shutdownChan:
			ms.reaping = false
			close(processChan)
			return
		}

		rawMetrics := ms.collectRawMetrics()

		ms.updateSubscribers()

		// broadcast raw metrics
		for subscriber := range ms.rawSubscribers {
			// new subscribers get all counters, otherwise just the new diffs
			select {
			case subscriber <- rawMetrics:
				delete(ms.rawBadSubscribers, subscriber)
			default:
				ms.rawBadSubscribers[subscriber]++
				glog.Error("a raw subscriber has allowed their channel to fill up. ",
					"dropping their metrics on the floor rather than blocking.")
				if ms.rawBadSubscribers[subscriber] >= 2 {
					glog.Error("this raw subscriber has caused dropped metrics at ",
						"least 3 times in a row.  closing the channel.")
					delete(ms.rawSubscribers, subscriber)
					close(subscriber)
				}
			}
		}

		// Perform the rest in another goroutine since processing is not
		// guaranteed to complete before the interval is up.
		sendProcessed := func() {
			// this is potentially expensive if there is a massive number of metrics
			processedMetrics := ms.processMetrics(rawMetrics)

			// add aggregate sum, count, and mean
			for name := range rawMetrics.Histograms {
				ms.histogramCountMu.RLock()
				aggCountPtr, countPresent :=
					ms.histogramCountStore[fmt.Sprintf("%s_count", name)]
				aggSumPtr, sumPresent :=
					ms.histogramCountStore[fmt.Sprintf("%s_sum", name)]
				ms.histogramCountMu.RUnlock()

				// only dereference the pointers once we know they exist;
				// atomic.LoadUint64 on a nil pointer would panic
				if !countPresent || !sumPresent {
					continue
				}
				aggCount := atomic.LoadUint64(aggCountPtr)
				aggSum := atomic.LoadUint64(aggSumPtr)

				if aggCount > 0 {
					processedMetrics.Metrics[fmt.Sprintf("%s_agg_avg", name)] =
						float64(aggSum) / float64(aggCount)
					processedMetrics.Metrics[fmt.Sprintf("%s_agg_count", name)] =
						float64(aggCount)
					processedMetrics.Metrics[fmt.Sprintf("%s_agg_sum", name)] =
						float64(aggSum)
				}
			}

			// broadcast processed metrics
			ms.subscribersMu.Lock()
			for subscriber := range ms.processedSubscribers {
				select {
				case subscriber <- processedMetrics:
					delete(ms.processedBadSubscribers, subscriber)
				default:
					ms.processedBadSubscribers[subscriber]++
					glog.Error("a subscriber has allowed their channel to fill up. ",
						"dropping their metrics on the floor rather than blocking.")
					if ms.processedBadSubscribers[subscriber] >= 2 {
						glog.Error("this subscriber has caused dropped metrics at ",
							"least 3 times in a row.  closing the channel.")
						delete(ms.processedSubscribers, subscriber)
						close(subscriber)
					}
				}
			}
			ms.subscribersMu.Unlock()
		}
		select {
		case processChan <- sendProcessed:
		default:
			// processChan has filled up, this metric load is not sustainable
			glog.Errorf("processing of metrics is taking longer than this node can "+
				"handle.  dropping this entire interval of %s metrics on the "+
				"floor rather than blocking the reaper.", rawMetrics.Time)
		}
	} // end main reaper loop
}
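In typical use the reaper runs for the lifetime of a MetricSystem, and consumers hand it buffered channels to receive each interval's output. The fragment below sketches that pattern; the constructor, Start method, subscription helper, and ProcessedMetricSet type named here are assumptions about the surrounding package's API rather than confirmed signatures, so treat it as an outline rather than compilable code.

// usage sketch only: NewMetricSystem, Start, SubscribeToProcessedMetrics, and
// ProcessedMetricSet are assumed names for the surrounding package's API
func consumeProcessedMetrics() {
	// hypothetical constructor: collect, process, and broadcast every minute
	ms := NewMetricSystem(time.Minute, true)
	ms.Start()

	// buffer the channel: the reaper drops an interval when the channel is
	// full and closes the subscription after two consecutive drops
	processed := make(chan *ProcessedMetricSet, 2)
	ms.SubscribeToProcessedMetrics(processed)

	go func() {
		for interval := range processed {
			// each interval carries derived values such as <name>_avg,
			// <name>_agg_avg, and the configured percentile keys
			fmt.Println(interval.Metrics["request_latency_agg_avg"])
		}
	}()
}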