// processHistograms derives rich metrics from histograms, currently
// percentiles, sum, count, and mean.
func (ms *MetricSystem) processHistograms(name string,
	valuesToCounts map[int16]*uint64) map[string]float64 {
	output := make(map[string]float64)
	totalSum := float64(0)
	totalCount := uint64(0)
	proportions := make([]proportion, 0, len(valuesToCounts))
	for compressedValue, count := range valuesToCounts {
		value := decompress(compressedValue)
		totalSum += value * float64(*count)
		totalCount += *count
		proportions = append(proportions,
			proportion{Value: value, Count: *count})
	}

	sumName := fmt.Sprintf("%s_sum", name)
	countName := fmt.Sprintf("%s_count", name)
	avgName := fmt.Sprintf("%s_avg", name)

	// increment interval sum and count
	output[countName] = float64(totalCount)
	output[sumName] = totalSum
	output[avgName] = totalSum / float64(totalCount)

	// increment aggregate sum and count
	ms.histogramCountMu.RLock()
	_, present := ms.histogramCountStore[sumName]
	if !present {
		ms.histogramCountMu.RUnlock()
		ms.histogramCountMu.Lock()
		_, syncPresent := ms.histogramCountStore[sumName]
		if !syncPresent {
			var x uint64
			ms.histogramCountStore[sumName] = &x
			var z uint64
			ms.histogramCountStore[countName] = &z
		}
		ms.histogramCountMu.Unlock()
		ms.histogramCountMu.RLock()
	}
	atomic.AddUint64(ms.histogramCountStore[sumName], uint64(totalSum))
	atomic.AddUint64(ms.histogramCountStore[countName], totalCount)
	ms.histogramCountMu.RUnlock()

	for label, p := range ms.percentiles {
		value, err := percentile(totalCount, proportions, p)
		if err != nil {
			glog.Errorf("unable to calculate percentile: %s", err)
		} else {
			output[fmt.Sprintf(label, name)] = value
		}
	}
	return output
}
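// A minimal sketch of how processHistograms is expected to be driven. It
// assumes ms.percentiles maps printf-style labels (each containing a single
// %s for the metric name) to the requested percentile; the example labels
// and the "request_latency" name below are illustrative assumptions, not
// values defined in this file:
//
//	ms.percentiles = map[string]float64{
//		"%s_min": 0,
//		"%s_50":  50,
//		"%s_99":  99,
//		"%s_max": 100,
//	}
//
//	out := ms.processHistograms("request_latency", valuesToCounts)
//	// out now holds request_latency_sum, request_latency_count,
//	// request_latency_avg, request_latency_50, request_latency_99, ...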
// reaper wakes up every <interval> seconds,
// collects and processes metrics, and pushes
// them to the corresponding subscribing channels.
func (ms *MetricSystem) reaper() {
	ms.reaping = true

	// create goroutine pool to handle multiple processing tasks at once
	processChan := make(chan func(), 16)
	for i := 0; i < int(math.Max(float64(runtime.NumCPU()/4), 4)); i++ {
		go func() {
			for {
				c, ok := <-processChan
				if !ok {
					return
				}
				c()
			}
		}()
	}

	// begin reaper main loop
	for {
		// sleep until the next interval, or die if shutdownChan is closed
		tts := ms.interval.Nanoseconds() -
			(time.Now().UnixNano() % ms.interval.Nanoseconds())
		select {
		case <-time.After(time.Duration(tts)):
		case <-ms.shutdownChan:
			ms.reaping = false
			close(processChan)
			return
		}

		rawMetrics := ms.collectRawMetrics()

		ms.updateSubscribers()

		// broadcast raw metrics
		for subscriber := range ms.rawSubscribers {
			// new subscribers get all counters, otherwise just the new diffs
			select {
			case subscriber <- rawMetrics:
				delete(ms.rawBadSubscribers, subscriber)
			default:
				ms.rawBadSubscribers[subscriber]++
				glog.Error("a raw subscriber has allowed their channel to fill up. ",
					"dropping their metrics on the floor rather than blocking.")
				if ms.rawBadSubscribers[subscriber] >= 2 {
					glog.Error("this raw subscriber has caused dropped metrics at ",
						"least 2 times in a row. closing the channel.")
					delete(ms.rawSubscribers, subscriber)
					close(subscriber)
				}
			}
		}

		// Perform the rest in another goroutine since processing is not
		// guaranteed to complete before the interval is up.
		sendProcessed := func() {
			// this is potentially expensive if there is a massive number of metrics
			processedMetrics := ms.processMetrics(rawMetrics)

			// add aggregate mean
			for name := range rawMetrics.Histograms {
				ms.histogramCountMu.RLock()
				aggCountPtr, countPresent :=
					ms.histogramCountStore[fmt.Sprintf("%s_count", name)]
				aggSumPtr, sumPresent :=
					ms.histogramCountStore[fmt.Sprintf("%s_sum", name)]
				ms.histogramCountMu.RUnlock()

				// only dereference the stored pointers after confirming they exist
				if countPresent && sumPresent {
					aggCount := atomic.LoadUint64(aggCountPtr)
					aggSum := atomic.LoadUint64(aggSumPtr)
					if aggCount > 0 {
						processedMetrics.Metrics[fmt.Sprintf("%s_agg_avg", name)] =
							float64(aggSum) / float64(aggCount)
						processedMetrics.Metrics[fmt.Sprintf("%s_agg_count", name)] =
							float64(aggCount)
						processedMetrics.Metrics[fmt.Sprintf("%s_agg_sum", name)] =
							float64(aggSum)
					}
				}
			}

			// broadcast processed metrics
			ms.subscribersMu.Lock()
			for subscriber := range ms.processedSubscribers {
				select {
				case subscriber <- processedMetrics:
					delete(ms.processedBadSubscribers, subscriber)
				default:
					ms.processedBadSubscribers[subscriber]++
					glog.Error("a subscriber has allowed their channel to fill up. ",
						"dropping their metrics on the floor rather than blocking.")
					if ms.processedBadSubscribers[subscriber] >= 2 {
						glog.Error("this subscriber has caused dropped metrics at ",
							"least 2 times in a row. closing the channel.")
						delete(ms.processedSubscribers, subscriber)
						close(subscriber)
					}
				}
			}
			ms.subscribersMu.Unlock()
		}
		select {
		case processChan <- sendProcessed:
		default:
			// processChan has filled up, this metric load is not sustainable
			glog.Errorf("processing of metrics is taking longer than this node can "+
				"handle. dropping this entire interval of %s metrics on the "+
				"floor rather than blocking the reaper.", rawMetrics.Time)
		}
	} // end main reaper loop
}
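// A sketch of the consumer side that the bad-subscriber logic above assumes:
// a subscriber registers a buffered channel and must drain it faster than one
// reaping interval, or the reaper starts dropping metric sets and eventually
// closes the channel. The SubscribeToProcessedMetrics and
// UnsubscribeFromProcessedMetrics names and the *ProcessedMetricSet element
// type are assumptions about the surrounding package, and sink is a
// placeholder for the caller's own forwarding function; this only illustrates
// the contract:
//
//	metricStream := make(chan *ProcessedMetricSet, 2)
//	ms.SubscribeToProcessedMetrics(metricStream)
//	defer ms.UnsubscribeFromProcessedMetrics(metricStream)
//
//	go func() {
//		for m := range metricStream {
//			// Forward m.Time and m.Metrics quickly; blocking here for
//			// longer than ms.interval risks dropped intervals and, after
//			// repeated misses, a closed channel.
//			sink(m.Time, m.Metrics)
//		}
//	}()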