// NewGaugeDiff builds a GaugeDiff. Only the Delta gauge is registered with
// the caller's registry r; Absolute and Previous are each placed in a fresh,
// unattached registry so they do not show up alongside the delta.
func NewGaugeDiff(name string, r metrics.Registry) *GaugeDiff {
	gd := new(GaugeDiff)
	gd.Delta = metrics.NewRegisteredGauge(name, r)
	gd.Absolute = metrics.NewRegisteredGauge(name+"-absolute", metrics.NewRegistry())
	gd.Previous = metrics.NewRegisteredGauge(name+"-previous", metrics.NewRegistry())
	return gd
}
// ExtraNewMetricRAM builds a MetricRAM. Free is registered with the caller's
// registry r; Total goes into a fresh, unattached registry, so only Free is
// reported through r. The platform-specific updater is attached as Extra.
func ExtraNewMetricRAM(r metrics.Registry, extra RAMUpdater) *MetricRAM {
	free := metrics.NewRegisteredGauge("memory.memory-free", r)
	total := metrics.NewRegisteredGauge("memory.memory-total", metrics.NewRegistry())
	return &MetricRAM{
		Free:  free,
		Total: total,
		Extra: extra,
	}
}
func (ir *IndexRegistry) GetOrRegisterPrivateDF(fs sigar.FileSystem) operating.MetricDF { ir.PrivateMutex.Lock() defer ir.PrivateMutex.Unlock() if fs.DirName == "/" { fs.DevName = "root" } else { fs.DevName = strings.Replace(strings.TrimPrefix(fs.DevName, "/dev/"), "/", "-", -1) } if metric := ir.PrivateDFRegistry.Get(fs.DevName); metric != nil { return metric.(operating.MetricDF) } label := func(tail string) string { return fmt.Sprintf("df-%s.df_complex-%s", fs.DevName, tail) } r, unusedr := ir.Registry, metrics.NewRegistry() i := operating.MetricDF{ DF: &operating.DF{ DevName: &operating.StandardMetricString{}, // unregistered DirName: &operating.StandardMetricString{}, // unregistered Free: metrics.NewRegisteredGaugeFloat64(label("free"), r), Reserved: metrics.NewRegisteredGaugeFloat64(label("reserved"), r), Total: metrics.NewRegisteredGauge(label("total"), unusedr), Used: metrics.NewRegisteredGaugeFloat64(label("used"), r), Avail: metrics.NewRegisteredGauge(label("avail"), unusedr), UsePercent: metrics.NewRegisteredGaugeFloat64(label("usepercent"), unusedr), Inodes: metrics.NewRegisteredGauge(label("inodes"), unusedr), Iused: metrics.NewRegisteredGauge(label("iused"), unusedr), Ifree: metrics.NewRegisteredGauge(label("ifree"), unusedr), IusePercent: metrics.NewRegisteredGaugeFloat64(label("iusepercent"), unusedr), }, } ir.PrivateDFRegistry.Register(fs.DevName, i) // error is ignored // errs when the type is not derived from (go-)metrics types return i }
// NewMetricRAM registers the used/buffered/cached RAM gauges on r and wraps
// them, together with the common free/total gauges, via ExtraNewMetricRAM.
func NewMetricRAM(r metrics.Registry) *operating.MetricRAM {
	extra := &ExtraMetricRAM{
		Used:     metrics.NewRegisteredGauge("memory.memory-used", r),
		Buffered: metrics.NewRegisteredGauge("memory.memory-buffered", r),
		Cached:   metrics.NewRegisteredGauge("memory.memory-cached", r),
	}
	return operating.ExtraNewMetricRAM(r, extra)
}
// Start calculating and reporting statistics on the repo and tiles. // // We presume the git.Update(true) is called somewhere else, usually this is done // in the trace/db.Builder, so the repo is always as good as the loaded tiles. func Start(nanoTileStore *db.Builder, git *gitinfo.GitInfo) { coverage := metrics.NewRegisteredGaugeFloat64("stats.tests.bench_runs_per_changelist", metrics.DefaultRegistry) skpLatency := metrics.NewRegisteredTimer("stats.skp.update_latency", metrics.DefaultRegistry) commits := metrics.NewRegisteredGauge("stats.commits.total", metrics.DefaultRegistry) go func() { for _ = range time.Tick(2 * time.Minute) { tile := nanoTileStore.GetTile() numCommits := tile.LastCommitIndex() + 1 numTraces := len(tile.Traces) total := 0 for _, tr := range tile.Traces { for i := 0; i < numCommits; i++ { if !tr.IsMissing(i) { total += 1 } } } cov := float64(total) / float64(numCommits*numTraces) glog.Info("Coverage: ", cov) coverage.Update(cov) last, err := git.LastSkpCommit() if err != nil { glog.Warning("Failed to read last SKP commit: %s", err) continue } skpLatency.Update(time.Since(last)) commits.Update(int64(git.NumCommits())) } }() }
// monitorIssueTracker reads the counts for all the types of issues in the skia // issue tracker (code.google.com/p/skia) and stuffs the counts into Graphite. func monitorIssueTracker() { c := &http.Client{ Transport: &http.Transport{ Dial: dialTimeout, }, } if *useMetadata { *apikey = metadata.Must(metadata.ProjectGet(metadata.APIKEY)) } // Create a new metrics registry for the issue tracker metrics. addr, err := net.ResolveTCPAddr("tcp", *graphiteServer) if err != nil { glog.Fatalln("Failed to resolve the Graphite server: ", err) } issueRegistry := metrics.NewRegistry() go graphite.Graphite(issueRegistry, common.SAMPLE_PERIOD, "issues", addr) // IssueStatus has all the info we need to capture and record a single issue status. I.e. capture // the count of all issues with a status of "New". type IssueStatus struct { Name string Metric metrics.Gauge URL string } allIssueStatusLabels := []string{ "New", "Accepted", "Unconfirmed", "Started", "Fixed", "Verified", "Invalid", "WontFix", "Done", "Available", "Assigned", } issueStatus := []*IssueStatus{} for _, issueName := range allIssueStatusLabels { issueStatus = append(issueStatus, &IssueStatus{ Name: issueName, Metric: metrics.NewRegisteredGauge(strings.ToLower(issueName), issueRegistry), URL: "https://www.googleapis.com/projecthosting/v2/projects/skia/issues?fields=totalResults&key=" + *apikey + "&status=" + issueName, }) } liveness := imetrics.NewLiveness("issue-tracker") for _ = range time.Tick(ISSUE_TRACKER_PERIOD) { for _, issue := range issueStatus { resp, err := c.Get(issue.URL) jsonResp := map[string]int64{} dec := json.NewDecoder(resp.Body) if err := dec.Decode(&jsonResp); err != nil { glog.Warningf("Failed to decode JSON response: %s", err) util.Close(resp.Body) continue } issue.Metric.Update(jsonResp["totalResults"]) glog.Infof("Num Issues: %s - %d", issue.Name, jsonResp["totalResults"]) if err == nil && resp.Body != nil { util.Close(resp.Body) } } liveness.Update() } }
func newConsumerMetrics(consumerName, prefix string) *ConsumerMetrics { kafkaMetrics := &ConsumerMetrics{ registry: metrics.DefaultRegistry, } // Ensure prefix ends with a dot (.) so it plays nice with statsd/graphite prefix = strings.Trim(prefix, " ") if prefix != "" && prefix[len(prefix)-1:] != "." { prefix += "." } kafkaMetrics.consumerName = consumerName kafkaMetrics.prefix = prefix kafkaMetrics.fetchersIdleTimer = metrics.NewRegisteredTimer(fmt.Sprintf("%sFetchersIdleTime-%s", prefix, consumerName), kafkaMetrics.registry) kafkaMetrics.fetchDurationTimer = metrics.NewRegisteredTimer(fmt.Sprintf("%sFetchDuration-%s", prefix, consumerName), kafkaMetrics.registry) kafkaMetrics.numWorkerManagersGauge = metrics.NewRegisteredGauge(fmt.Sprintf("%sNumWorkerManagers-%s", prefix, consumerName), kafkaMetrics.registry) kafkaMetrics.activeWorkersCounter = metrics.NewRegisteredCounter(fmt.Sprintf("%sWMsActiveWorkers-%s", prefix, consumerName), kafkaMetrics.registry) kafkaMetrics.pendingWMsTasksCounter = metrics.NewRegisteredCounter(fmt.Sprintf("%sWMsPendingTasks-%s", prefix, consumerName), kafkaMetrics.registry) kafkaMetrics.taskTimeoutCounter = metrics.NewRegisteredCounter(fmt.Sprintf("%sTaskTimeouts-%s", prefix, consumerName), kafkaMetrics.registry) kafkaMetrics.wmsBatchDurationTimer = metrics.NewRegisteredTimer(fmt.Sprintf("%sWMsBatchDuration-%s", prefix, consumerName), kafkaMetrics.registry) kafkaMetrics.wmsIdleTimer = metrics.NewRegisteredTimer(fmt.Sprintf("%sWMsIdleTime-%s", prefix, consumerName), kafkaMetrics.registry) kafkaMetrics.numFetchedMessagesCounter = metrics.NewRegisteredCounter(fmt.Sprintf("%sFetchedMessages-%s", prefix, consumerName), kafkaMetrics.registry) kafkaMetrics.numConsumedMessagesCounter = metrics.NewRegisteredCounter(fmt.Sprintf("%sConsumedMessages-%s", prefix, consumerName), kafkaMetrics.registry) kafkaMetrics.numAcksCounter = metrics.NewRegisteredCounter(fmt.Sprintf("%sAcks-%s", prefix, consumerName), kafkaMetrics.registry) 
kafkaMetrics.topicPartitionLag = make(map[TopicAndPartition]metrics.Gauge) kafkaMetrics.reportingStopChannels = make([]chan struct{}, 0) return kafkaMetrics }
func main() { defer common.LogPanic() common.InitWithMetrics("probeserver", graphiteServer) client, err := auth.NewDefaultJWTServiceAccountClient("https://www.googleapis.com/auth/userinfo.email") if err != nil { glog.Fatalf("Failed to create client for talking to the issue tracker: %s", err) } go monitorIssueTracker(client) glog.Infoln("Looking for Graphite server.") addr, err := net.ResolveTCPAddr("tcp", *graphiteServer) if err != nil { glog.Fatalln("Failed to resolve the Graphite server: ", err) } glog.Infoln("Found Graphite server.") liveness := imetrics.NewLiveness("probes") // We have two sets of metrics, one for the probes and one for the probe // server itself. The server's metrics are handled by common.Init() probeRegistry := metrics.NewRegistry() go graphite.Graphite(probeRegistry, common.SAMPLE_PERIOD, *prefix, addr) // TODO(jcgregorio) Monitor config file and reload if it changes. cfg, err := readConfigFiles(*config) if err != nil { glog.Fatalln("Failed to read config file: ", err) } glog.Infoln("Successfully read config file.") // Register counters for each probe. for name, probe := range cfg { probe.failure = metrics.NewRegisteredGauge(name+".failure", probeRegistry) probe.latency = metrics.NewRegisteredGauge(name+".latency", probeRegistry) } // Create a client that uses our dialer with a timeout. c := &http.Client{ Transport: &http.Transport{ Dial: dialTimeout, }, } probeOneRound(cfg, c) for _ = range time.Tick(*runEvery) { probeOneRound(cfg, c) liveness.Update() } }
// monitorIssueTracker reads the counts for all the types of issues in the Skia // issue tracker (bugs.chromium.org/p/skia) and stuffs the counts into Graphite. func monitorIssueTracker(c *http.Client) { // Create a new metrics registry for the issue tracker metrics. addr, err := net.ResolveTCPAddr("tcp", *graphiteServer) if err != nil { glog.Fatalln("Failed to resolve the Graphite server: ", err) } issueRegistry := metrics.NewRegistry() go graphite.Graphite(issueRegistry, common.SAMPLE_PERIOD, "issues", addr) // IssueStatus has all the info we need to capture and record a single issue status. I.e. capture // the count of all issues with a status of "New". type IssueStatus struct { Name string Metric metrics.Gauge URL string } allIssueStatusLabels := []string{ "New", "Accepted", "Unconfirmed", "Started", "Fixed", "Verified", "Invalid", "WontFix", "Done", "Available", "Assigned", } issueStatus := []*IssueStatus{} for _, issueName := range allIssueStatusLabels { q := url.Values{} q.Set("fields", "totalResults") q.Set("status", issueName) issueStatus = append(issueStatus, &IssueStatus{ Name: issueName, Metric: metrics.NewRegisteredGauge(strings.ToLower(issueName), issueRegistry), URL: issues.MONORAIL_BASE_URL + "?" + q.Encode(), }) } liveness := imetrics.NewLiveness("issue-tracker") for _ = range time.Tick(ISSUE_TRACKER_PERIOD) { for _, issue := range issueStatus { resp, err := c.Get(issue.URL) if err != nil { glog.Errorf("Failed to retrieve response from %s: %s", issue.URL, err) continue } jsonResp := map[string]int64{} dec := json.NewDecoder(resp.Body) if err := dec.Decode(&jsonResp); err != nil { glog.Warningf("Failed to decode JSON response: %s", err) util.Close(resp.Body) continue } issue.Metric.Update(jsonResp["totalResults"]) glog.Infof("Num Issues: %s - %d", issue.Name, jsonResp["totalResults"]) if err == nil && resp.Body != nil { util.Close(resp.Body) } } liveness.Update() } }
func (this *ConsumerMetrics) topicAndPartitionLag(topic string, partition int32) metrics.Gauge { topicAndPartition := TopicAndPartition{Topic: topic, Partition: partition} lag, ok := this.topicPartitionLag[topicAndPartition] if !ok { inLock(&this.metricLock, func() { lag, ok = this.topicPartitionLag[topicAndPartition] if !ok { this.topicPartitionLag[topicAndPartition] = metrics.NewRegisteredGauge(fmt.Sprintf("%sLag-%s-%s", this.prefix, this.consumerName, &topicAndPartition), this.registry) lag = this.topicPartitionLag[topicAndPartition] } }) } return lag }
func newConsumerMetrics(consumerName string) *consumerMetrics { kafkaMetrics := &consumerMetrics{ registry: metrics.NewRegistry(), } kafkaMetrics.fetchersIdleTimer = metrics.NewRegisteredTimer(fmt.Sprintf("FetchersIdleTime-%s", consumerName), kafkaMetrics.registry) kafkaMetrics.fetchDurationTimer = metrics.NewRegisteredTimer(fmt.Sprintf("FetchDuration-%s", consumerName), kafkaMetrics.registry) kafkaMetrics.numWorkerManagersGauge = metrics.NewRegisteredGauge(fmt.Sprintf("NumWorkerManagers-%s", consumerName), kafkaMetrics.registry) kafkaMetrics.activeWorkersCounter = metrics.NewRegisteredCounter(fmt.Sprintf("WMsActiveWorkers-%s", consumerName), kafkaMetrics.registry) kafkaMetrics.pendingWMsTasksCounter = metrics.NewRegisteredCounter(fmt.Sprintf("WMsPendingTasks-%s", consumerName), kafkaMetrics.registry) kafkaMetrics.wmsBatchDurationTimer = metrics.NewRegisteredTimer(fmt.Sprintf("WMsBatchDuration-%s", consumerName), kafkaMetrics.registry) kafkaMetrics.wmsIdleTimer = metrics.NewRegisteredTimer(fmt.Sprintf("WMsIdleTime-%s", consumerName), kafkaMetrics.registry) return kafkaMetrics }
func newConsumerMetrics(consumerName, prefix string) *ConsumerMetrics { kafkaMetrics := &ConsumerMetrics{ registry: metrics.DefaultRegistry, } // Ensure prefix ends with a dot (.) so it plays nice with statsd/graphite prefix = strings.Trim(prefix, " ") if prefix != "" && prefix[len(prefix)-1:] != "." { prefix += "." } kafkaMetrics.fetchersIdleTimer = metrics.NewRegisteredTimer(fmt.Sprintf("%sFetchersIdleTime-%s", prefix, consumerName), kafkaMetrics.registry) kafkaMetrics.fetchDurationTimer = metrics.NewRegisteredTimer(fmt.Sprintf("%sFetchDuration-%s", prefix, consumerName), kafkaMetrics.registry) kafkaMetrics.numWorkerManagersGauge = metrics.NewRegisteredGauge(fmt.Sprintf("%sNumWorkerManagers-%s", prefix, consumerName), kafkaMetrics.registry) kafkaMetrics.activeWorkersCounter = metrics.NewRegisteredCounter(fmt.Sprintf("%sWMsActiveWorkers-%s", prefix, consumerName), kafkaMetrics.registry) kafkaMetrics.pendingWMsTasksCounter = metrics.NewRegisteredCounter(fmt.Sprintf("%sWMsPendingTasks-%s", prefix, consumerName), kafkaMetrics.registry) kafkaMetrics.taskTimeoutCounter = metrics.NewRegisteredCounter(fmt.Sprintf("%sTaskTimeouts-%s", prefix, consumerName), kafkaMetrics.registry) kafkaMetrics.wmsBatchDurationTimer = metrics.NewRegisteredTimer(fmt.Sprintf("%sWMsBatchDuration-%s", prefix, consumerName), kafkaMetrics.registry) kafkaMetrics.wmsIdleTimer = metrics.NewRegisteredTimer(fmt.Sprintf("%sWMsIdleTime-%s", prefix, consumerName), kafkaMetrics.registry) return kafkaMetrics }
// NewGaugePercent builds a GaugePercent. Only the Percent gauge is registered
// with the caller's registry r; Previous lives in a fresh, unattached
// registry so it is not reported alongside the percentage.
func NewGaugePercent(name string, r metrics.Registry) *GaugePercent {
	gp := new(GaugePercent)
	gp.Percent = metrics.NewRegisteredGaugeFloat64(name, r)
	gp.Previous = metrics.NewRegisteredGauge(name+"-previous", metrics.NewRegistry())
	return gp
}
// NewMetricSwap registers the swap free/used gauges on r and returns them
// bundled as a MetricSwap value.
func NewMetricSwap(r metrics.Registry) MetricSwap {
	free := metrics.NewRegisteredGauge("swap.swap-free", r)
	used := metrics.NewRegisteredGauge("swap.swap-used", r)
	return MetricSwap{
		Free: free,
		Used: used,
	}
}
// dirWatcher watches for changes in the specified dir. The frequency of polling
// is determined by the duration parameter. dirWatcher ensures:
// * Each app's logs do not exceed the log limit threshold. If they do then the
//   oldest files are deleted.
// * New encountered logs are reported to InfluxDB.
//
// dirWatcher never returns: it polls forever and calls glog.Fatal on
// unrecoverable errors (walk failure, state write failure).
func dirWatcher(duration time.Duration, dir string) {
	// Resume from persisted state: per-file line/size snapshots, per
	// app.loglevel disk-space and line-count tallies, and the time of the
	// last completed polling run.
	filesToState, appLogLevelToSpace, appLogLevelToCount, lastCompletedRun, err := getPreviousState()
	if err != nil {
		glog.Fatalf("Could get access previous state: %s", err)
	}
	// Lazily-created gauges keyed by "app.loglevel".
	appLogLevelToMetric := make(map[string]metrics.Gauge)
	// Set whenever markFn processed at least one file this run; gates the
	// state write below.
	updatedFiles := false
	// markFn is the filepath.Walk callback; it closes over and mutates the
	// maps above, so the walk must not run concurrently with itself.
	markFn := func(path string, fileInfo os.FileInfo, err error) error {
		if err != nil {
			return err
		}
		if fileInfo.IsDir() || fileInfo.Mode()&os.ModeSymlink != 0 {
			// We are only interested in watching non-symlink log files in the
			// top-level dir.
			return nil
		}
		// Process a file only if it is new or was modified since the last
		// completed run.
		if _, exists := filesToState[path]; !exists || fileInfo.ModTime().After(lastCompletedRun) {
			glog.Infof("Processing %s", path)
			app, logLevel := getAppAndLogLevel(fileInfo)
			if app != "" && logLevel != "" {
				appLogLevel := fmt.Sprintf("%s.%s", app, logLevel)
				if _, ok := appLogLevelToMetric[appLogLevel]; !ok {
					// First time encountered this app and log level combination.
					// Create a counter metric.
					appLogLevelToMetric[appLogLevel] = metrics.NewRegisteredGauge("logserver."+appLogLevel, metrics.DefaultRegistry)
				}
				// Calculate how many new lines and new disk space usage there is.
				totalLines := getLineCount(path)
				totalSize := fileInfo.Size()
				newLines := totalLines
				newSpace := totalSize
				if exists {
					// NOTE: this local `fileState` shadows the fileState type
					// inside this if-block only.
					fileState := filesToState[path]
					newLines = totalLines - fileState.LineCount
					newSpace = totalSize - fileState.Size
				}
				glog.Infof("Processed %d new lines", newLines)
				glog.Infof("Processed %d new bytes", newSpace)
				// Update the logs count metric.
				appLogLevelToCount[appLogLevel] += newLines
				appLogLevelToMetric[appLogLevel].Update(appLogLevelToCount[appLogLevel])
				// Add the file size to the current space count for this app and
				// log level combination.
				appLogLevelToSpace[appLogLevel] += newSpace
				updatedFiles = true
			}
			// Snapshot the file's current line count and size for the next run.
			// NOTE(review): getLineCount(path) is called a second time here —
			// presumably cheap enough, but it re-reads the file; confirm.
			filesToState[path] = fileState{LineCount: getLineCount(path), Size: fileInfo.Size()}
		}
		return nil
	}
	// Poll forever at the requested interval.
	for _ = range time.Tick(duration) {
		if err := filepath.Walk(dir, markFn); err != nil {
			glog.Fatal(err)
		}
		// Enforce the per-app disk limit; reports whether anything was deleted.
		deletedFiles := cleanupAppLogs(dir, appLogLevelToSpace, filesToState)
		// Persist state only when something actually changed this run.
		if updatedFiles || deletedFiles {
			if err := writeCurrentState(filesToState, appLogLevelToSpace, appLogLevelToCount, time.Now()); err != nil {
				glog.Fatalf("Could not write state: %s", err)
			}
			glog.Info(getPrettyMap(appLogLevelToCount, "AppLogLevels to their line counts"))
			glog.Info(getPrettyMap(appLogLevelToSpace, "AppLogLevels to their disk space"))
		}
		updatedFiles = false
		lastCompletedRun = time.Now()
	}
}
// newGauge registers a gauge named "ingester.<name>.gauge.<suffix>" on the
// default registry and returns it.
func newGauge(name, suffix string) metrics.Gauge {
	metricName := "ingester." + name + ".gauge." + suffix
	return metrics.NewRegisteredGauge(metricName, metrics.DefaultRegistry)
}
)

const (
	// CLUSTER_SIZE and CLUSTER_STDDEV are clustering parameters; their exact
	// semantics are defined by the clustering code elsewhere in this package.
	CLUSTER_SIZE   = 50
	CLUSTER_STDDEV = 0.001

	// TRACKED_ITEM_URL_TEMPLATE is used to generate the URL that is
	// embedded in an issue. It is also used to search for issues linked to a
	// specific item (cluster). The format verb is to be replaced with the ID
	// of the tracked item.
	TRACKED_ITEM_URL_TEMPLATE = "https://perf.skia.org/cl/%d"
)

var (
	// The number of clusters with a status of "New".
	newClustersGauge = metrics.NewRegisteredGauge("alerting.new", metrics.DefaultRegistry)

	// The number of times we've successfully done alert clustering.
	runsCounter = metrics.NewRegisteredCounter("alerting.runs", metrics.DefaultRegistry)

	// How long it takes to do a clustering run.
	alertingLatency = metrics.NewRegisteredTimer("alerting.latency", metrics.DefaultRegistry)

	// tileBuilder is the tracedb.Builder where we load Tiles from.
	tileBuilder *tracedb.Builder
)

// CombineClusters combines freshly found clusters with existing clusters.
//
// Algorithm:
//   Run clustering and pick out the "Interesting" clusters.