// startCtfeMetrics registers gauges with the graphite server that indicate CT is running healthily // and starts a goroutine to update them periodically. func startCtfeMetrics() { pendingTasksGauge := metrics.GetOrRegisterGauge("num-pending-tasks", metrics.DefaultRegistry) oldestPendingTaskAgeGauge := metrics.GetOrRegisterGaugeFloat64("oldest-pending-task-age", metrics.DefaultRegistry) // 0=no tasks pending; 1=started; 2=not started oldestPendingTaskStatusGauge := metrics.GetOrRegisterGauge("oldest-pending-task-status", metrics.DefaultRegistry) go func() { for _ = range time.Tick(common.SAMPLE_PERIOD) { pendingTaskCount, err := pending_tasks.GetPendingTaskCount() if err != nil { glog.Error(err) } else { pendingTasksGauge.Update(pendingTaskCount) } oldestPendingTask, err := pending_tasks.GetOldestPendingTask() if err != nil { glog.Error(err) } else if oldestPendingTask == nil { oldestPendingTaskAgeGauge.Update(0) oldestPendingTaskStatusGauge.Update(0) } else { addedTime := ctutil.GetTimeFromTs(strconv.FormatInt(oldestPendingTask.GetCommonCols().TsAdded.Int64, 10)) oldestPendingTaskAgeGauge.Update(time.Since(addedTime).Seconds()) if oldestPendingTask.GetCommonCols().TsStarted.Valid { oldestPendingTaskStatusGauge.Update(1) } else { oldestPendingTaskStatusGauge.Update(2) } } } }() }
// monitorStatus sets up the monitoring routine, which reports how big the work queues are // and how many processes are up. func (agg *BinaryAggregator) monitorStatus(numAnalysisProcesses, numUploadProcesses int) { defer agg.monitoringWaitGroup.Done() analysisProcessCount := go_metrics.GetOrRegisterCounter("analysis_process_count", go_metrics.DefaultRegistry) analysisProcessCount.Clear() analysisProcessCount.Inc(int64(numAnalysisProcesses)) uploadProcessCount := go_metrics.GetOrRegisterCounter("upload_process_count", go_metrics.DefaultRegistry) uploadProcessCount.Clear() uploadProcessCount.Inc(int64(numUploadProcesses)) t := time.Tick(config.Aggregator.StatusPeriod) for { select { case <-agg.monitoringShutdown: glog.Infof("aggregator monitor got signal to shut down") return case <-t: go_metrics.GetOrRegisterGauge("binary_analysis_queue_size", go_metrics.DefaultRegistry).Update(int64(len(agg.forAnalysis))) go_metrics.GetOrRegisterGauge("binary_upload_queue_size", go_metrics.DefaultRegistry).Update(int64(len(agg.forUpload))) go_metrics.GetOrRegisterGauge("binary_bug_report_queue_size", go_metrics.DefaultRegistry).Update(int64(len(agg.forBugReporting))) } } }
func NewHeartbeatStatusTracker() StatusTracker { h := &heartbeatStatusTracker{} h.currentStatusGauge = metrics.GetOrRegisterGauge("current-status", metrics.DefaultRegistry) h.taskDurations = make(map[TaskType]metrics.Histogram) for t := UPDATE_AND_BUILD; t <= POLL; t++ { // Using the values from metrics.NewTimer(). s := metrics.NewExpDecaySample(1028, 0.015) h.taskDurations[t] = metrics.GetOrRegisterHistogram(fmt.Sprintf("duration-%s", t), metrics.DefaultRegistry, s) } h.lastSuccessTime = make(map[TaskType]time.Time) h.lastFailureTime = make(map[TaskType]time.Time) return h }
func NewMetrics() *ServerMetrics { m := new(ServerMetrics) m.qCounter = metrics.GetOrRegisterMeter("queries", nil) m.lastQueryCount = m.qCounter.Count() m.queriesHistogram = metrics.GetOrRegisterHistogram( "queries-histogram", nil, metrics.NewExpDecaySample(600, 0.015), ) m.goroutines = metrics.GetOrRegisterGauge("goroutines", nil) return m }
// scanHelper runs findBadBinaryPaths, logs the output and keeps alreadyFoundBinaries up to date. func (agg *BinaryAggregator) scanHelper(alreadyFoundBinaries *SortedStringSlice) error { newlyFound, err := findBadBinaryPaths(alreadyFoundBinaries) if err != nil { return err } // AFL-fuzz does not write crashes or hangs atomically, so this workaround waits for a bit after // we have references to where the crashes will be. // TODO(kjlubick), switch to using flock once afl-fuzz implements that upstream. time.Sleep(time.Second) go_metrics.GetOrRegisterGauge("binary_newly_found_fuzzes", go_metrics.DefaultRegistry).Update(int64(len(newlyFound))) glog.Infof("%d newly found bad binary fuzzes", len(newlyFound)) for _, f := range newlyFound { agg.forAnalysis <- f } alreadyFoundBinaries.Append(newlyFound) return nil }
// Gauge returns, as an accounting.Gauge, the result of
// metrics.GetOrRegisterGauge for name in metrics.DefaultRegistry.
func (g *goMetricRegistry) Gauge(name string) accounting.Gauge {
	gauge := metrics.GetOrRegisterGauge(name, metrics.DefaultRegistry)
	return gauge
}
// StartMetrics registers gauges with the graphite server that indicate the poller is running // healthily and starts a goroutine to update them periodically. func (h *heartbeatStatusTracker) StartMetrics() { timeSinceLastUpdateGauge := metrics.GetOrRegisterGaugeFloat64("time-since-last-update", metrics.DefaultRegistry) healthyGauge := metrics.GetOrRegisterGauge("healthy", metrics.DefaultRegistry) timeSinceLastSuccess := make(map[TaskType]metrics.GaugeFloat64) timeSinceLastFailure := make(map[TaskType]metrics.GaugeFloat64) for t := UPDATE_AND_BUILD; t <= POLL; t++ { timeSinceLastSuccess[t] = metrics.GetOrRegisterGaugeFloat64(fmt.Sprintf("time-since-last-success-%s", t), metrics.DefaultRegistry) timeSinceLastFailure[t] = metrics.GetOrRegisterGaugeFloat64(fmt.Sprintf("time-since-last-failure-%s", t), metrics.DefaultRegistry) } go func() { for _ = range time.Tick(common.SAMPLE_PERIOD) { h.mu.Lock() timeSinceLastUpdate := time.Since(h.lastUpdate) currentStatus := h.currentStatus for t := UPDATE_AND_BUILD; t <= POLL; t++ { if v, ok := h.lastSuccessTime[t]; ok { timeSinceLastSuccess[t].Update(time.Since(v).Seconds()) } if v, ok := h.lastFailureTime[t]; ok { timeSinceLastFailure[t].Update(time.Since(v).Seconds()) } } lastSuccessfulPoll := h.lastSuccessTime[POLL] errs := h.errs h.errs = nil h.mu.Unlock() timeSinceLastUpdateGauge.Update(timeSinceLastUpdate.Seconds()) expectPoll := false var expectedDuration time.Duration = 0 switch currentStatus { case IDLE, POLL: expectPoll = true expectedDuration = *pollInterval case UPDATE_AND_BUILD: expectedDuration = ctutil.GIT_PULL_TIMEOUT + ctutil.MAKE_ALL_TIMEOUT case CHROMIUM_PERF: expectedDuration = ctutil.MASTER_SCRIPT_RUN_CHROMIUM_PERF_TIMEOUT case CAPTURE_SKPS: expectedDuration = ctutil.MASTER_SCRIPT_CAPTURE_SKPS_TIMEOUT case LUA_SCRIPT: expectedDuration = ctutil.MASTER_SCRIPT_RUN_LUA_TIMEOUT case CHROMIUM_BUILD: expectedDuration = ctutil.MASTER_SCRIPT_BUILD_CHROMIUM_TIMEOUT case RECREATE_PAGE_SETS: expectedDuration = 
ctutil.MASTER_SCRIPT_CREATE_PAGESETS_TIMEOUT case RECREATE_WEBPAGE_ARCHIVES: expectedDuration = ctutil.MASTER_SCRIPT_CAPTURE_ARCHIVES_TIMEOUT case CHECK_WORKER_HEALTH: expectedDuration = ctutil.CHECK_WORKERS_HEALTH_TIMEOUT } // Provide a bit of head room. expectedDuration += 2 * time.Minute if expectPoll && time.Since(lastSuccessfulPoll) > 2*time.Minute { errs = append(errs, fmt.Errorf("Last successful poll was at %s.", lastSuccessfulPoll)) } if timeSinceLastUpdate > expectedDuration { errs = append(errs, fmt.Errorf("Task %s has not finished after %s.", currentStatus, timeSinceLastUpdate)) } if len(errs) > 0 { for _, err := range errs { glog.Error(err) } healthyGauge.Update(0) } else { healthyGauge.Update(1) } } }() }
func main() { defer common.LogPanic() // Global init to initialize glog and parse arguments. common.InitWithMetrics("datahopper", graphiteServer) // Shared repo objects. skiaRepo, err := gitinfo.CloneOrUpdate(SKIA_REPO, path.Join(*workdir, "datahopper_skia"), true) if err != nil { glog.Fatal(err) } infraRepo, err := gitinfo.CloneOrUpdate(INFRA_REPO, path.Join(*workdir, "datahopper_infra"), true) if err != nil { glog.Fatal(err) } go func() { for _ = range time.Tick(5 * time.Minute) { if err := skiaRepo.Update(true, true); err != nil { glog.Errorf("Failed to sync Skia repo: %v", err) } if err := infraRepo.Update(true, true); err != nil { glog.Errorf("Failed to sync Infra repo: %v", err) } } }() // Data generation goroutines. db, err := buildbot.NewLocalDB(path.Join(*workdir, "buildbot.db")) if err != nil { glog.Fatal(err) } // Buildbot data ingestion. if err := buildbot.IngestNewBuildsLoop(db, *workdir); err != nil { glog.Fatal(err) } // Run a server for the buildbot data. if err := buildbot.RunBuildServer(*grpcPort, db); err != nil { glog.Fatal(err) } // Measure buildbot data ingestion progress. totalGuage := go_metrics.GetOrRegisterGauge("buildbot.builds.total", go_metrics.DefaultRegistry) ingestGuage := go_metrics.GetOrRegisterGauge("buildbot.builds.ingested", go_metrics.DefaultRegistry) go func() { for _ = range time.Tick(common.SAMPLE_PERIOD) { totalBuilds, err := buildbot.NumTotalBuilds() if err != nil { glog.Error(err) continue } ingestedBuilds, err := db.NumIngestedBuilds() if err != nil { glog.Error(err) continue } totalGuage.Update(int64(totalBuilds)) ingestGuage.Update(int64(ingestedBuilds)) } }() // Average build and step time. 
go func() { period := 24 * time.Hour for _ = range time.Tick(10 * time.Minute) { glog.Info("Loading build and buildstep duration data.") end := time.Now().UTC() start := end.Add(-period) builds, err := db.GetBuildsFromDateRange(start, end) if err != nil { glog.Errorf("Failed to obtain build and buildstep duration data: %s", err) continue } for _, b := range builds { if !b.IsFinished() { continue } // Report build time. // app.host.measurement.measurement.builder.measurement* d := b.Finished.Sub(b.Started) metric := fmt.Sprintf("buildbot.builds.%s.duration", fixName(b.Builder)) metrics.GetOrRegisterSlidingWindow(metric, metrics.DEFAULT_WINDOW).Update(int64(d)) for _, s := range b.Steps { if !s.IsFinished() { continue } // app.host.measurement.measurement.builder.step.measurement* d := s.Finished.Sub(s.Started) metric := fmt.Sprintf("buildbot.buildstepsbybuilder.%s.%s.duration", fixName(b.Builder), fixName(s.Name)) metrics.GetOrRegisterSlidingWindow(metric, metrics.DEFAULT_WINDOW).Update(int64(d)) } } } }() // Number of commits in the repo. go func() { skiaGauge := go_metrics.GetOrRegisterGauge("repo.skia.commits", go_metrics.DefaultRegistry) infraGauge := go_metrics.GetOrRegisterGauge("repo.infra.commits", go_metrics.DefaultRegistry) for _ = range time.Tick(5 * time.Minute) { skiaGauge.Update(int64(skiaRepo.NumCommits())) infraGauge.Update(int64(infraRepo.NumCommits())) } }() // Run a backup server. go func() { glog.Fatal(buildbot.RunBackupServer(db, *httpPort)) }() // Wait while the above goroutines generate data. select {} }