Example #1
0
// startCtfeMetrics registers the CT health gauges in the default metrics registry and
// launches a goroutine that refreshes them once per common.SAMPLE_PERIOD.
func startCtfeMetrics() {
	numPending := metrics.GetOrRegisterGauge("num-pending-tasks", metrics.DefaultRegistry)
	oldestAge := metrics.GetOrRegisterGaugeFloat64("oldest-pending-task-age", metrics.DefaultRegistry)
	// Status encoding: 0=no tasks pending; 1=started; 2=not started.
	oldestStatus := metrics.GetOrRegisterGauge("oldest-pending-task-status", metrics.DefaultRegistry)

	go func() {
		for range time.Tick(common.SAMPLE_PERIOD) {
			// Pending task count; on failure the gauge keeps its previous value.
			if count, err := pending_tasks.GetPendingTaskCount(); err != nil {
				glog.Error(err)
			} else {
				numPending.Update(count)
			}

			// Age and status of the oldest pending task.
			oldest, err := pending_tasks.GetOldestPendingTask()
			switch {
			case err != nil:
				glog.Error(err)
			case oldest == nil:
				// Nothing is pending, so reset both gauges.
				oldestAge.Update(0)
				oldestStatus.Update(0)
			default:
				cols := oldest.GetCommonCols()
				added := ctutil.GetTimeFromTs(strconv.FormatInt(cols.TsAdded.Int64, 10))
				oldestAge.Update(time.Since(added).Seconds())
				if cols.TsStarted.Valid {
					oldestStatus.Update(1)
				} else {
					oldestStatus.Update(2)
				}
			}
		}
	}()
}
// monitorStatus runs the aggregator's monitoring loop. It first publishes the number of
// analysis and upload processes as counters, then, once per config.Aggregator.StatusPeriod,
// reports the current lengths of the analysis, upload and bug-reporting queues as gauges.
// It returns when a receive on agg.monitoringShutdown succeeds, and marks
// agg.monitoringWaitGroup as done on the way out.
func (agg *BinaryAggregator) monitorStatus(numAnalysisProcesses, numUploadProcesses int) {
	defer agg.monitoringWaitGroup.Done()

	// The counters may already exist in the registry, so clear them before adding the
	// current process counts.
	analysisProcessCount := go_metrics.GetOrRegisterCounter("analysis_process_count", go_metrics.DefaultRegistry)
	analysisProcessCount.Clear()
	analysisProcessCount.Inc(int64(numAnalysisProcesses))
	uploadProcessCount := go_metrics.GetOrRegisterCounter("upload_process_count", go_metrics.DefaultRegistry)
	uploadProcessCount.Clear()
	uploadProcessCount.Inc(int64(numUploadProcesses))

	// Use an explicit Ticker instead of time.Tick so that it can be stopped when the
	// monitor shuts down; a ticker created by time.Tick cannot be stopped. A non-positive
	// period leaves tick nil, which never fires, matching what time.Tick does for such
	// periods (time.NewTicker would panic instead).
	var tick <-chan time.Time
	if period := config.Aggregator.StatusPeriod; period > 0 {
		ticker := time.NewTicker(period)
		defer ticker.Stop()
		tick = ticker.C
	}

	for {
		select {
		case <-agg.monitoringShutdown:
			glog.Infof("aggregator monitor got signal to shut down")
			return
		case <-tick:
			go_metrics.GetOrRegisterGauge("binary_analysis_queue_size", go_metrics.DefaultRegistry).Update(int64(len(agg.forAnalysis)))
			go_metrics.GetOrRegisterGauge("binary_upload_queue_size", go_metrics.DefaultRegistry).Update(int64(len(agg.forUpload)))
			go_metrics.GetOrRegisterGauge("binary_bug_report_queue_size", go_metrics.DefaultRegistry).Update(int64(len(agg.forBugReporting)))
		}
	}
}
Example #3
0
// NewHeartbeatStatusTracker returns a StatusTracker backed by a heartbeatStatusTracker.
// It registers the "current-status" gauge and one "duration-<task>" histogram for every
// TaskType from UPDATE_AND_BUILD through POLL in the default metrics registry, and starts
// with empty last-success and last-failure time maps.
func NewHeartbeatStatusTracker() StatusTracker {
	tracker := &heartbeatStatusTracker{
		currentStatusGauge: metrics.GetOrRegisterGauge("current-status", metrics.DefaultRegistry),
		taskDurations:      make(map[TaskType]metrics.Histogram),
		lastSuccessTime:    make(map[TaskType]time.Time),
		lastFailureTime:    make(map[TaskType]time.Time),
	}
	for taskType := UPDATE_AND_BUILD; taskType <= POLL; taskType++ {
		// Sample parameters are the values from metrics.NewTimer().
		sample := metrics.NewExpDecaySample(1028, 0.015)
		name := fmt.Sprintf("duration-%s", taskType)
		tracker.taskDurations[taskType] = metrics.GetOrRegisterHistogram(name, metrics.DefaultRegistry, sample)
	}
	return tracker
}
Example #4
0
// NewMetrics builds a ServerMetrics whose "queries" meter, "queries-histogram" histogram
// and "goroutines" gauge are fetched from (or registered in) the registry selected by a
// nil registry argument. lastQueryCount is seeded with the meter's count at construction.
func NewMetrics() *ServerMetrics {
	queries := metrics.GetOrRegisterMeter("queries", nil)
	sample := metrics.NewExpDecaySample(600, 0.015)

	return &ServerMetrics{
		qCounter:         queries,
		lastQueryCount:   queries.Count(),
		queriesHistogram: metrics.GetOrRegisterHistogram("queries-histogram", nil, sample),
		goroutines:       metrics.GetOrRegisterGauge("goroutines", nil),
	}
}
// scanHelper performs one scan pass: it asks findBadBinaryPaths for binaries not already in
// alreadyFoundBinaries, reports and logs how many were found, queues each one on
// agg.forAnalysis and finally appends them to alreadyFoundBinaries. An error from
// findBadBinaryPaths is returned as-is and nothing else happens.
func (agg *BinaryAggregator) scanHelper(alreadyFoundBinaries *SortedStringSlice) error {
	fresh, err := findBadBinaryPaths(alreadyFoundBinaries)
	if err != nil {
		return err
	}

	// AFL-fuzz does not write crashes or hangs atomically, so pause briefly once we know
	// where the crashes will be before handing the paths on.
	// TODO(kjlubick), switch to using flock once afl-fuzz implements that upstream.
	time.Sleep(time.Second)

	found := len(fresh)
	newlyFoundGauge := go_metrics.GetOrRegisterGauge("binary_newly_found_fuzzes", go_metrics.DefaultRegistry)
	newlyFoundGauge.Update(int64(found))
	glog.Infof("%d newly found bad binary fuzzes", found)

	for i := range fresh {
		agg.forAnalysis <- fresh[i]
	}
	alreadyFoundBinaries.Append(fresh)
	return nil
}
Example #6
0
// Gauge returns the gauge called name from the default metrics registry, registering it
// first if it does not exist yet.
func (g *goMetricRegistry) Gauge(name string) accounting.Gauge {
	gauge := metrics.GetOrRegisterGauge(name, metrics.DefaultRegistry)
	return gauge
}
Example #7
0
// StartMetrics registers the poller health gauges in the default metrics registry and
// launches a goroutine that refreshes them once per common.SAMPLE_PERIOD. On every sample
// it also drains the errors accumulated in h.errs and sets the "healthy" gauge to 1 when
// nothing is wrong, or to 0 after logging each problem.
func (h *heartbeatStatusTracker) StartMetrics() {
	sinceUpdateGauge := metrics.GetOrRegisterGaugeFloat64("time-since-last-update", metrics.DefaultRegistry)
	healthyGauge := metrics.GetOrRegisterGauge("healthy", metrics.DefaultRegistry)
	sinceSuccessGauges := make(map[TaskType]metrics.GaugeFloat64)
	sinceFailureGauges := make(map[TaskType]metrics.GaugeFloat64)
	for task := UPDATE_AND_BUILD; task <= POLL; task++ {
		sinceSuccessGauges[task] = metrics.GetOrRegisterGaugeFloat64(fmt.Sprintf("time-since-last-success-%s", task), metrics.DefaultRegistry)
		sinceFailureGauges[task] = metrics.GetOrRegisterGaugeFloat64(fmt.Sprintf("time-since-last-failure-%s", task), metrics.DefaultRegistry)
	}

	go func() {
		for range time.Tick(common.SAMPLE_PERIOD) {
			// Read (and, for errs, take ownership of) the tracker state under the lock.
			h.mu.Lock()
			sinceUpdate := time.Since(h.lastUpdate)
			status := h.currentStatus
			for task := UPDATE_AND_BUILD; task <= POLL; task++ {
				// Only tasks that have a recorded success/failure update their gauge.
				if when, ok := h.lastSuccessTime[task]; ok {
					sinceSuccessGauges[task].Update(time.Since(when).Seconds())
				}
				if when, ok := h.lastFailureTime[task]; ok {
					sinceFailureGauges[task].Update(time.Since(when).Seconds())
				}
			}
			lastPollSuccess := h.lastSuccessTime[POLL]
			problems := h.errs
			h.errs = nil
			h.mu.Unlock()

			sinceUpdateGauge.Update(sinceUpdate.Seconds())

			// Work out how long the current status may last and whether polls are expected
			// to be succeeding while in it. Statuses not listed keep a zero limit.
			pollExpected := false
			var limit time.Duration
			switch status {
			case IDLE, POLL:
				pollExpected = true
				limit = *pollInterval
			case UPDATE_AND_BUILD:
				limit = ctutil.GIT_PULL_TIMEOUT + ctutil.MAKE_ALL_TIMEOUT
			case CHROMIUM_PERF:
				limit = ctutil.MASTER_SCRIPT_RUN_CHROMIUM_PERF_TIMEOUT
			case CAPTURE_SKPS:
				limit = ctutil.MASTER_SCRIPT_CAPTURE_SKPS_TIMEOUT
			case LUA_SCRIPT:
				limit = ctutil.MASTER_SCRIPT_RUN_LUA_TIMEOUT
			case CHROMIUM_BUILD:
				limit = ctutil.MASTER_SCRIPT_BUILD_CHROMIUM_TIMEOUT
			case RECREATE_PAGE_SETS:
				limit = ctutil.MASTER_SCRIPT_CREATE_PAGESETS_TIMEOUT
			case RECREATE_WEBPAGE_ARCHIVES:
				limit = ctutil.MASTER_SCRIPT_CAPTURE_ARCHIVES_TIMEOUT
			case CHECK_WORKER_HEALTH:
				limit = ctutil.CHECK_WORKERS_HEALTH_TIMEOUT
			}
			// Provide a bit of head room.
			limit += 2 * time.Minute

			if pollExpected && time.Since(lastPollSuccess) > 2*time.Minute {
				problems = append(problems, fmt.Errorf("Last successful poll was at %s.", lastPollSuccess))
			}
			if sinceUpdate > limit {
				problems = append(problems, fmt.Errorf("Task %s has not finished after %s.", status, sinceUpdate))
			}

			// Log every problem; any problem at all marks the poller unhealthy.
			for _, problem := range problems {
				glog.Error(problem)
			}
			if len(problems) == 0 {
				healthyGauge.Update(1)
			} else {
				healthyGauge.Update(0)
			}
		}
	}()
}
Example #8
0
// main wires up datahopper: it syncs the Skia and Infra repos, opens the local buildbot
// database, starts buildbot ingestion and the build server, and launches background
// goroutines that periodically publish ingestion progress, build/step durations and repo
// commit counts as metrics. It then blocks forever.
func main() {
	defer common.LogPanic()

	// Global init to initialize glog and parse arguments.
	common.InitWithMetrics("datahopper", graphiteServer)

	// Shared repo objects.
	skia, err := gitinfo.CloneOrUpdate(SKIA_REPO, path.Join(*workdir, "datahopper_skia"), true)
	if err != nil {
		glog.Fatal(err)
	}
	infra, err := gitinfo.CloneOrUpdate(INFRA_REPO, path.Join(*workdir, "datahopper_infra"), true)
	if err != nil {
		glog.Fatal(err)
	}
	// Keep both checkouts fresh; a failed sync is logged and retried on the next tick.
	go func() {
		for range time.Tick(5 * time.Minute) {
			if syncErr := skia.Update(true, true); syncErr != nil {
				glog.Errorf("Failed to sync Skia repo: %v", syncErr)
			}
			if syncErr := infra.Update(true, true); syncErr != nil {
				glog.Errorf("Failed to sync Infra repo: %v", syncErr)
			}
		}
	}()

	// Data generation goroutines.
	buildDB, err := buildbot.NewLocalDB(path.Join(*workdir, "buildbot.db"))
	if err != nil {
		glog.Fatal(err)
	}

	// Buildbot data ingestion.
	if err := buildbot.IngestNewBuildsLoop(buildDB, *workdir); err != nil {
		glog.Fatal(err)
	}

	// Run a server for the buildbot data.
	if err := buildbot.RunBuildServer(*grpcPort, buildDB); err != nil {
		glog.Fatal(err)
	}

	// Measure buildbot data ingestion progress.
	totalGauge := go_metrics.GetOrRegisterGauge("buildbot.builds.total", go_metrics.DefaultRegistry)
	ingestedGauge := go_metrics.GetOrRegisterGauge("buildbot.builds.ingested", go_metrics.DefaultRegistry)
	go func() {
		for range time.Tick(common.SAMPLE_PERIOD) {
			// Both numbers must be available before either gauge is updated.
			total, err := buildbot.NumTotalBuilds()
			if err != nil {
				glog.Error(err)
				continue
			}
			ingested, err := buildDB.NumIngestedBuilds()
			if err != nil {
				glog.Error(err)
				continue
			}
			totalGauge.Update(int64(total))
			ingestedGauge.Update(int64(ingested))
		}
	}()

	// Average build and step time.
	go func() {
		const lookback = 24 * time.Hour
		for range time.Tick(10 * time.Minute) {
			glog.Info("Loading build and buildstep duration data.")
			until := time.Now().UTC()
			since := until.Add(-lookback)
			builds, err := buildDB.GetBuildsFromDateRange(since, until)
			if err != nil {
				glog.Errorf("Failed to obtain build and buildstep duration data: %s", err)
				continue
			}
			for _, build := range builds {
				if !build.IsFinished() {
					continue
				}
				// Report build time.
				// app.host.measurement.measurement.builder.measurement*
				buildTime := build.Finished.Sub(build.Started)
				buildMetric := fmt.Sprintf("buildbot.builds.%s.duration", fixName(build.Builder))
				metrics.GetOrRegisterSlidingWindow(buildMetric, metrics.DEFAULT_WINDOW).Update(int64(buildTime))
				for _, step := range build.Steps {
					if !step.IsFinished() {
						continue
					}
					// app.host.measurement.measurement.builder.step.measurement*
					stepTime := step.Finished.Sub(step.Started)
					stepMetric := fmt.Sprintf("buildbot.buildstepsbybuilder.%s.%s.duration", fixName(build.Builder), fixName(step.Name))
					metrics.GetOrRegisterSlidingWindow(stepMetric, metrics.DEFAULT_WINDOW).Update(int64(stepTime))
				}
			}
		}
	}()

	// Number of commits in the repo.
	go func() {
		skiaCommits := go_metrics.GetOrRegisterGauge("repo.skia.commits", go_metrics.DefaultRegistry)
		infraCommits := go_metrics.GetOrRegisterGauge("repo.infra.commits", go_metrics.DefaultRegistry)
		for range time.Tick(5 * time.Minute) {
			skiaCommits.Update(int64(skia.NumCommits()))
			infraCommits.Update(int64(infra.NumCommits()))
		}
	}()

	// Run a backup server.
	go func() {
		glog.Fatal(buildbot.RunBackupServer(buildDB, *httpPort))
	}()

	// Wait while the above goroutines generate data.
	select {}
}