Beispiel #1
0
// startMetrics hooks the process up to a Graphite server.
//
// An empty graphiteServer logs a warning and returns without doing anything.
// Otherwise the address is resolved as TCP and a metric prefix of the form
// "<appName>.<hostname>" is built, with every "." in the hostname replaced by
// "-". Failing to resolve the address or to read the hostname is reported via
// glog.Fatalf.
//
// Three goroutines are started: metrics.CaptureRuntimeMemStats and
// graphite.Graphite, both on the default registry with SAMPLE_PERIOD, and a
// loop that sets the "uptime" gauge to the seconds elapsed since that loop
// began, once per SAMPLE_PERIOD tick.
func startMetrics(appName, graphiteServer string) {
	if len(graphiteServer) == 0 {
		glog.Warningf("No metrics server specified.")
		return
	}

	serverAddr, resolveErr := net.ResolveTCPAddr("tcp", graphiteServer)
	if resolveErr != nil {
		glog.Fatalf("Unable to resolve metrics server address: %s", resolveErr)
	}

	// Build the per-host prefix; dots are swapped out so the hostname stays a
	// single dot-separated component of the metric name.
	host, hostErr := os.Hostname()
	if hostErr != nil {
		glog.Fatalf("Unable to retrieve hostname: %s", hostErr)
	}
	sanitizedHost := strings.Replace(host, ".", "-", -1)
	prefix := fmt.Sprintf("%s.%s", appName, sanitizedHost)

	// Runtime memory stats plus the Graphite exporter.
	registry := metrics.DefaultRegistry
	metrics.RegisterRuntimeMemStats(registry)
	go metrics.CaptureRuntimeMemStats(registry, SAMPLE_PERIOD)
	go graphite.Graphite(registry, SAMPLE_PERIOD, prefix, serverAddr)

	// Uptime, measured from the moment the goroutine below starts running.
	uptime := metrics.GetOrRegisterGaugeFloat64("uptime", registry)
	go func() {
		began := time.Now()
		uptime.Update(0)
		for range time.Tick(SAMPLE_PERIOD) {
			uptime.Update(time.Since(began).Seconds())
		}
	}()
}
Beispiel #2
0
// startCtfeMetrics registers three gauges in the default metrics registry and
// launches a goroutine that refreshes them on every common.SAMPLE_PERIOD tick.
//
// "num-pending-tasks" receives the value returned by
// pending_tasks.GetPendingTaskCount. "oldest-pending-task-age" receives the
// seconds elapsed since the oldest pending task's TsAdded timestamp, or 0 when
// there is no pending task. "oldest-pending-task-status" is 0 when no task is
// pending, 1 when the oldest task has a valid TsStarted, and 2 otherwise.
//
// If either lookup fails the error is logged and the gauges it feeds are left
// untouched for that tick.
func startCtfeMetrics() {
	registry := metrics.DefaultRegistry
	countGauge := metrics.GetOrRegisterGauge("num-pending-tasks", registry)
	ageGauge := metrics.GetOrRegisterGaugeFloat64("oldest-pending-task-age", registry)
	// 0=no tasks pending; 1=started; 2=not started
	statusGauge := metrics.GetOrRegisterGauge("oldest-pending-task-status", registry)

	// refresh performs one sampling pass over the pending-task queue.
	refresh := func() {
		if count, err := pending_tasks.GetPendingTaskCount(); err != nil {
			glog.Error(err)
		} else {
			countGauge.Update(count)
		}

		oldest, err := pending_tasks.GetOldestPendingTask()
		if err != nil {
			glog.Error(err)
			return
		}
		if oldest == nil {
			// Empty queue: zero out both the age and the status.
			ageGauge.Update(0)
			statusGauge.Update(0)
			return
		}

		// TsAdded is stored as an integer; GetTimeFromTs takes its decimal
		// string form.
		tsAdded := strconv.FormatInt(oldest.GetCommonCols().TsAdded.Int64, 10)
		added := ctutil.GetTimeFromTs(tsAdded)
		ageGauge.Update(time.Since(added).Seconds())

		if !oldest.GetCommonCols().TsStarted.Valid {
			statusGauge.Update(2)
			return
		}
		statusGauge.Update(1)
	}

	go func() {
		for range time.Tick(common.SAMPLE_PERIOD) {
			refresh()
		}
	}()
}
Beispiel #3
0
// StartMetrics registers the poller's health gauges in the default metrics
// registry and spawns a goroutine that refreshes them on every
// common.SAMPLE_PERIOD tick.
//
// Gauges maintained:
// "time-since-last-update" is the seconds elapsed since h.lastUpdate.
// "healthy" is 1 when a tick finds no errors and 0 otherwise.
// "time-since-last-success-<task>" and "time-since-last-failure-<task>" exist
// for every task type from UPDATE_AND_BUILD through POLL and are only updated
// once a corresponding time has been recorded for that task type.
//
// Each tick drains h.errs under h.mu. Those errors, together with any
// staleness errors detected during the tick, are logged and cause "healthy"
// to be set to 0.
func (h *heartbeatStatusTracker) StartMetrics() {
	registry := metrics.DefaultRegistry
	sinceUpdateGauge := metrics.GetOrRegisterGaugeFloat64("time-since-last-update", registry)
	healthyGauge := metrics.GetOrRegisterGauge("healthy", registry)

	successAge := make(map[TaskType]metrics.GaugeFloat64)
	failureAge := make(map[TaskType]metrics.GaugeFloat64)
	for task := UPDATE_AND_BUILD; task <= POLL; task++ {
		successName := fmt.Sprintf("time-since-last-success-%s", task)
		successAge[task] = metrics.GetOrRegisterGaugeFloat64(successName, registry)
		failureName := fmt.Sprintf("time-since-last-failure-%s", task)
		failureAge[task] = metrics.GetOrRegisterGaugeFloat64(failureName, registry)
	}

	go func() {
		for range time.Tick(common.SAMPLE_PERIOD) {
			// Snapshot everything that lives behind the mutex, and take
			// ownership of the accumulated errors, before doing any analysis.
			h.mu.Lock()
			sinceUpdate := time.Since(h.lastUpdate)
			status := h.currentStatus
			for task := UPDATE_AND_BUILD; task <= POLL; task++ {
				if at, found := h.lastSuccessTime[task]; found {
					successAge[task].Update(time.Since(at).Seconds())
				}
				if at, found := h.lastFailureTime[task]; found {
					failureAge[task].Update(time.Since(at).Seconds())
				}
			}
			lastPollSuccess := h.lastSuccessTime[POLL]
			problems := h.errs
			h.errs = nil
			h.mu.Unlock()

			sinceUpdateGauge.Update(sinceUpdate.Seconds())

			// Work out how long the current status may legitimately last and
			// whether a recent successful poll is expected while in it.
			var (
				pollExpected bool
				allowed      time.Duration
			)
			switch status {
			case IDLE, POLL:
				pollExpected = true
				allowed = *pollInterval
			case UPDATE_AND_BUILD:
				allowed = ctutil.GIT_PULL_TIMEOUT + ctutil.MAKE_ALL_TIMEOUT
			case CHROMIUM_PERF:
				allowed = ctutil.MASTER_SCRIPT_RUN_CHROMIUM_PERF_TIMEOUT
			case CAPTURE_SKPS:
				allowed = ctutil.MASTER_SCRIPT_CAPTURE_SKPS_TIMEOUT
			case LUA_SCRIPT:
				allowed = ctutil.MASTER_SCRIPT_RUN_LUA_TIMEOUT
			case CHROMIUM_BUILD:
				allowed = ctutil.MASTER_SCRIPT_BUILD_CHROMIUM_TIMEOUT
			case RECREATE_PAGE_SETS:
				allowed = ctutil.MASTER_SCRIPT_CREATE_PAGESETS_TIMEOUT
			case RECREATE_WEBPAGE_ARCHIVES:
				allowed = ctutil.MASTER_SCRIPT_CAPTURE_ARCHIVES_TIMEOUT
			case CHECK_WORKER_HEALTH:
				allowed = ctutil.CHECK_WORKERS_HEALTH_TIMEOUT
			}
			// Provide a bit of head room.
			allowed += 2 * time.Minute

			if pollExpected && time.Since(lastPollSuccess) > 2*time.Minute {
				problems = append(problems, fmt.Errorf("Last successful poll was at %s.", lastPollSuccess))
			}
			if sinceUpdate > allowed {
				problems = append(problems, fmt.Errorf("Task %s has not finished after %s.", status, sinceUpdate))
			}

			if len(problems) == 0 {
				healthyGauge.Update(1)
				continue
			}
			for _, problem := range problems {
				glog.Error(problem)
			}
			healthyGauge.Update(0)
		}
	}()
}