func startMetrics(appName, graphiteServer string) { if graphiteServer == "" { glog.Warningf("No metrics server specified.") return } addr, err := net.ResolveTCPAddr("tcp", graphiteServer) if err != nil { glog.Fatalf("Unable to resolve metrics server address: %s", err) } // Get the hostname and create the app-prefix. hostName, err := os.Hostname() if err != nil { glog.Fatalf("Unable to retrieve hostname: %s", err) } appPrefix := fmt.Sprintf("%s.%s", appName, strings.Replace(hostName, ".", "-", -1)) // Runtime metrics. metrics.RegisterRuntimeMemStats(metrics.DefaultRegistry) go metrics.CaptureRuntimeMemStats(metrics.DefaultRegistry, SAMPLE_PERIOD) go graphite.Graphite(metrics.DefaultRegistry, SAMPLE_PERIOD, appPrefix, addr) // Uptime. uptimeGuage := metrics.GetOrRegisterGaugeFloat64("uptime", metrics.DefaultRegistry) go func() { startTime := time.Now() uptimeGuage.Update(0) for _ = range time.Tick(SAMPLE_PERIOD) { uptimeGuage.Update(time.Since(startTime).Seconds()) } }() }
// startCtfeMetrics registers gauges with the graphite server that indicate CT is running healthily // and starts a goroutine to update them periodically. func startCtfeMetrics() { pendingTasksGauge := metrics.GetOrRegisterGauge("num-pending-tasks", metrics.DefaultRegistry) oldestPendingTaskAgeGauge := metrics.GetOrRegisterGaugeFloat64("oldest-pending-task-age", metrics.DefaultRegistry) // 0=no tasks pending; 1=started; 2=not started oldestPendingTaskStatusGauge := metrics.GetOrRegisterGauge("oldest-pending-task-status", metrics.DefaultRegistry) go func() { for _ = range time.Tick(common.SAMPLE_PERIOD) { pendingTaskCount, err := pending_tasks.GetPendingTaskCount() if err != nil { glog.Error(err) } else { pendingTasksGauge.Update(pendingTaskCount) } oldestPendingTask, err := pending_tasks.GetOldestPendingTask() if err != nil { glog.Error(err) } else if oldestPendingTask == nil { oldestPendingTaskAgeGauge.Update(0) oldestPendingTaskStatusGauge.Update(0) } else { addedTime := ctutil.GetTimeFromTs(strconv.FormatInt(oldestPendingTask.GetCommonCols().TsAdded.Int64, 10)) oldestPendingTaskAgeGauge.Update(time.Since(addedTime).Seconds()) if oldestPendingTask.GetCommonCols().TsStarted.Valid { oldestPendingTaskStatusGauge.Update(1) } else { oldestPendingTaskStatusGauge.Update(2) } } } }() }
// StartMetrics registers gauges with the graphite server that indicate the poller is running // healthily and starts a goroutine to update them periodically. func (h *heartbeatStatusTracker) StartMetrics() { timeSinceLastUpdateGauge := metrics.GetOrRegisterGaugeFloat64("time-since-last-update", metrics.DefaultRegistry) healthyGauge := metrics.GetOrRegisterGauge("healthy", metrics.DefaultRegistry) timeSinceLastSuccess := make(map[TaskType]metrics.GaugeFloat64) timeSinceLastFailure := make(map[TaskType]metrics.GaugeFloat64) for t := UPDATE_AND_BUILD; t <= POLL; t++ { timeSinceLastSuccess[t] = metrics.GetOrRegisterGaugeFloat64(fmt.Sprintf("time-since-last-success-%s", t), metrics.DefaultRegistry) timeSinceLastFailure[t] = metrics.GetOrRegisterGaugeFloat64(fmt.Sprintf("time-since-last-failure-%s", t), metrics.DefaultRegistry) } go func() { for _ = range time.Tick(common.SAMPLE_PERIOD) { h.mu.Lock() timeSinceLastUpdate := time.Since(h.lastUpdate) currentStatus := h.currentStatus for t := UPDATE_AND_BUILD; t <= POLL; t++ { if v, ok := h.lastSuccessTime[t]; ok { timeSinceLastSuccess[t].Update(time.Since(v).Seconds()) } if v, ok := h.lastFailureTime[t]; ok { timeSinceLastFailure[t].Update(time.Since(v).Seconds()) } } lastSuccessfulPoll := h.lastSuccessTime[POLL] errs := h.errs h.errs = nil h.mu.Unlock() timeSinceLastUpdateGauge.Update(timeSinceLastUpdate.Seconds()) expectPoll := false var expectedDuration time.Duration = 0 switch currentStatus { case IDLE, POLL: expectPoll = true expectedDuration = *pollInterval case UPDATE_AND_BUILD: expectedDuration = ctutil.GIT_PULL_TIMEOUT + ctutil.MAKE_ALL_TIMEOUT case CHROMIUM_PERF: expectedDuration = ctutil.MASTER_SCRIPT_RUN_CHROMIUM_PERF_TIMEOUT case CAPTURE_SKPS: expectedDuration = ctutil.MASTER_SCRIPT_CAPTURE_SKPS_TIMEOUT case LUA_SCRIPT: expectedDuration = ctutil.MASTER_SCRIPT_RUN_LUA_TIMEOUT case CHROMIUM_BUILD: expectedDuration = ctutil.MASTER_SCRIPT_BUILD_CHROMIUM_TIMEOUT case RECREATE_PAGE_SETS: expectedDuration = ctutil.MASTER_SCRIPT_CREATE_PAGESETS_TIMEOUT case RECREATE_WEBPAGE_ARCHIVES: expectedDuration = ctutil.MASTER_SCRIPT_CAPTURE_ARCHIVES_TIMEOUT case CHECK_WORKER_HEALTH: expectedDuration = ctutil.CHECK_WORKERS_HEALTH_TIMEOUT } // Provide a bit of head room. expectedDuration += 2 * time.Minute if expectPoll && time.Since(lastSuccessfulPoll) > 2*time.Minute { errs = append(errs, fmt.Errorf("Last successful poll was at %s.", lastSuccessfulPoll)) } if timeSinceLastUpdate > expectedDuration { errs = append(errs, fmt.Errorf("Task %s has not finished after %s.", currentStatus, timeSinceLastUpdate)) } if len(errs) > 0 { for _, err := range errs { glog.Error(err) } healthyGauge.Update(0) } else { healthyGauge.Update(1) } } }() }