// monitorIssueTracker reads the counts for all the types of issues in the Skia
// issue tracker (code.google.com/p/skia) and stuffs the counts into Graphite.
func monitorIssueTracker() {
	c := &http.Client{
		Transport: &http.Transport{
			Dial: dialTimeout,
		},
	}
	if *useMetadata {
		*apikey = metadata.Must(metadata.ProjectGet(metadata.APIKEY))
	}

	// Create a new metrics registry for the issue tracker metrics.
	addr, err := net.ResolveTCPAddr("tcp", *graphiteServer)
	if err != nil {
		glog.Fatalln("Failed to resolve the Graphite server: ", err)
	}
	issueRegistry := metrics.NewRegistry()
	go graphite.Graphite(issueRegistry, common.SAMPLE_PERIOD, "issues", addr)

	// IssueStatus has all the info we need to capture and record a single issue status. I.e. capture
	// the count of all issues with a status of "New".
	type IssueStatus struct {
		Name   string
		Metric metrics.Gauge
		URL    string
	}

	allIssueStatusLabels := []string{
		"New", "Accepted", "Unconfirmed", "Started", "Fixed", "Verified",
		"Invalid", "WontFix", "Done", "Available", "Assigned",
	}

	issueStatus := []*IssueStatus{}
	for _, issueName := range allIssueStatusLabels {
		issueStatus = append(issueStatus, &IssueStatus{
			Name:   issueName,
			Metric: metrics.NewRegisteredGauge(strings.ToLower(issueName), issueRegistry),
			URL:    "https://www.googleapis.com/projecthosting/v2/projects/skia/issues?fields=totalResults&key=" + *apikey + "&status=" + issueName,
		})
	}

	liveness := imetrics.NewLiveness("issue-tracker")
	for _ = range time.Tick(ISSUE_TRACKER_PERIOD) {
		for _, issue := range issueStatus {
			resp, err := c.Get(issue.URL)
			if err != nil {
				// Without this check a failed request leaves resp nil and the
				// resp.Body access below panics.
				glog.Errorf("Failed to retrieve response from %s: %s", issue.URL, err)
				continue
			}
			jsonResp := map[string]int64{}
			dec := json.NewDecoder(resp.Body)
			if err := dec.Decode(&jsonResp); err != nil {
				glog.Warningf("Failed to decode JSON response: %s", err)
				util.Close(resp.Body)
				continue
			}
			issue.Metric.Update(jsonResp["totalResults"])
			glog.Infof("Num Issues: %s - %d", issue.Name, jsonResp["totalResults"])
			util.Close(resp.Body)
		}
		liveness.Update()
	}
}
// NewTraceServiceDB creates a new DB that stores the data in the BoltDB backed
// gRPC accessible traceservice.
func NewTraceServiceDB(conn *grpc.ClientConn, traceBuilder tiling.TraceBuilder) (*TsDB, error) {
	ret := &TsDB{
		conn:         conn,
		traceService: traceservice.NewTraceServiceClient(conn),
		traceBuilder: traceBuilder,
		cache:        lru.New(MAX_ID_CACHED),
		ctx:          context.Background(),
	}

	// This ping causes the client to try and reach the backend. If the backend
	// is down, it will keep trying until it's up.
	if err := ret.ping(); err != nil {
		return nil, err
	}
	go func() {
		liveness := metrics.NewLiveness("tracedb-ping")
		for _ = range time.Tick(time.Minute) {
			if ret.ping() == nil {
				liveness.Update()
			}
		}
	}()
	return ret, nil
}
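// A minimal sketch of what the ping helper used above might look like, under
// the assumption that the traceservice gRPC interface exposes a Ping RPC that
// takes an Empty message; the RPC and message names here are assumptions, not
// the confirmed API. The traceService and ctx fields are the ones set in the
// constructor above.
func (ts *TsDB) ping() error {
	// Any error (backend unreachable, connection refused, etc.) is returned to
	// the caller, which either aborts construction or skips the liveness update.
	_, err := ts.traceService.Ping(ts.ctx, &traceservice.Empty{})
	return err
}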
// NewBuilder creates a new Builder given the gitinfo, and loads Tiles from the
// traceserver running at the given address. The tiles contain the last
// 'tileSize' commits and are built from Traces of the type that traceBuilder
// returns.
func NewBuilder(git *gitinfo.GitInfo, address string, tileSize int, traceBuilder tiling.TraceBuilder) (*Builder, error) {
	conn, err := grpc.Dial(address, grpc.WithInsecure())
	if err != nil {
		return nil, fmt.Errorf("did not connect: %v", err)
	}

	// Build a tracedb.DB client.
	tracedb, err := NewTraceServiceDB(conn, traceBuilder)
	if err != nil {
		return nil, fmt.Errorf("NewTraceStore: Failed to create DB: %s", err)
	}

	ret := &Builder{
		tileSize: tileSize,
		DB:       tracedb,
		git:      git,
	}
	if err := ret.LoadTile(); err != nil {
		return nil, fmt.Errorf("NewTraceStore: Failed to load initial Tile: %s", err)
	}

	go func() {
		liveness := metrics.NewLiveness("perf-tracedb-tile-refresh")
		for _ = range time.Tick(TILE_REFRESH_DURATION) {
			if err := ret.LoadTile(); err != nil {
				glog.Errorf("Failed to refresh tile: %s", err)
			} else {
				liveness.Update()
			}
		}
	}()

	return ret, nil
}
// monitorIssueTracker reads the counts for all the types of issues in the Skia
// issue tracker (bugs.chromium.org/p/skia) and stuffs the counts into Graphite.
func monitorIssueTracker(c *http.Client) {
	// Create a new metrics registry for the issue tracker metrics.
	addr, err := net.ResolveTCPAddr("tcp", *graphiteServer)
	if err != nil {
		glog.Fatalln("Failed to resolve the Graphite server: ", err)
	}
	issueRegistry := metrics.NewRegistry()
	go graphite.Graphite(issueRegistry, common.SAMPLE_PERIOD, "issues", addr)

	// IssueStatus has all the info we need to capture and record a single issue status. I.e. capture
	// the count of all issues with a status of "New".
	type IssueStatus struct {
		Name   string
		Metric metrics.Gauge
		URL    string
	}

	allIssueStatusLabels := []string{
		"New", "Accepted", "Unconfirmed", "Started", "Fixed", "Verified",
		"Invalid", "WontFix", "Done", "Available", "Assigned",
	}

	issueStatus := []*IssueStatus{}
	for _, issueName := range allIssueStatusLabels {
		q := url.Values{}
		q.Set("fields", "totalResults")
		q.Set("status", issueName)
		issueStatus = append(issueStatus, &IssueStatus{
			Name:   issueName,
			Metric: metrics.NewRegisteredGauge(strings.ToLower(issueName), issueRegistry),
			URL:    issues.MONORAIL_BASE_URL + "?" + q.Encode(),
		})
	}

	liveness := imetrics.NewLiveness("issue-tracker")
	for _ = range time.Tick(ISSUE_TRACKER_PERIOD) {
		for _, issue := range issueStatus {
			resp, err := c.Get(issue.URL)
			if err != nil {
				glog.Errorf("Failed to retrieve response from %s: %s", issue.URL, err)
				continue
			}
			jsonResp := map[string]int64{}
			dec := json.NewDecoder(resp.Body)
			if err := dec.Decode(&jsonResp); err != nil {
				glog.Warningf("Failed to decode JSON response: %s", err)
				util.Close(resp.Body)
				continue
			}
			issue.Metric.Update(jsonResp["totalResults"])
			glog.Infof("Num Issues: %s - %d", issue.Name, jsonResp["totalResults"])
			if err == nil && resp.Body != nil {
				util.Close(resp.Body)
			}
		}
		liveness.Update()
	}
}
func main() {
	defer common.LogPanic()

	// Setup flags.
	dbConf := buildbot.DBConfigFromFlags()

	// Global init.
	common.InitWithMetrics(APP_NAME, graphiteServer)

	// Parse the time period.
	period, err := human.ParseDuration(*timePeriod)
	if err != nil {
		glog.Fatal(err)
	}

	// Initialize the buildbot database.
	if !*local {
		if err := dbConf.GetPasswordFromMetadata(); err != nil {
			glog.Fatal(err)
		}
	}
	if err := dbConf.InitDB(); err != nil {
		glog.Fatal(err)
	}

	// Initialize the BuildBucket client.
	c, err := auth.NewClient(*local, path.Join(*workdir, "oauth_token_cache"), buildbucket.DEFAULT_SCOPES...)
	if err != nil {
		glog.Fatal(err)
	}
	bb := buildbucket.NewClient(c)

	// Build the queue.
	repos := gitinfo.NewRepoMap(*workdir)
	for _, r := range REPOS {
		if _, err := repos.Repo(r); err != nil {
			glog.Fatal(err)
		}
	}
	q, err := build_queue.NewBuildQueue(period, repos, *scoreThreshold, *scoreDecay24Hr, BOT_BLACKLIST)
	if err != nil {
		glog.Fatal(err)
	}

	// Start scheduling builds in a loop.
	liveness := metrics.NewLiveness(APP_NAME)
	if err := scheduleBuilds(q, bb); err != nil {
		glog.Errorf("Failed to schedule builds: %v", err)
	}
	for _ = range time.Tick(time.Minute) {
		liveness.Update()
		if err := scheduleBuilds(q, bb); err != nil {
			glog.Errorf("Failed to schedule builds: %v", err)
		}
	}
}
// IngestNewBuildsLoop continually ingests new builds.
func IngestNewBuildsLoop(workdir string) {
	lv := metrics.NewLiveness("buildbot-ingest")
	repos := gitinfo.NewRepoMap(workdir)
	for _ = range time.Tick(30 * time.Second) {
		glog.Info("Ingesting builds.")
		if err := ingestNewBuilds(repos); err != nil {
			glog.Errorf("Failed to ingest new builds: %v", err)
		} else {
			lv.Update()
		}
	}
}
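// The loop above is an instance of a pattern that recurs throughout these
// snippets: run a step on a ticker and update a liveness metric only when the
// step succeeds, so the metric reflects the time since the last successful
// run. What follows is a minimal sketch of that pattern pulled into a reusable
// helper; the helper name and signature are hypothetical, and it assumes only
// the metrics.NewLiveness(name).Update() API and glog logging already used in
// the surrounding snippets.
func repeatWithLiveness(name string, period time.Duration, step func() error) {
	lv := metrics.NewLiveness(name)
	for _ = range time.Tick(period) {
		if err := step(); err != nil {
			// Leave the liveness metric stale on failure so alerts can fire on
			// "time since last success".
			glog.Errorf("%s: step failed: %s", name, err)
		} else {
			lv.Update()
		}
	}
}

// With such a helper, IngestNewBuildsLoop above could be written as:
//
//	repos := gitinfo.NewRepoMap(workdir)
//	repeatWithLiveness("buildbot-ingest", 30*time.Second, func() error {
//		return ingestNewBuilds(repos)
//	})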
// newSourceMetrics instantiates a set of metrics for an input source.
func newSourceMetrics(id string, sources []Source) []*sourceMetrics {
	ret := make([]*sourceMetrics, len(sources))
	for idx, source := range sources {
		prefix := fmt.Sprintf("%s.%s", id, source.ID())
		ret[idx] = &sourceMetrics{
			liveness:       smetrics.NewLiveness(prefix + ".poll-liveness"),
			pollTimer:      metrics.NewRegisteredTimer(prefix+".poll-timer", metrics.DefaultRegistry),
			pollError:      metrics.NewRegisteredGauge(prefix+".poll-error", metrics.DefaultRegistry),
			eventsReceived: metrics.NewRegisteredMeter(prefix+".events-received", metrics.DefaultRegistry),
		}
	}
	return ret
}
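// A hedged sketch of how the per-source metrics built above might be fed from
// a poll loop. The pollSources function and the Poll method on Source are
// hypothetical illustrations; the metric methods used (UpdateSince, Update,
// Mark, and the liveness Update) are the standard go-metrics Timer/Gauge/Meter
// methods implied by the constructors in newSourceMetrics.
func pollSources(sources []Source, sm []*sourceMetrics) {
	for idx, source := range sources {
		m := sm[idx]
		started := time.Now()
		events, err := source.Poll() // hypothetical polling method
		m.pollTimer.UpdateSince(started)
		if err != nil {
			// Record the failure and leave the liveness metric stale.
			m.pollError.Update(1)
			continue
		}
		m.pollError.Update(0)
		m.eventsReceived.Mark(int64(len(events)))
		m.liveness.Update()
	}
}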
func main() {
	defer common.LogPanic()
	common.InitWithMetrics("probeserver", graphiteServer)

	client, err := auth.NewDefaultJWTServiceAccountClient("https://www.googleapis.com/auth/userinfo.email")
	if err != nil {
		glog.Fatalf("Failed to create client for talking to the issue tracker: %s", err)
	}
	go monitorIssueTracker(client)

	glog.Infoln("Looking for Graphite server.")
	addr, err := net.ResolveTCPAddr("tcp", *graphiteServer)
	if err != nil {
		glog.Fatalln("Failed to resolve the Graphite server: ", err)
	}
	glog.Infoln("Found Graphite server.")

	liveness := imetrics.NewLiveness("probes")

	// We have two sets of metrics, one for the probes and one for the probe
	// server itself. The server's metrics are handled by common.Init().
	probeRegistry := metrics.NewRegistry()
	go graphite.Graphite(probeRegistry, common.SAMPLE_PERIOD, *prefix, addr)

	// TODO(jcgregorio) Monitor config file and reload if it changes.
	cfg, err := readConfigFiles(*config)
	if err != nil {
		glog.Fatalln("Failed to read config file: ", err)
	}
	glog.Infoln("Successfully read config file.")

	// Register counters for each probe.
	for name, probe := range cfg {
		probe.failure = metrics.NewRegisteredGauge(name+".failure", probeRegistry)
		probe.latency = metrics.NewRegisteredGauge(name+".latency", probeRegistry)
	}

	// Create a client that uses our dialer with a timeout.
	c := &http.Client{
		Transport: &http.Transport{
			Dial: dialTimeout,
		},
	}
	probeOneRound(cfg, c)
	for _ = range time.Tick(*runEvery) {
		probeOneRound(cfg, c)
		liveness.Update()
	}
}
// IngestNewBuildsLoop continually ingests new builds.
func IngestNewBuildsLoop(workdir string) {
	repos := gitinfo.NewRepoMap(workdir)
	var wg sync.WaitGroup
	for _, m := range MASTER_NAMES {
		// Add before launching the worker; without this, Wait() returns
		// immediately and Done() would panic on a zero counter.
		wg.Add(1)
		go func(master string) {
			defer wg.Done()
			lv := metrics.NewLiveness(fmt.Sprintf("buildbot-ingest-%s", master))
			for _ = range time.Tick(30 * time.Second) {
				if err := ingestNewBuilds(master, repos); err != nil {
					glog.Errorf("Failed to ingest new builds: %v", err)
				} else {
					lv.Update()
				}
			}
		}(m)
	}
	wg.Wait()
}
// NewTraceServiceDB creates a new DB that stores the data in the BoltDB backed
// gRPC accessible traceservice.
func NewTraceServiceDB(conn *grpc.ClientConn, traceBuilder tiling.TraceBuilder) (*TsDB, error) {
	ret := &TsDB{
		conn:         conn,
		traceService: traceservice.NewTraceServiceClient(conn),
		traceBuilder: traceBuilder,
		cache:        lru.New(MAX_ID_CACHED),
		paramsCache:  map[string]map[string]string{},
		id64Cache:    map[uint64]string{},
		ctx:          context.Background(),
	}

	// This ping causes the client to try and reach the backend. If the backend
	// is down, it will keep trying until it's up.
	if err := ret.ping(); err != nil {
		return nil, err
	}

	// Liveness metric.
	go func() {
		liveness := metrics.NewLiveness("tracedb-ping")
		for _ = range time.Tick(time.Minute) {
			if ret.ping() == nil {
				liveness.Update()
			}
		}
	}()

	// Keep the cache sizes in check.
	go func() {
		for _ = range time.Tick(15 * time.Minute) {
			ret.clearMutex.Lock()
			if len(ret.paramsCache) > MAX_ID_CACHED {
				ret.paramsCache = map[string]map[string]string{}
				glog.Warning("Had to clear paramsCache, this is unexpected. MAX_ID_CACHED too small?")
			}
			if len(ret.id64Cache) > MAX_ID_CACHED {
				ret.id64Cache = map[uint64]string{}
				glog.Warning("Had to clear id64Cache, this is unexpected. MAX_ID_CACHED too small?")
			}
			ret.clearMutex.Unlock()
		}
	}()
	return ret, nil
}
func (s *StatusWatcher) calcAndWatchStatus() error {
	expChanges := make(chan []string)
	s.storages.EventBus.SubscribeAsync(expstorage.EV_EXPSTORAGE_CHANGED, func(e interface{}) {
		expChanges <- e.([]string)
	})

	tileStream := s.storages.GetTileStreamNow(2*time.Minute, false)
	lastTile := <-tileStream
	if err := s.calcStatus(lastTile); err != nil {
		return err
	}

	liveness := imetrics.NewLiveness("status-monitoring")
	go func() {
		for {
			select {
			case <-tileStream:
				tile, err := s.storages.GetLastTileTrimmed(false)
				if err != nil {
					glog.Errorf("Error retrieving tile: %s", err)
					continue
				}
				if err := s.calcStatus(tile); err != nil {
					glog.Errorf("Error calculating status: %s", err)
				} else {
					lastTile = tile
					liveness.Update()
				}
			case <-expChanges:
				storage.DrainChangeChannel(expChanges)
				if err := s.calcStatus(lastTile); err != nil {
					glog.Errorf("Error calculating tile after expectation update: %s", err)
				}
				liveness.Update()
			}
		}
	}()

	return nil
}
// Init starts a new monitoring routine for the given ignore store that counts
// expired ignore rules and pushes that info into a metric.
func Init(store IgnoreStore) error {
	numExpired := metrics.NewRegisteredGauge("num-expired-ignore-rules", metrics.DefaultRegistry)
	liveness := imetrics.NewLiveness("expired-ignore-rules-monitoring")

	err := oneStep(store, numExpired)
	if err != nil {
		return fmt.Errorf("Unable to start monitoring ignore rules: %s", err)
	}

	go func() {
		for _ = range time.Tick(time.Minute) {
			err = oneStep(store, numExpired)
			if err != nil {
				glog.Errorf("Failed one step of monitoring ignore rules: %s", err)
				continue
			}
			liveness.Update()
		}
	}()

	return nil
}
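// A hedged sketch of what the oneStep helper called above might do: count the
// ignore rules whose expiry is in the past and push that count into the gauge.
// It assumes IgnoreStore exposes a List() of rules carrying an Expires
// timestamp; those names are illustrative assumptions, not the confirmed
// interface.
func oneStep(store IgnoreStore, numExpired metrics.Gauge) error {
	rules, err := store.List() // assumed accessor for all ignore rules
	if err != nil {
		return err
	}
	count := int64(0)
	now := time.Now()
	for _, rule := range rules {
		if rule.Expires.Before(now) {
			count++
		}
	}
	numExpired.Update(count)
	return nil
}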
func (h *historian) start() error {
	expChanges := make(chan []string)
	h.storages.EventBus.SubscribeAsync(expstorage.EV_EXPSTORAGE_CHANGED, func(e interface{}) {
		expChanges <- e.([]string)
	})

	tileStream := h.storages.GetTileStreamNow(2*time.Minute, true)
	lastTile := <-tileStream
	if err := h.updateDigestInfo(lastTile); err != nil {
		return err
	}

	liveness := metrics.NewLiveness("digest-history-monitoring")

	// Keep processing tiles and feed them into the process channel.
	go func() {
		for {
			select {
			case tile := <-tileStream:
				if err := h.updateDigestInfo(tile); err != nil {
					glog.Errorf("Error calculating status: %s", err)
					continue
				} else {
					lastTile = tile
				}
			case <-expChanges:
				storage.DrainChangeChannel(expChanges)
				if err := h.updateDigestInfo(lastTile); err != nil {
					glog.Errorf("Error calculating tile after expectation update: %s", err)
					continue
				}
			}
			liveness.Update()
		}
	}()

	return nil
}
// IngestNewBuildsLoop continually ingests new builds.
func IngestNewBuildsLoop(db DB, workdir string) error {
	if _, ok := db.(*localDB); !ok {
		return fmt.Errorf("Can only ingest builds with a local DB instance.")
	}
	repos := gitinfo.NewRepoMap(workdir)
	go func() {
		var wg sync.WaitGroup
		for _, m := range MASTER_NAMES {
			// Add before launching the worker so Wait() tracks every ingester.
			wg.Add(1)
			go func(master string) {
				defer wg.Done()
				lv := metrics.NewLiveness(fmt.Sprintf("buildbot-ingest-%s", master))
				for _ = range time.Tick(10 * time.Second) {
					if err := ingestNewBuilds(db.(*localDB), master, repos); err != nil {
						glog.Errorf("Failed to ingest new builds: %s", err)
					} else {
						lv.Update()
					}
				}
			}(m)
		}
		wg.Wait()
	}()
	return nil
}
// NewMasterTileBuilder creates a new MasterTileBuilder given the gitinfo, and
// loads Tiles from the given DB. The tiles contain the last 'tileSize'
// commits.
func NewMasterTileBuilder(db DB, git *gitinfo.GitInfo, tileSize int, evt *eventbus.EventBus) (MasterTileBuilder, error) {
	ret := &masterTileBuilder{
		tileSize: tileSize,
		db:       db,
		git:      git,
		evt:      evt,
	}
	if err := ret.LoadTile(); err != nil {
		return nil, fmt.Errorf("NewTraceStore: Failed to load initial Tile: %s", err)
	}

	evt.Publish(NEW_TILE_AVAILABLE_EVENT, ret.GetTile())
	go func() {
		liveness := metrics.NewLiveness("perf-tracedb-tile-refresh")
		for _ = range time.Tick(TILE_REFRESH_DURATION) {
			if err := ret.LoadTile(); err != nil {
				glog.Errorf("Failed to refresh tile: %s", err)
			} else {
				liveness.Update()
				evt.Publish(NEW_TILE_AVAILABLE_EVENT, ret.GetTile())
			}
		}
	}()

	return ret, nil
}
	local          = flag.Bool("local", false, "Running locally if true. As opposed to in production.")
	oauthCacheFile = flag.String("oauth_cache_file", "", "Path to the OAuth credential cache file.")
	targetList     = flag.String("targets", "", "The targets to monitor, a space separated list.")
	codenameDbDir  = flag.String("codename_db_dir", "codenames", "The location of the leveldb database that holds the mappings between targets and their codenames.")
	period         = flag.Duration("period", 5*time.Minute, "The time between ingestion runs.")
)

var (
	// terminal_build_status are the tradefed build statuses that mean the build is done.
	terminal_build_status = []string{"complete", "error"}

	// codenameDB is a leveldb to store codenames and their deobfuscated counterparts.
	codenameDB *leveldb.DB

	// liveness is a metric for the time since the last successful run through step().
	liveness = skmetics.NewLiveness("android_internal_ingest")
)

// isFinished returns true if the Build has finished running.
func isFinished(b *androidbuildinternal.Build) bool {
	return util.In(b.BuildAttemptStatus, terminal_build_status)
}

// buildFromCommit builds a buildbot.Build from the commit and the info
// returned from the Apiary API. It also returns a key that uniquely identifies
// this build.
func buildFromCommit(build *androidbuildinternal.Build, commit *vcsinfo.ShortCommit) (string, *buildbot.Build) {
	codename := util.StringToCodeName(build.Target.Name)
	key := build.Branch + ":" + build.Target.Name + ":" + build.BuildId
	b := &buildbot.Build{
		Builder: codename,