// MasterAlerts returns alerts generated from the master.
func (a *Analyzer) MasterAlerts(master string, be *messages.BuildExtract) []messages.Alert {
	ret := []messages.Alert{}

	// Copied logic from builder_messages.
	// No created_timestamp should be a warning sign, no?
	if be.CreatedTimestamp == messages.EpochTime(0) {
		return ret
	}

	elapsed := a.Now().Sub(be.CreatedTimestamp.Time())
	if elapsed > a.StaleMasterThreshold {
		ret = append(ret, messages.Alert{
			Key:       fmt.Sprintf("stale master: %v", master),
			Title:     fmt.Sprintf("Stale %s master data", master),
			Body:      fmt.Sprintf("%s elapsed since last update.", elapsed),
			StartTime: messages.TimeToEpochTime(be.CreatedTimestamp.Time()),
			Severity:  staleMasterSev,
			Time:      messages.TimeToEpochTime(a.Now()),
			Links:     []messages.Link{{"Master", client.MasterURL(master)}},
			// No type or extension for now.
		})
	}
	if elapsed < 0 {
		// Add this to the alerts returned, rather than just log it?
		log.Errorf("Master %s timestamp is newer than current time (%s): %s old.", master, a.Now(), elapsed)
	}

	return ret
}
// latestBuildStep returns the name and last update time of the most recently
// updated step in b, or an error if the build has no steps. If the build has
// already completed, it returns StepCompletedRun and the build's end time.
func (a *Analyzer) latestBuildStep(b *messages.Build) (lastStep string, lastUpdate messages.EpochTime, err error) {
	if len(b.Steps) == 0 {
		return "", messages.TimeToEpochTime(a.Now()), errNoBuildSteps
	}

	// A non-zero end time in b.Times means the whole build is done.
	if len(b.Times) > 1 && b.Times[1] != 0 {
		return StepCompletedRun, b.Times[1], nil
	}

	for _, step := range b.Steps {
		if len(step.Times) > 1 && step.Times[1] > lastUpdate {
			// Step is done.
			lastUpdate = step.Times[1]
			lastStep = step.Name
		} else if len(step.Times) > 0 && step.Times[0] > lastUpdate {
			// Step has started.
			lastUpdate = step.Times[0]
			lastStep = step.Name
		}
	}
	return
}
func TestLatestBuildStep(t *testing.T) {
	tests := []struct {
		name       string
		b          messages.Build
		wantStep   string
		wantUpdate messages.EpochTime
		wantErr    error
	}{
		{
			name:    "blank",
			wantErr: errNoBuildSteps,
		},
		{
			name: "done time is latest",
			b: messages.Build{
				Steps: []messages.Step{
					{
						Name: "done step",
						Times: []messages.EpochTime{
							messages.TimeToEpochTime(time.Unix(6, 0)),
							messages.TimeToEpochTime(time.Unix(42, 0)),
						},
					},
				},
			},
			wantStep:   "done step",
			wantUpdate: messages.TimeToEpochTime(time.Unix(42, 0)),
		},
		{
			name: "started time is latest",
			b: messages.Build{
				Steps: []messages.Step{
					{
						Name: "start step",
						Times: []messages.EpochTime{
							messages.TimeToEpochTime(time.Unix(42, 0)),
							messages.TimeToEpochTime(time.Unix(0, 0)),
						},
					},
				},
			},
			wantStep:   "start step",
			wantUpdate: messages.TimeToEpochTime(time.Unix(42, 0)),
		},
		{
			name: "started time is latest, multiple steps",
			b: messages.Build{
				Steps: []messages.Step{
					{
						Name: "start step",
						Times: []messages.EpochTime{
							messages.TimeToEpochTime(time.Unix(6, 0)),
							messages.TimeToEpochTime(time.Unix(7, 0)),
						},
					},
					{
						Name: "second step",
						Times: []messages.EpochTime{
							messages.TimeToEpochTime(time.Unix(42, 0)),
							messages.TimeToEpochTime(time.Unix(0, 0)),
						},
					},
				},
			},
			wantStep:   "second step",
			wantUpdate: messages.TimeToEpochTime(time.Unix(42, 0)),
		},
		{
			name: "done time is latest, multiple steps",
			b: messages.Build{
				Steps: []messages.Step{
					{
						Name: "start step",
						Times: []messages.EpochTime{
							messages.TimeToEpochTime(time.Unix(0, 0)),
							messages.TimeToEpochTime(time.Unix(6, 0)),
						},
					},
					{
						Name: "second step",
						Times: []messages.EpochTime{
							messages.TimeToEpochTime(time.Unix(7, 0)),
							messages.TimeToEpochTime(time.Unix(42, 0)),
						},
					},
				},
			},
			wantStep:   "second step",
			wantUpdate: messages.TimeToEpochTime(time.Unix(42, 0)),
		},
		{
			name: "build is done",
			b: messages.Build{
				Times: []messages.EpochTime{
					messages.TimeToEpochTime(time.Unix(0, 0)),
					messages.TimeToEpochTime(time.Unix(42, 0)),
				},
				Steps: []messages.Step{
					{
						Name: "start step",
						Times: []messages.EpochTime{
							messages.TimeToEpochTime(time.Unix(0, 0)),
							messages.TimeToEpochTime(time.Unix(0, 0)),
						},
					},
					{
						Name: "second step",
						Times: []messages.EpochTime{
							messages.TimeToEpochTime(time.Unix(6, 0)),
							messages.TimeToEpochTime(time.Unix(7, 0)),
						},
					},
				},
			},
			wantStep:   StepCompletedRun,
			wantUpdate: messages.TimeToEpochTime(time.Unix(42, 0)),
		},
	}

	a := New(&mockReader{}, 0, 10)
	a.now = fakeNow(time.Unix(0, 0))

	for _, test := range tests {
		gotStep, gotUpdate, gotErr := a.latestBuildStep(&test.b)
		if gotStep != test.wantStep {
			t.Errorf("%s failed. Got %q, want %q.", test.name, gotStep, test.wantStep)
		}
		if gotUpdate != test.wantUpdate {
			t.Errorf("%s failed. Got %v, want %v.", test.name, gotUpdate, test.wantUpdate)
		}
		if gotErr != test.wantErr {
			t.Errorf("%s failed. Got %s, want %s.", test.name, gotErr, test.wantErr)
		}
	}
}
func TestMasterAlerts(t *testing.T) {
	tests := []struct {
		name   string
		master string
		be     messages.BuildExtract
		t      time.Time
		want   []messages.Alert
	}{
		{
			name:   "empty",
			master: "fake-empty",
			want:   []messages.Alert{},
		},
		{
			name:   "Not stale master",
			master: "fake-not-stale",
			be: messages.BuildExtract{
				CreatedTimestamp: messages.EpochTime(100),
			},
			t:    time.Unix(100, 0),
			want: []messages.Alert{},
		},
		{
			name:   "Stale master",
			master: "fake.master",
			be: messages.BuildExtract{
				CreatedTimestamp: messages.EpochTime(100),
			},
			t: time.Unix(100, 0).Add(20 * time.Minute),
			want: []messages.Alert{
				{
					Key:       "stale master: fake.master",
					Title:     "Stale fake.master master data",
					Body:      fmt.Sprintf("%s elapsed since last update.", 20*time.Minute),
					Time:      messages.TimeToEpochTime(time.Unix(100, 0).Add(20 * time.Minute)),
					Links:     []messages.Link{{"Master", client.MasterURL("fake.master")}},
					StartTime: messages.EpochTime(100),
				},
			},
		},
		{
			name:   "Future master",
			master: "fake.master",
			be: messages.BuildExtract{
				CreatedTimestamp: messages.EpochTime(110),
			},
			t:    time.Unix(100, 0),
			want: []messages.Alert{},
		},
	}

	a := New(&mockReader{}, 0, 10)

	for _, test := range tests {
		a.now = fakeNow(test.t)
		got := a.MasterAlerts(test.master, &test.be)
		if !reflect.DeepEqual(got, test.want) {
			t.Errorf("%s failed. Got %+v, want: %+v", test.name, got, test.want)
		}
	}
}
func TestLittleBBuilderAlerts(t *testing.T) {
	tests := []struct {
		name       string
		master     string
		builder    string
		b          messages.Builder
		builds     map[string]*messages.Build
		time       time.Time
		wantAlerts []messages.Alert
		wantErrs   []error
	}{
		{
			name:     "empty",
			wantErrs: []error{errNoRecentBuilds},
		},
		{
			name:    "builders ok",
			master:  "fake.master",
			builder: "fake.builder",
			builds: map[string]*messages.Build{
				"fake.master/fake.builder/0": {
					Steps: []messages.Step{
						{
							Name: "fake_step",
							Times: []messages.EpochTime{
								messages.TimeToEpochTime(time.Unix(10, 0)),
								messages.TimeToEpochTime(time.Unix(100, 0)),
							},
						},
					},
					Times: []messages.EpochTime{
						messages.TimeToEpochTime(time.Unix(10, 0)),
						messages.TimeToEpochTime(time.Unix(100, 0)),
					},
				},
			},
			b: messages.Builder{
				BuilderName:  "fake.builder",
				CachedBuilds: []int64{0},
			},
			wantAlerts: []messages.Alert{},
			wantErrs:   []error{},
		},
		{
			name:    "builder building for too long",
			master:  "fake.master",
			builder: "fake.builder",
			builds: map[string]*messages.Build{
				"fake.master/fake.builder/0": {
					Number: 0,
					Times: []messages.EpochTime{
						messages.TimeToEpochTime(time.Unix(10, 0)),
						messages.TimeToEpochTime(time.Unix(100, 0)),
					},
					Steps: []messages.Step{
						{
							Name: "fake_step",
							Times: []messages.EpochTime{
								messages.TimeToEpochTime(time.Unix(10, 0)),
								messages.TimeToEpochTime(time.Unix(100, 0)),
							},
						},
					},
				},
				"fake.master/fake.builder/1": {
					Number: 1,
					Steps: []messages.Step{
						{
							Name: "fake_step",
							Times: []messages.EpochTime{
								messages.TimeToEpochTime(time.Unix(10, 0)),
								messages.TimeToEpochTime(time.Unix(0, 0)),
							},
						},
					},
				},
			},
			b: messages.Builder{
				State:        messages.StateBuilding,
				BuilderName:  "fake.builder",
				CachedBuilds: []int64{0, 1},
			},
			time: time.Unix(0, 0).Add(4 * time.Hour),
			wantAlerts: []messages.Alert{
				{
					Key:      "fake.master.fake.builder.hung",
					Title:    "fake.master.fake.builder is hung in step fake_step.",
					Time:     messages.TimeToEpochTime(time.Unix(0, 0).Add(4 * time.Hour)),
					Body:     "fake.master.fake.builder has been building for 3h59m50s (last step update 1970-01-01 00:00:10 +0000 UTC), past the alerting threshold of 3h0m0s",
					Severity: 1,
					Links: []messages.Link{
						{Title: "Builder", Href: "https://build.chromium.org/p/fake.master/builders/fake.builder"},
						{Title: "Last build", Href: "https://build.chromium.org/p/fake.master/builders/fake.builder/builds/1"},
						{Title: "Last build step", Href: "https://build.chromium.org/p/fake.master/builders/fake.builder/builds/1/steps/fake_step"},
					},
				},
			},
			wantErrs: []error{},
		},
	}

	a := New(nil, 0, 10)

	for _, test := range tests {
		a.now = fakeNow(test.time)
		a.Reader = mockReader{
			builds: test.builds,
		}
		gotAlerts, gotErrs := a.builderAlerts(test.master, test.builder, &test.b)
		if !reflect.DeepEqual(gotAlerts, test.wantAlerts) {
			t.Errorf("%s failed. Got:\n%+v, want:\n%+v", test.name, gotAlerts, test.wantAlerts)
		}
		if !reflect.DeepEqual(gotErrs, test.wantErrs) {
			t.Errorf("%s failed. Got %+v, want: %+v", test.name, gotErrs, test.wantErrs)
		}
	}
}
func main() {
	start := time.Now()
	flag.Parse()

	duration, err := time.ParseDuration(*durationStr)
	if err != nil {
		log.Errorf("Error parsing duration: %v", err)
		os.Exit(1)
	}

	cycle, err := time.ParseDuration(*cycleStr)
	if err != nil {
		log.Errorf("Error parsing cycle: %v", err)
		os.Exit(1)
	}

	err = readJSONFile(*gatekeeperJSON, &gk)
	if err != nil {
		log.Errorf("Error reading gatekeeper json: %v", err)
		os.Exit(1)
	}

	err = readJSONFile(*gatekeeperTreesJSON, &gkt)
	if err != nil {
		log.Errorf("Error reading gatekeeper trees json: %v", err)
		os.Exit(1)
	}

	if *snapshot != "" && *replay != "" {
		log.Errorf("Cannot use snapshot and replay flags at the same time.")
		os.Exit(1)
	}

	r := client.NewReader()

	if *snapshot != "" {
		r = client.NewSnapshot(r, *snapshot)
	}

	if *replay != "" {
		r = client.NewReplay(*replay)
	}

	a := analyzer.New(r, 5, 100)

	for masterURL, masterCfgs := range gk.Masters {
		if len(masterCfgs) != 1 {
			log.Errorf("Multiple configs for master: %s", masterURL)
		}
		masterName := masterFromURL(masterURL)
		a.MasterCfgs[masterName] = masterCfgs[0]
	}

	a.MasterOnly = *masterOnly
	a.BuilderOnly = *builderOnly
	a.BuildOnly = *buildOnly

	trees := map[string]bool{}
	if *treesOnly != "" {
		for _, treeOnly := range strings.Split(*treesOnly, ",") {
			trees[treeOnly] = true
		}
	} else {
		for treeName := range gkt {
			trees[treeName] = true
		}
	}

	for tree := range trees {
		if _, ok := gkt[tree]; !ok {
			log.Errorf("Unrecognized tree name: %s", tree)
			os.Exit(1)
		}
	}

	// This is the polling/analysis/alert-posting function, which will run in a
	// loop until a timeout or the max error count is reached.
	f := func(ctx context.Context) error {
		done := make(chan interface{})
		errs := make(chan error)
		for treeName := range trees {
			go func(tree string) {
				log.Infof("Checking tree: %s", tree)
				masterNames := []string{}
				t := gkt[tree]
				for _, url := range t.Masters {
					masterNames = append(masterNames, masterFromURL(url))
				}

				// TODO(seanmccullough): Plumb ctx through the rest of these calls.
				bes := fetchBuildExtracts(a.Reader, masterNames)
				log.Infof("Build Extracts read: %d", len(bes))

				alerts := &messages.Alerts{}
				for masterName, be := range bes {
					alerts.Alerts = append(alerts.Alerts, analyzeBuildExtract(a, masterName, be)...)
				}
				alerts.Timestamp = messages.TimeToEpochTime(time.Now())

				if *alertsBaseURL == "" {
					log.Infof("No data_url provided. Writing to %s-alerts.json", tree)

					abytes, err := json.MarshalIndent(alerts, "", "\t")
					if err != nil {
						log.Errorf("Couldn't marshal alerts json: %v", err)
						errs <- err
						return
					}

					if err := ioutil.WriteFile(fmt.Sprintf("%s-alerts.json", tree), abytes, 0644); err != nil {
						log.Errorf("Couldn't write to alerts.json: %v", err)
						errs <- err
						return
					}
				} else {
					alertsURL := fmt.Sprintf("%s/%s", *alertsBaseURL, tree)
					w := client.NewWriter(alertsURL)
					log.Infof("Posting alerts to %s", alertsURL)
					err := w.PostAlerts(alerts)
					if err != nil {
						log.Errorf("Couldn't post alerts: %v", err)
						errs <- err
						return
					}
				}

				log.Infof("Filtered failures: %v", filteredFailures)
				a.Reader.DumpStats()
				log.Infof("Elapsed time: %v", time.Since(start))
				done <- nil
			}(treeName)
		}

		for range trees {
			select {
			case err := <-errs:
				return err
			case <-done:
			}
		}

		return nil
	}

	ctx, cancel := context.WithTimeout(context.Background(), duration)
	logging.Set(ctx, log)
	defer cancel()

	loopResults := looper.Run(ctx, f, cycle, *maxErrs, clock.GetSystemClock())

	if !loopResults.Success {
		log.Errorf("Failed to run loop, %v errors", loopResults.Errs)
		os.Exit(1)
	}
}
// TODO: also check the build slaves to see if there are alerts for currently
// running builds that haven't shown up in CBE yet.
func (a *Analyzer) builderAlerts(masterName string, builderName string, b *messages.Builder) ([]messages.Alert, []error) {
	if len(b.CachedBuilds) == 0 {
		// TODO: Make an alert for this?
		return nil, []error{errNoRecentBuilds}
	}

	recentBuildIDs := b.CachedBuilds
	// Should be a *reverse* sort.
	sort.Sort(buildNums(recentBuildIDs))
	if len(recentBuildIDs) > a.MaxRecentBuilds {
		recentBuildIDs = recentBuildIDs[:a.MaxRecentBuilds]
	}

	alerts, errs := []messages.Alert{}, []error{}

	lastBuild, lastCompletedBuild, err := a.lastBuilds(masterName, builderName, recentBuildIDs)
	if err != nil {
		errs = append(errs, err)
		return nil, errs
	}

	// Examining only the latest build is probably suboptimal since if it's still
	// in progress it might not have hit a step that is going to fail and has
	// failed repeatedly for the last few builds.
	// AKA "Reliable failures". TODO: Identify "Reliable failures".
	lastStep, lastUpdated, err := a.latestBuildStep(lastBuild)
	if err != nil {
		errs = append(errs, fmt.Errorf("Couldn't get latest build step for %s.%s: %v", masterName, builderName, err))
		return alerts, errs
	}
	elapsed := a.Now().Sub(lastUpdated.Time())
	links := []messages.Link{
		{"Builder", client.BuilderURL(masterName, builderName)},
		{"Last build", client.BuildURL(masterName, builderName, lastBuild.Number)},
		{"Last build step", client.StepURL(masterName, builderName, lastStep, lastBuild.Number)},
	}

	switch b.State {
	case messages.StateBuilding:
		if elapsed > a.HungBuilderThresh && lastStep != StepCompletedRun {
			alerts = append(alerts, messages.Alert{
				Key:      fmt.Sprintf("%s.%s.hung", masterName, builderName),
				Title:    fmt.Sprintf("%s.%s is hung in step %s.", masterName, builderName, lastStep),
				Body:     fmt.Sprintf("%s.%s has been building for %v (last step update %s), past the alerting threshold of %v", masterName, builderName, elapsed, lastUpdated.Time(), a.HungBuilderThresh),
				Severity: hungBuilderSev,
				Time:     messages.TimeToEpochTime(a.Now()),
				Links:    links,
			})
			// Note, just because it's building doesn't mean it's in a good state.
			// If the last N builds all failed (for some large N) then this might
			// still be alertable.
		}
	case messages.StateOffline:
		if elapsed > a.OfflineBuilderThresh {
			alerts = append(alerts, messages.Alert{
				Key:      fmt.Sprintf("%s.%s.offline", masterName, builderName),
				Title:    fmt.Sprintf("%s.%s is offline.", masterName, builderName),
				Body:     fmt.Sprintf("%s.%s has been offline for %v (last step update %s %v), past the alerting threshold of %v", masterName, builderName, elapsed, lastUpdated.Time(), float64(lastUpdated), a.OfflineBuilderThresh),
				Severity: offlineBuilderSev,
				Time:     messages.TimeToEpochTime(a.Now()),
				Links:    links,
			})
		}
	case messages.StateIdle:
		if b.PendingBuilds > a.IdleBuilderCountThresh {
			alerts = append(alerts, messages.Alert{
				Key:      fmt.Sprintf("%s.%s.idle", masterName, builderName),
				Title:    fmt.Sprintf("%s.%s is idle with too many pending builds.", masterName, builderName),
				Body:     fmt.Sprintf("%s.%s is idle with %d pending builds, past the alerting threshold of %d", masterName, builderName, b.PendingBuilds, a.IdleBuilderCountThresh),
				Severity: idleBuilderSev,
				Time:     messages.TimeToEpochTime(a.Now()),
				Links:    links,
			})
		}
	default:
		log.Errorf("Unknown %s.%s builder state: %s", masterName, builderName, b.State)
	}

	// Check for alerts on the most recent complete build. If it has any
	// alertable step failures, re-run the step analysis over the rest of the
	// recent builds starting from that build, so repeated failures are included.
	log.Infof("Checking %d most recent builds for alertable step failures: %s/%s", len(recentBuildIDs), masterName, builderName)
	as, es := a.builderStepAlerts(masterName, builderName, []int64{lastCompletedBuild.Number})

	if len(as) > 0 {
		mostRecentComplete := 0
		for i, id := range recentBuildIDs {
			if id == lastCompletedBuild.Number {
				mostRecentComplete = i
			}
		}
		as, es = a.builderStepAlerts(masterName, builderName, recentBuildIDs[mostRecentComplete:])
		alerts = append(alerts, as...)
		errs = append(errs, es...)
	}

	return alerts, errs
}