// TODO: also check the build slaves to see if there are alerts for currently
// running builds that haven't shown up in CBE yet.
func (a *Analyzer) builderAlerts(masterName string, builderName string, b *messages.Builder) ([]messages.Alert, []error) {
	if len(b.CachedBuilds) == 0 {
		// TODO: Make an alert for this?
		return nil, []error{errNoRecentBuilds}
	}

	recentBuildIDs := b.CachedBuilds
	// Should be a *reverse* sort; see the buildNumsSketch example after this function.
	sort.Sort(buildNums(recentBuildIDs))
	if len(recentBuildIDs) > a.MaxRecentBuilds {
		recentBuildIDs = recentBuildIDs[:a.MaxRecentBuilds]
	}

	alerts, errs := []messages.Alert{}, []error{}

	lastBuild, lastCompletedBuild, err := a.lastBuilds(masterName, builderName, recentBuildIDs)
	if err != nil {
		errs = append(errs, err)
		return nil, errs
	}

	// Examining only the latest build is probably suboptimal since if it's still
	// in progress it might not have hit a step that is going to fail and has
	// failed repeatedly for the last few builds. AKA "Reliable failures".
	// TODO: Identify "Reliable failures".
	lastStep, lastUpdated, err := a.latestBuildStep(lastBuild)
	if err != nil {
		errs = append(errs, fmt.Errorf("Couldn't get latest build step for %s.%s: %v", masterName, builderName, err))
		return alerts, errs
	}
	elapsed := a.Now().Sub(lastUpdated.Time())
	links := []messages.Link{
		{"Builder", client.BuilderURL(masterName, builderName)},
		{"Last build", client.BuildURL(masterName, builderName, lastBuild.Number)},
		{"Last build step", client.StepURL(masterName, builderName, lastStep, lastBuild.Number)},
	}

	switch b.State {
	case messages.StateBuilding:
		if elapsed > a.HungBuilderThresh && lastStep != StepCompletedRun {
			alerts = append(alerts, messages.Alert{
				Key:      fmt.Sprintf("%s.%s.hung", masterName, builderName),
				Title:    fmt.Sprintf("%s.%s is hung in step %s.", masterName, builderName, lastStep),
				Body:     fmt.Sprintf("%s.%s has been building for %v (last step update %s), past the alerting threshold of %v", masterName, builderName, elapsed, lastUpdated.Time(), a.HungBuilderThresh),
				Severity: hungBuilderSev,
				Time:     messages.TimeToEpochTime(a.Now()),
				Links:    links,
			})
			// Note: just because it's building doesn't mean it's in a good state.
			// If the last N builds all failed (for some large N), this might still
			// be alertable.
		}
	case messages.StateOffline:
		if elapsed > a.OfflineBuilderThresh {
			alerts = append(alerts, messages.Alert{
				Key:      fmt.Sprintf("%s.%s.offline", masterName, builderName),
				Title:    fmt.Sprintf("%s.%s is offline.", masterName, builderName),
				Body:     fmt.Sprintf("%s.%s has been offline for %v (last step update %s %v), past the alerting threshold of %v", masterName, builderName, elapsed, lastUpdated.Time(), float64(lastUpdated), a.OfflineBuilderThresh),
				Severity: offlineBuilderSev,
				Time:     messages.TimeToEpochTime(a.Now()),
				Links:    links,
			})
		}
	case messages.StateIdle:
		if b.PendingBuilds > a.IdleBuilderCountThresh {
			alerts = append(alerts, messages.Alert{
				Key:      fmt.Sprintf("%s.%s.idle", masterName, builderName),
				Title:    fmt.Sprintf("%s.%s is idle with too many pending builds.", masterName, builderName),
				Body:     fmt.Sprintf("%s.%s is idle with %d pending builds, past the alerting threshold of %d", masterName, builderName, b.PendingBuilds, a.IdleBuilderCountThresh),
				Severity: idleBuilderSev,
				Time:     messages.TimeToEpochTime(a.Now()),
				Links:    links,
			})
		}
	default:
		log.Errorf("Unknown %s.%s builder state: %s", masterName, builderName, b.State)
	}

	// Check for alerts on the most recent complete build.
	log.Infof("Checking %d most recent builds for alertable step failures: %s/%s", len(recentBuildIDs), masterName, builderName)
	as, es := a.builderStepAlerts(masterName, builderName, []int64{lastCompletedBuild.Number})

	if len(as) > 0 {
		mostRecentComplete := 0
		for i, id := range recentBuildIDs {
			if id == lastCompletedBuild.Number {
				mostRecentComplete = i
			}
		}
		as, es = a.builderStepAlerts(masterName, builderName, recentBuildIDs[mostRecentComplete:])
		alerts = append(alerts, as...)
		errs = append(errs, es...)
	}

	return alerts, errs
}
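
// The reverse sort above relies on buildNums implementing sort.Interface in
// descending order, but that type is not defined in this excerpt. The type
// below is a minimal illustrative sketch only, assuming the build IDs are
// int64 values (consistent with the []int64 literal passed to
// builderStepAlerts above) and that "reverse" means newest build number first.
// The real buildNums type may differ.
type buildNumsSketch []int64

func (n buildNumsSketch) Len() int           { return len(n) }
func (n buildNumsSketch) Swap(i, j int)      { n[i], n[j] = n[j], n[i] }
func (n buildNumsSketch) Less(i, j int) bool { return n[i] > n[j] } // descending, so index 0 is the most recent build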
// stepFailureAlerts returns alerts generated from step failures. It applies filtering
// logic specified in the gatekeeper config to ignore some failures.
func (a *Analyzer) stepFailureAlerts(failures []stepFailure) ([]messages.Alert, error) {
	ret := []messages.Alert{}
	type res struct {
		f   stepFailure
		a   *messages.Alert
		err error
	}

	// Might not need full capacity buffer, since some failures are ignored below.
	rs := make(chan res, len(failures))

	scannedFailures := []stepFailure{}
	for _, failure := range failures {
		// goroutine/channel because the reasonsForFailure call potentially
		// blocks on IO.
		if failure.step.Name == "steps" {
			// Check results to see if it's an array of [4].
			// That's a purple failure, which should go to infra/trooper.
			log.Infof("steps results: %+v", failure.step)
			if len(failure.step.Results) > 0 {
				if r, ok := failure.step.Results[0].(float64); ok && r == resInfraFailure {
					// TODO: Create a trooper alert about this.
					log.Errorf("INFRA FAILURE: %+v", failure)
				}
			}
			continue // The actual breaking step will appear later.
		}

		// Check the gatekeeper configs to see if this is ignorable.
		if a.excludeFailure(failure.masterName, failure.builderName, failure.step.Name) {
			continue
		}

		// getCommitPos returns the named revision number from gnumbd metadata
		// attached to the build's properties.
		getCommitPos := func(b messages.Build, name string) (string, bool) {
			for _, p := range b.Properties {
				if p[0] == name {
					s, ok := p[1].(string)
					return s, ok
				}
			}
			return "", false
		}

		scannedFailures = append(scannedFailures, failure)
		go func(f stepFailure) {
			alr := messages.Alert{
				Title: fmt.Sprintf("Builder step failure: %s.%s", f.masterName, f.builderName),
				Time:  messages.EpochTime(a.Now().Unix()),
				Type:  "buildfailure",
			}

			regRanges := []messages.RegressionRange{}
			revisionsByRepo := map[string][]string{}

			// Get gnumbd sequence numbers for whatever this build pulled in.
			chromiumPos, ok := getCommitPos(f.build, "got_revision_cp")
			if ok {
				regRanges = append(regRanges, messages.RegressionRange{
					Repo:      "chromium",
					Positions: []string{chromiumPos},
				})
			}

			blinkPos, ok := getCommitPos(f.build, "got_webkit_revision_cp")
			if ok {
				regRanges = append(regRanges, messages.RegressionRange{
					Repo:      "blink",
					Positions: []string{blinkPos},
				})
			}

			v8Pos, ok := getCommitPos(f.build, "got_v8_revision_cp")
			if ok {
				regRanges = append(regRanges, messages.RegressionRange{
					Repo:      "v8",
					Positions: []string{v8Pos},
				})
			}

			naclPos, ok := getCommitPos(f.build, "got_nacl_revision_cp")
			if ok {
				regRanges = append(regRanges, messages.RegressionRange{
					Repo:      "nacl",
					Positions: []string{naclPos},
				})
			}

			for _, change := range f.build.SourceStamp.Changes {
				revisionsByRepo[change.Repository] = append(revisionsByRepo[change.Repository], change.Revision)
				// change.Revision is *not* always a git hash: sometimes it is a
				// position from gnumbd, and it isn't obvious ahead of time which
				// form a given change will use. A potential problem here is when
				// multiple repos have overlapping gnumbd ranges.
				a.revisionSummaries[change.Revision] = messages.RevisionSummary{
					GitHash:     change.Revision,
					Link:        change.Revlink,
					Description: trunc(change.Comments),
					Author:      change.Who,
					When:        change.When,
				}
			}

			for repo, revisions := range revisionsByRepo {
				regRanges = append(regRanges, messages.RegressionRange{
					Repo:      repo,
					Revisions: revisions,
				})
			}

			// If the builder has been failing on the same step for multiple builds
			// in a row, we should have only one alert but indicate the range of
			// builds affected. These are set in FirstFailure and LatestFailure.
			bf := messages.BuildFailure{
				// FIXME: group builders?
				Builders: []messages.AlertedBuilder{
					{
						Name:          f.builderName,
						URL:           client.BuilderURL(f.masterName, f.builderName),
						StartTime:     f.build.CreatedTimestamp,
						FirstFailure:  f.build.Number,
						LatestFailure: f.build.Number,
					},
				},
				TreeCloser:       a.wouldCloseTree(f.masterName, f.builderName, f.step.Name),
				RegressionRanges: regRanges,
			}

			reasons := a.reasonsForFailure(f)
			for _, r := range reasons {
				bf.Reasons = append(bf.Reasons, messages.Reason{
					TestName: r,
					Step:     f.step.Name,
					URL:      f.URL(),
				})
			}

			alr.Key = alertKey(f.masterName, f.builderName, f.step.Name)
			if len(bf.Reasons) == 0 {
				log.Warningf("No reasons for step failure: %s", alr.Key)
				bf.Reasons = append(bf.Reasons, messages.Reason{
					Step: f.step.Name,
					URL:  f.URL(),
				})
			}

			alr.Extension = bf

			rs <- res{
				f:   f,
				a:   &alr,
				err: nil,
			}
		}(failure)
	}

	for range scannedFailures {
		r := <-rs
		if r.a != nil {
			ret = append(ret, *r.a)
		}
	}

	return ret, nil
}
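
// Two helpers used above, alertKey and trunc, are not defined in this excerpt.
// The sketches below are illustrative assumptions only (names suffixed with
// "Sketch" to make that explicit); the real implementations may differ.

// alertKeySketch shows one plausible way to build a stable alert key from the
// master/builder/step triple, mirroring the "%s.%s.hung"-style keys used in
// builderAlerts above.
func alertKeySketch(master, builder, step string) string {
	return fmt.Sprintf("%s.%s.%s", master, builder, step)
}

// truncSketch shows the kind of truncation trunc presumably applies to commit
// comments before storing them as RevisionSummary descriptions.
func truncSketch(s string) string {
	const maxLen = 100 // assumed limit; the real value isn't shown in this excerpt
	if len(s) <= maxLen {
		return s
	}
	return s[:maxLen] + "..."
}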