// URL returns a url to builder step failure page. func (f stepFailure) URL() string { return client.StepURL(f.masterName, f.builderName, f.step.Name, f.build.Number) }
// TODO: also check the build slaves to see if there are alerts for currently running builds that // haven't shown up in CBE yet. func (a *Analyzer) builderAlerts(masterName string, builderName string, b *messages.Builder) ([]messages.Alert, []error) { if len(b.CachedBuilds) == 0 { // TODO: Make an alert for this? return nil, []error{errNoRecentBuilds} } recentBuildIDs := b.CachedBuilds // Should be a *reverse* sort. sort.Sort(buildNums(recentBuildIDs)) if len(recentBuildIDs) > a.MaxRecentBuilds { recentBuildIDs = recentBuildIDs[:a.MaxRecentBuilds] } alerts, errs := []messages.Alert{}, []error{} lastBuild, lastCompletedBuild, err := a.lastBuilds(masterName, builderName, recentBuildIDs) if err != nil { errs = append(errs, err) return nil, errs } // Examining only the latest build is probably suboptimal since if it's still in progress it might // not have hit a step that is going to fail and has failed repeatedly for the last few builds. // AKA "Reliable failures". TODO: Identify "Reliable failures" lastStep, lastUpdated, err := a.latestBuildStep(lastBuild) if err != nil { errs = append(errs, fmt.Errorf("Couldn't get latest build step for %s.%s: %v", masterName, builderName, err)) return alerts, errs } elapsed := a.Now().Sub(lastUpdated.Time()) links := []messages.Link{ {"Builder", client.BuilderURL(masterName, builderName)}, {"Last build", client.BuildURL(masterName, builderName, lastBuild.Number)}, {"Last build step", client.StepURL(masterName, builderName, lastStep, lastBuild.Number)}, } switch b.State { case messages.StateBuilding: if elapsed > a.HungBuilderThresh && lastStep != StepCompletedRun { alerts = append(alerts, messages.Alert{ Key: fmt.Sprintf("%s.%s.hung", masterName, builderName), Title: fmt.Sprintf("%s.%s is hung in step %s.", masterName, builderName, lastStep), Body: fmt.Sprintf("%s.%s has been building for %v (last step update %s), past the alerting threshold of %v", masterName, builderName, elapsed, lastUpdated.Time(), a.HungBuilderThresh), Severity: hungBuilderSev, Time: messages.TimeToEpochTime(a.Now()), Links: links, }) // Note, just because it's building doesn't mean it's in a good state. If the last N builds // all failed (for some large N) then this might still be alertable. } case messages.StateOffline: if elapsed > a.OfflineBuilderThresh { alerts = append(alerts, messages.Alert{ Key: fmt.Sprintf("%s.%s.offline", masterName, builderName), Title: fmt.Sprintf("%s.%s is offline.", masterName, builderName), Body: fmt.Sprintf("%s.%s has been offline for %v (last step update %s %v), past the alerting threshold of %v", masterName, builderName, elapsed, lastUpdated.Time(), float64(lastUpdated), a.OfflineBuilderThresh), Severity: offlineBuilderSev, Time: messages.TimeToEpochTime(a.Now()), Links: links, }) } case messages.StateIdle: if b.PendingBuilds > a.IdleBuilderCountThresh { alerts = append(alerts, messages.Alert{ Key: fmt.Sprintf("%s.%s.idle", masterName, builderName), Title: fmt.Sprintf("%s.%s is idle with too many pending builds.", masterName, builderName), Body: fmt.Sprintf("%s.%s is idle with %d pending builds, past the alerting threshold of %d", masterName, builderName, b.PendingBuilds, a.IdleBuilderCountThresh), Severity: idleBuilderSev, Time: messages.TimeToEpochTime(a.Now()), Links: links, }) } default: log.Errorf("Unknown %s.%s builder state: %s", masterName, builderName, b.State) } // Check for alerts on the most recent complete build log.Infof("Checking %d most recent builds for alertable step failures: %s/%s", len(recentBuildIDs), masterName, builderName) as, es := a.builderStepAlerts(masterName, builderName, []int64{lastCompletedBuild.Number}) if len(as) > 0 { mostRecentComplete := 0 for i, id := range recentBuildIDs { if id == lastCompletedBuild.Number { mostRecentComplete = i } } as, es = a.builderStepAlerts(masterName, builderName, recentBuildIDs[mostRecentComplete:]) alerts = append(alerts, as...) errs = append(errs, es...) } return alerts, errs }