Example #1
// TODO: also check the build slaves to see if there are alerts for currently running builds that
// haven't shown up in CBE yet.
func (a *Analyzer) builderAlerts(masterName string, builderName string, b *messages.Builder) ([]messages.Alert, []error) {
	if len(b.CachedBuilds) == 0 {
		// TODO: Make an alert for this?
		return nil, []error{errNoRecentBuilds}
	}

	recentBuildIDs := b.CachedBuilds
	// buildNums should sort in *reverse* (descending) order so the most recent builds come first
	// (see the hypothetical sketch after this function).
	sort.Sort(buildNums(recentBuildIDs))
	if len(recentBuildIDs) > a.MaxRecentBuilds {
		recentBuildIDs = recentBuildIDs[:a.MaxRecentBuilds]
	}

	alerts, errs := []messages.Alert{}, []error{}

	lastBuild, lastCompletedBuild, err := a.lastBuilds(masterName, builderName, recentBuildIDs)
	if err != nil {
		errs = append(errs, err)
		return nil, errs
	}

	// Examining only the latest build is probably suboptimal: if it's still in progress, it may
	// not yet have reached a step that has failed repeatedly over the last few builds,
	// AKA "reliable failures". TODO: identify "reliable failures".
	lastStep, lastUpdated, err := a.latestBuildStep(lastBuild)
	if err != nil {
		errs = append(errs, fmt.Errorf("Couldn't get latest build step for %s.%s: %v", masterName, builderName, err))
		return alerts, errs
	}
	elapsed := a.Now().Sub(lastUpdated.Time())
	links := []messages.Link{
		{"Builder", client.BuilderURL(masterName, builderName)},
		{"Last build", client.BuildURL(masterName, builderName, lastBuild.Number)},
		{"Last build step", client.StepURL(masterName, builderName, lastStep, lastBuild.Number)},
	}

	switch b.State {
	case messages.StateBuilding:
		if elapsed > a.HungBuilderThresh && lastStep != StepCompletedRun {
			alerts = append(alerts, messages.Alert{
				Key:      fmt.Sprintf("%s.%s.hung", masterName, builderName),
				Title:    fmt.Sprintf("%s.%s is hung in step %s.", masterName, builderName, lastStep),
				Body:     fmt.Sprintf("%s.%s has been building for %v (last step update %s), past the alerting threshold of %v", masterName, builderName, elapsed, lastUpdated.Time(), a.HungBuilderThresh),
				Severity: hungBuilderSev,
				Time:     messages.TimeToEpochTime(a.Now()),
				Links:    links,
			})
			// Note, just because it's building doesn't mean it's in a good state. If the last N builds
			// all failed (for some large N) then this might still be alertable.
		}
	case messages.StateOffline:
		if elapsed > a.OfflineBuilderThresh {
			alerts = append(alerts, messages.Alert{
				Key:      fmt.Sprintf("%s.%s.offline", masterName, builderName),
				Title:    fmt.Sprintf("%s.%s is offline.", masterName, builderName),
				Body:     fmt.Sprintf("%s.%s has been offline for %v (last step update %s %v), past the alerting threshold of %v", masterName, builderName, elapsed, lastUpdated.Time(), float64(lastUpdated), a.OfflineBuilderThresh),
				Severity: offlineBuilderSev,
				Time:     messages.TimeToEpochTime(a.Now()),
				Links:    links,
			})
		}
	case messages.StateIdle:
		if b.PendingBuilds > a.IdleBuilderCountThresh {
			alerts = append(alerts, messages.Alert{
				Key:      fmt.Sprintf("%s.%s.idle", masterName, builderName),
				Title:    fmt.Sprintf("%s.%s is idle with too many pending builds.", masterName, builderName),
				Body:     fmt.Sprintf("%s.%s is idle with %d pending builds, past the alerting threshold of %d", masterName, builderName, b.PendingBuilds, a.IdleBuilderCountThresh),
				Severity: idleBuilderSev,
				Time:     messages.TimeToEpochTime(a.Now()),
				Links:    links,
			})
		}
	default:
		log.Errorf("Unknown %s.%s builder state: %s", masterName, builderName, b.State)
	}

	// Check for alerts on the most recently completed build; if any are found, expand the check
	// to the other recent builds below.
	log.Infof("Checking %d most recent builds for alertable step failures: %s/%s", len(recentBuildIDs), masterName, builderName)
	as, es := a.builderStepAlerts(masterName, builderName, []int64{lastCompletedBuild.Number})

	if len(as) > 0 {
		mostRecentComplete := 0
		for i, id := range recentBuildIDs {
			if id == lastCompletedBuild.Number {
				mostRecentComplete = i
			}
		}
		as, es = a.builderStepAlerts(masterName, builderName, recentBuildIDs[mostRecentComplete:])
		alerts = append(alerts, as...)
		errs = append(errs, es...)
	}

	return alerts, errs
}
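
The reverse sort above relies on a buildNums type that is not shown in these examples. A minimal sketch of what it might look like, assuming it implements sort.Interface in descending order so the slice truncation keeps the most recent builds:

// Hypothetical sketch (buildNums' real definition is not shown here): sort build
// numbers in descending order so recentBuildIDs[:MaxRecentBuilds] keeps the newest builds.
type buildNums []int64

func (b buildNums) Len() int           { return len(b) }
func (b buildNums) Less(i, j int) bool { return b[i] > b[j] } // reverse (descending) order
func (b buildNums) Swap(i, j int)      { b[i], b[j] = b[j], b[i] }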
Example #2
// stepFailureAlerts returns alerts generated from step failures. It applies filtering
// logic specified in the gatekeeper config to ignore some failures.
func (a *Analyzer) stepFailureAlerts(failures []stepFailure) ([]messages.Alert, error) {
	ret := []messages.Alert{}
	type res struct {
		f   stepFailure
		a   *messages.Alert
		err error
	}

	// Might not need full capacity buffer, since some failures are ignored below.
	rs := make(chan res, len(failures))

	scannedFailures := []stepFailure{}
	for _, failure := range failures {
		// goroutine/channel because the reasonsForFailure call potentially
		// blocks on IO.
		if failure.step.Name == "steps" {
			// Check the step results: a leading result of resInfraFailure ([4]) means this is
			// a purple (infra) failure, which should go to infra/trooper.
			log.Infof("steps results: %+v", failure.step)
			if len(failure.step.Results) > 0 {
				if r, ok := failure.step.Results[0].(float64); ok && r == resInfraFailure {
					// TODO: Create a trooper alert about this.
					log.Errorf("INFRA FAILURE: %+v", failure)
				}
			}
			// The actual breaking step will appear later.
			continue
		}

		// Check the gatekeeper configs to see if this is ignorable.
		if a.excludeFailure(failure.masterName, failure.builderName, failure.step.Name) {
			continue
		}

		// getCommitPos returns the named commit position from the build's gnumbd-generated properties.
		getCommitPos := func(b messages.Build, name string) (string, bool) {
			for _, p := range b.Properties {
				if p[0] == name {
					s, ok := p[1].(string)
					return s, ok
				}
			}
			return "", false
		}

		scannedFailures = append(scannedFailures, failure)
		go func(f stepFailure) {
			alr := messages.Alert{
				Title: fmt.Sprintf("Builder step failure: %s.%s", f.masterName, f.builderName),
				Time:  messages.EpochTime(a.Now().Unix()),
				Type:  "buildfailure",
			}

			regRanges := []messages.RegressionRange{}
			revisionsByRepo := map[string][]string{}

			// Get gnumbd sequence numbers for whatever this build pulled in.
			chromiumPos, ok := getCommitPos(f.build, "got_revision_cp")
			if ok {
				regRanges = append(regRanges, messages.RegressionRange{
					Repo:      "chromium",
					Positions: []string{chromiumPos},
				})
			}

			blinkPos, ok := getCommitPos(f.build, "got_webkit_revision_cp")
			if ok {
				regRanges = append(regRanges, messages.RegressionRange{
					Repo:      "blink",
					Positions: []string{blinkPos},
				})
			}

			v8Pos, ok := getCommitPos(f.build, "got_v8_revision_cp")
			if ok {
				regRanges = append(regRanges, messages.RegressionRange{
					Repo:      "v8",
					Positions: []string{v8Pos},
				})
			}

			naclPos, ok := getCommitPos(f.build, "got_nacl_revision_cp")
			if ok {
				regRanges = append(regRanges, messages.RegressionRange{
					Repo:      "nacl",
					Positions: []string{naclPos},
				})
			}

			for _, change := range f.build.SourceStamp.Changes {
				revisionsByRepo[change.Repository] = append(revisionsByRepo[change.Repository], change.Revision)
				// change.Revision is *not* always a git hash; sometimes it is a commit position from
				// gnumbd. What determines which form appears isn't obvious at this point. A potential
				// problem here is when multiple repos have overlapping gnumbd position ranges.
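				// Note: this write happens inside concurrently running goroutines; unless
				// a.revisionSummaries is synchronized elsewhere (not shown here), it is a data race.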
				a.revisionSummaries[change.Revision] = messages.RevisionSummary{
					GitHash:     change.Revision,
					Link:        change.Revlink,
					Description: trunc(change.Comments),
					Author:      change.Who,
					When:        change.When,
				}
			}

			for repo, revisions := range revisionsByRepo {
				regRanges = append(regRanges, messages.RegressionRange{
					Repo:      repo,
					Revisions: revisions,
				})
			}

			// If the builder has been failing on the same step for multiple builds in a row,
			// we should have only one alert but indicate the range of builds affected.
			// These are set in FirstFailure and LatestFailure.
			bf := messages.BuildFailure{
				// FIXME: group builders?
				Builders: []messages.AlertedBuilder{
					{
						Name:          f.builderName,
						URL:           client.BuilderURL(f.masterName, f.builderName),
						StartTime:     f.build.CreatedTimestamp,
						FirstFailure:  f.build.Number,
						LatestFailure: f.build.Number,
					},
				},
				TreeCloser:       a.wouldCloseTree(f.masterName, f.builderName, f.step.Name),
				RegressionRanges: regRanges,
			}

			reasons := a.reasonsForFailure(f)
			for _, r := range reasons {
				bf.Reasons = append(bf.Reasons, messages.Reason{
					TestName: r,
					Step:     f.step.Name,
					URL:      f.URL(),
				})
			}

			alr.Key = alertKey(f.masterName, f.builderName, f.step.Name)
			if len(bf.Reasons) == 0 {
				log.Warningf("No reasons for step failure: %s", alr.Key)
				bf.Reasons = append(bf.Reasons, messages.Reason{
					Step: f.step.Name,
					URL:  f.URL(),
				})
			}

			alr.Extension = bf

			rs <- res{
				f:   f,
				a:   &alr,
				err: nil,
			}
		}(failure)
	}

	for range scannedFailures {
		r := <-rs
		if r.a != nil {
			ret = append(ret, *r.a)
		}
	}

	return ret, nil
}
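
The alertKey helper used above is not defined in these examples. A minimal sketch, assuming it follows the same "master.builder.suffix" convention as the keys built in builderAlerts in Example #1:

// Hypothetical sketch (alertKey's real definition is not shown here).
func alertKey(master, builder, step string) string {
	return fmt.Sprintf("%s.%s.%s", master, builder, step)
}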
Example #3
// stepFailureAlerts returns alerts generated from step failures. It applies filtering
// logic specified in the gatekeeper config to ignore some failures.
func (a *Analyzer) stepFailureAlerts(failures []stepFailure) ([]messages.Alert, error) {
	ret := []messages.Alert{}
	type res struct {
		f   stepFailure
		a   *messages.Alert
		err error
	}

	// Might not need full capacity buffer, since some failures are ignored below.
	rs := make(chan res, len(failures))

	scannedFailures := []stepFailure{}
	for _, failure := range failures {
		// goroutine/channel because the reasonsForFailure call potentially
		// blocks on IO.
		if failure.step.Name == "steps" {
			// Check the step results: a leading result of resInfraFailure ([4]) means this is
			// a purple (infra) failure, which should go to infra/trooper.
			log.Infof("steps results: %+v", failure.step)
			if len(failure.step.Results) > 0 {
				if r, ok := failure.step.Results[0].(float64); ok && r == resInfraFailure {
					// TODO: Create a trooper alert about this.
					log.Errorf("INFRA FAILURE: %+v", failure)
				}
			}
			// The actual breaking step will appear later.
			continue
		}

		// Check the gatekeeper configs to see if this is ignorable.
		if a.excludeFailure(failure.masterName, failure.builderName, failure.step.Name) {
			continue
		}

		scannedFailures = append(scannedFailures, failure)
		go func(f stepFailure) {
			alr := messages.Alert{
				Title: fmt.Sprintf("Builder step failure: %s.%s", f.masterName, f.builderName),
				Time:  messages.EpochTime(a.Now().Unix()),
				Type:  "buildfailure",
			}

			regRanges := []messages.RegressionRange{}
			revsByRepo := map[string][]string{}

			for _, change := range f.build.SourceStamp.Changes {
				// Check change.Comments for text like
				// "Cr-Commit-Position: refs/heads/master@{#330158}" to pick out commit positions
				// from git commits (see the sketch after this function).
				revsByRepo[change.Repository] = append(revsByRepo[change.Repository], change.Revision)
			}
			for repo, revs := range revsByRepo {
				regRanges = append(regRanges, messages.RegressionRange{
					Repo:      repo,
					Revisions: revs,
				})
			}

			// If the builder has been failing on the same step for multiple builds in a row,
			// we should have only one alert but indicate the range of builds affected.
			// These are set in FirstFailure and LatestFailure.
			bf := messages.BuildFailure{
				// FIXME: group builders?
				Builders: []messages.AlertedBuilder{
					{
						Name:          f.builderName,
						URL:           client.BuilderURL(f.masterName, f.builderName),
						StartTime:     f.build.CreatedTimestamp,
						FirstFailure:  f.build.Number,
						LatestFailure: f.build.Number,
					},
				},
				TreeCloser:       a.wouldCloseTree(f.masterName, f.builderName, f.step.Name),
				RegressionRanges: regRanges,
			}

			reasons := a.reasonsForFailure(f)
			for _, r := range reasons {
				bf.Reasons = append(bf.Reasons, messages.Reason{
					TestName: r,
					Step:     f.step.Name,
					URL:      f.URL(),
				})
			}

			alr.Key = alertKey(f.masterName, f.builderName, f.step.Name)
			if len(bf.Reasons) == 0 {
				log.Warningf("No reasons for step failure: %s", alr.Key)
				bf.Reasons = append(bf.Reasons, messages.Reason{
					Step: f.step.Name,
					URL:  f.URL(),
				})
			}

			alr.Extension = bf

			rs <- res{
				f:   f,
				a:   &alr,
				err: nil,
			}
		}(failure)
	}

	for range scannedFailures {
		r := <-rs
		if r.a != nil {
			ret = append(ret, *r.a)
		}
	}

	return ret, nil
}
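
The comment in the Changes loop above suggests picking commit positions out of change.Comments. A sketch of one way to do that; the regexp and helper name are assumptions rather than part of the original code, and the "regexp" package is assumed to be imported:

// Hypothetical helper: pull a "Cr-Commit-Position: refs/heads/master@{#330158}"
// footer out of a commit message and return the ref and numeric position.
var commitPosRE = regexp.MustCompile(`Cr-Commit-Position: (\S+)@\{#(\d+)\}`)

func commitPosFromComments(comments string) (ref, pos string, ok bool) {
	m := commitPosRE.FindStringSubmatch(comments)
	if m == nil {
		return "", "", false
	}
	return m[1], m[2], true
}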