Example #1
0
// MasterAlerts returns alerts generated from the master.
func (a *Analyzer) MasterAlerts(master string, be *messages.BuildExtract) []messages.Alert {
	alerts := []messages.Alert{}

	// Copied logic from builder_messages.
	// No created_timestamp should be a warning sign, no?
	if be.CreatedTimestamp == messages.EpochTime(0) {
		return alerts
	}

	elapsed := a.Now().Sub(be.CreatedTimestamp.Time())
	if elapsed < 0 {
		// Add this to the alerts returned, rather than just log it?
		log.Errorf("Master %s timestamp is newer than current time (%s): %s old.", master, a.Now(), elapsed)
	}
	if elapsed > a.StaleMasterThreshold {
		alerts = append(alerts, messages.Alert{
			Key:       fmt.Sprintf("stale master: %v", master),
			Title:     fmt.Sprintf("Stale %s master data", master),
			Body:      fmt.Sprintf("%s elapsed since last update.", elapsed),
			StartTime: messages.TimeToEpochTime(be.CreatedTimestamp.Time()),
			Severity:  staleMasterSev,
			Time:      messages.TimeToEpochTime(a.Now()),
			Links:     []messages.Link{{"Master", client.MasterURL(master)}},
			// No type or extension for now.
		})
	}

	return alerts
}
Example #2
0
// latestBuildStep returns the latest build step name and update time, and an error
// if there were any errors.
func (a *Analyzer) latestBuildStep(b *messages.Build) (string, messages.EpochTime, error) {
	// A build with no steps at all is an error; report "now" as the update time.
	if len(b.Steps) == 0 {
		return "", messages.TimeToEpochTime(a.Now()), errNoBuildSteps
	}
	// A non-zero second entry in the build's Times means the whole build
	// finished; report the synthetic completed-run step at that time.
	if len(b.Times) > 1 && b.Times[1] != 0 {
		return StepCompletedRun, b.Times[1], nil
	}

	// Otherwise scan the steps for the most recent activity, preferring a
	// step's completion time over its start time.
	var (
		latestName   string
		latestUpdate messages.EpochTime
	)
	for _, step := range b.Steps {
		switch {
		case len(step.Times) > 1 && step.Times[1] > latestUpdate:
			// Step is done.
			latestName, latestUpdate = step.Name, step.Times[1]
		case len(step.Times) > 0 && step.Times[0] > latestUpdate:
			// Step has started.
			latestName, latestUpdate = step.Name, step.Times[0]
		}
	}
	return latestName, latestUpdate, nil
}
Example #3
0
// TestLatestBuildStep exercises Analyzer.latestBuildStep over a table of
// builds: no steps at all, a single finished step, a single started step,
// multiple steps where either the start or completion time wins, and a
// completed build (which short-circuits to StepCompletedRun).
func TestLatestBuildStep(t *testing.T) {
	tests := []struct {
		name       string
		b          messages.Build
		wantStep   string
		wantUpdate messages.EpochTime
		wantErr    error
	}{
		{
			name:    "blank",
			wantErr: errNoBuildSteps,
		},
		{
			name: "done time is latest",
			b: messages.Build{
				Steps: []messages.Step{
					{
						Name: "done step",
						Times: []messages.EpochTime{
							messages.TimeToEpochTime(time.Unix(6, 0)),
							messages.TimeToEpochTime(time.Unix(42, 0)),
						},
					},
				},
			},
			wantStep:   "done step",
			wantUpdate: messages.TimeToEpochTime(time.Unix(42, 0)),
		},
		{
			name: "started time is latest",
			b: messages.Build{
				Steps: []messages.Step{
					{
						Name: "start step",
						Times: []messages.EpochTime{
							messages.TimeToEpochTime(time.Unix(42, 0)),
							messages.TimeToEpochTime(time.Unix(0, 0)),
						},
					},
				},
			},
			wantStep:   "start step",
			wantUpdate: messages.TimeToEpochTime(time.Unix(42, 0)),
		},
		{
			name: "started time is latest, multiple steps",
			b: messages.Build{
				Steps: []messages.Step{
					{
						Name: "start step",
						Times: []messages.EpochTime{
							messages.TimeToEpochTime(time.Unix(6, 0)),
							messages.TimeToEpochTime(time.Unix(7, 0)),
						},
					},
					{
						Name: "second step",
						Times: []messages.EpochTime{
							messages.TimeToEpochTime(time.Unix(42, 0)),
							messages.TimeToEpochTime(time.Unix(0, 0)),
						},
					},
				},
			},
			wantStep:   "second step",
			wantUpdate: messages.TimeToEpochTime(time.Unix(42, 0)),
		},
		{
			name: "done time is latest, multiple steps",
			b: messages.Build{
				Steps: []messages.Step{
					{
						Name: "start step",
						Times: []messages.EpochTime{
							messages.TimeToEpochTime(time.Unix(0, 0)),
							messages.TimeToEpochTime(time.Unix(6, 0)),
						},
					},
					{
						Name: "second step",
						Times: []messages.EpochTime{
							messages.TimeToEpochTime(time.Unix(7, 0)),
							messages.TimeToEpochTime(time.Unix(42, 0)),
						},
					},
				},
			},
			wantStep:   "second step",
			wantUpdate: messages.TimeToEpochTime(time.Unix(42, 0)),
		},
		{
			// A non-zero build end time should win over any step times.
			name: "build is done",
			b: messages.Build{
				Times: []messages.EpochTime{
					messages.TimeToEpochTime(time.Unix(0, 0)),
					messages.TimeToEpochTime(time.Unix(42, 0)),
				},
				Steps: []messages.Step{
					{
						Name: "start step",
						Times: []messages.EpochTime{
							messages.TimeToEpochTime(time.Unix(0, 0)),
							messages.TimeToEpochTime(time.Unix(0, 0)),
						},
					},
					{
						Name: "second step",
						Times: []messages.EpochTime{
							messages.TimeToEpochTime(time.Unix(6, 0)),
							messages.TimeToEpochTime(time.Unix(7, 0)),
						},
					},
				},
			},
			wantStep:   StepCompletedRun,
			wantUpdate: messages.TimeToEpochTime(time.Unix(42, 0)),
		},
	}

	a := New(&mockReader{}, 0, 10)
	a.now = fakeNow(time.Unix(0, 0))
	for _, test := range tests {
		gotStep, gotUpdate, gotErr := a.latestBuildStep(&test.b)
		if gotStep != test.wantStep {
			t.Errorf("%s failed. Got %q, want %q.", test.name, gotStep, test.wantStep)
		}
		if gotUpdate != test.wantUpdate {
			t.Errorf("%s failed. Got %v, want %v.", test.name, gotUpdate, test.wantUpdate)
		}
		// Use %v for errors: %s renders a nil error as "%!s(<nil>)", while
		// %v prints the readable "<nil>".
		if gotErr != test.wantErr {
			t.Errorf("%s failed. Got %v, want %v.", test.name, gotErr, test.wantErr)
		}
	}
}
Example #4
0
// TestMasterAlerts covers the empty, fresh, stale, and future-timestamped
// master cases for Analyzer.MasterAlerts.
func TestMasterAlerts(t *testing.T) {
	cases := []struct {
		name   string
		master string
		be     messages.BuildExtract
		t      time.Time
		want   []messages.Alert
	}{
		{
			master: "fake-empty",
			name:   "empty",
			want:   []messages.Alert{},
		},
		{
			master: "fake-not-stale",
			name:   "Not stale master",
			t:      time.Unix(100, 0),
			be: messages.BuildExtract{
				CreatedTimestamp: messages.EpochTime(100),
			},
			want: []messages.Alert{},
		},
		{
			master: "fake.master",
			name:   "Stale master",
			t:      time.Unix(100, 0).Add(20 * time.Minute),
			be: messages.BuildExtract{
				CreatedTimestamp: messages.EpochTime(100),
			},
			want: []messages.Alert{
				{
					Key:       "stale master: fake.master",
					Title:     "Stale fake.master master data",
					Body:      fmt.Sprintf("%s elapsed since last update.", 20*time.Minute),
					StartTime: messages.EpochTime(100),
					Time:      messages.TimeToEpochTime(time.Unix(100, 0).Add(20 * time.Minute)),
					Links:     []messages.Link{{"Master", client.MasterURL("fake.master")}},
				},
			},
		},
		{
			master: "fake.master",
			name:   "Future master",
			t:      time.Unix(100, 0),
			be: messages.BuildExtract{
				CreatedTimestamp: messages.EpochTime(110),
			},
			want: []messages.Alert{},
		},
	}

	a := New(&mockReader{}, 0, 10)

	for _, tc := range cases {
		// Pin the analyzer's clock to the case's reference time.
		a.now = fakeNow(tc.t)
		got := a.MasterAlerts(tc.master, &tc.be)
		if !reflect.DeepEqual(got, tc.want) {
			t.Errorf("%s failed. Got %+v, want: %+v", tc.name, got, tc.want)
		}
	}
}
Example #5
0
// TestLittleBBuilderAlerts drives Analyzer.builderAlerts with a mockReader
// serving canned builds, checking the no-builds error case, a healthy
// builder, and a hung-builder alert for a build stuck in one step.
func TestLittleBBuilderAlerts(t *testing.T) {
	tests := []struct {
		name       string
		master     string
		builder    string
		b          messages.Builder
		builds     map[string]*messages.Build
		time       time.Time
		wantAlerts []messages.Alert
		wantErrs   []error
	}{
		{
			// No CachedBuilds at all: builderAlerts reports errNoRecentBuilds.
			name:     "empty",
			wantErrs: []error{errNoRecentBuilds},
		},
		{
			// One completed build (build Times[1] set): no alerts expected.
			name:    "builders ok",
			master:  "fake.master",
			builder: "fake.builder",
			builds: map[string]*messages.Build{
				"fake.master/fake.builder/0": {
					Steps: []messages.Step{
						{
							Name: "fake_step",
							Times: []messages.EpochTime{
								messages.TimeToEpochTime(time.Unix(10, 0)),
								messages.TimeToEpochTime(time.Unix(100, 0)),
							},
						},
					},
					Times: []messages.EpochTime{
						messages.TimeToEpochTime(time.Unix(10, 0)),
						messages.TimeToEpochTime(time.Unix(100, 0)),
					},
				},
			},
			b: messages.Builder{
				BuilderName:  "fake.builder",
				CachedBuilds: []int64{0},
			},
			wantAlerts: []messages.Alert{},
			wantErrs:   []error{},
		},
		{
			// Build 1 is still running (step end time 0, started at t=10s)
			// while "now" is 4h, so the builder is hung: elapsed since the
			// last step update is 3h59m50s, past the threshold in the alert
			// body below.
			name:    "builder building for too long",
			master:  "fake.master",
			builder: "fake.builder",
			builds: map[string]*messages.Build{
				"fake.master/fake.builder/0": {
					Number: 0,
					Times: []messages.EpochTime{
						messages.TimeToEpochTime(time.Unix(10, 0)),
						messages.TimeToEpochTime(time.Unix(100, 0)),
					},
					Steps: []messages.Step{
						{
							Name: "fake_step",
							Times: []messages.EpochTime{
								messages.TimeToEpochTime(time.Unix(10, 0)),
								messages.TimeToEpochTime(time.Unix(100, 0)),
							},
						},
					},
				},
				"fake.master/fake.builder/1": {
					Number: 1,
					Steps: []messages.Step{
						{
							Name: "fake_step",
							Times: []messages.EpochTime{
								messages.TimeToEpochTime(time.Unix(10, 0)),
								messages.TimeToEpochTime(time.Unix(0, 0)),
							},
						},
					},
				},
			},
			b: messages.Builder{
				State:        messages.StateBuilding,
				BuilderName:  "fake.builder",
				CachedBuilds: []int64{0, 1},
			},
			time: time.Unix(0, 0).Add(4 * time.Hour),
			wantAlerts: []messages.Alert{
				{
					Key:      "fake.master.fake.builder.hung",
					Title:    "fake.master.fake.builder is hung in step fake_step.",
					Time:     messages.TimeToEpochTime(time.Unix(0, 0).Add(4 * time.Hour)),
					Body:     "fake.master.fake.builder has been building for 3h59m50s (last step update 1970-01-01 00:00:10 +0000 UTC), past the alerting threshold of 3h0m0s",
					Severity: 1,
					Links: []messages.Link{
						{Title: "Builder", Href: "https://build.chromium.org/p/fake.master/builders/fake.builder"},
						{Title: "Last build", Href: "https://build.chromium.org/p/fake.master/builders/fake.builder/builds/1"},
						{Title: "Last build step", Href: "https://build.chromium.org/p/fake.master/builders/fake.builder/builds/1/steps/fake_step"},
					},
				},
			},
			wantErrs: []error{},
		},
	}

	a := New(nil, 0, 10)

	for _, test := range tests {
		// Fix the clock and swap in the per-case mock build data.
		a.now = fakeNow(test.time)
		a.Reader = mockReader{
			builds: test.builds,
		}
		gotAlerts, gotErrs := a.builderAlerts(test.master, test.builder, &test.b)
		if !reflect.DeepEqual(gotAlerts, test.wantAlerts) {
			t.Errorf("%s failed. Got:\n%+v, want:\n%+v", test.name, gotAlerts, test.wantAlerts)
		}
		if !reflect.DeepEqual(gotErrs, test.wantErrs) {
			t.Errorf("%s failed. Got %+v, want: %+v", test.name, gotErrs, test.wantErrs)
		}
	}
}
Example #6
0
// main parses flags and gatekeeper configuration, then runs a polling loop
// that fetches build extracts for each configured tree, analyzes them for
// alertable conditions, and publishes the resulting alerts either to local
// JSON files or to a remote alerts endpoint, until the configured duration
// elapses or the loop exceeds its error budget.
func main() {
	start := time.Now()
	flag.Parse()

	duration, err := time.ParseDuration(*durationStr)
	if err != nil {
		log.Errorf("Error parsing duration: %v", err)
		os.Exit(1)
	}

	cycle, err := time.ParseDuration(*cycleStr)
	if err != nil {
		log.Errorf("Error parsing cycle: %v", err)
		os.Exit(1)
	}

	err = readJSONFile(*gatekeeperJSON, &gk)
	if err != nil {
		log.Errorf("Error reading gatekeeper json: %v", err)
		os.Exit(1)
	}

	err = readJSONFile(*gatekeeperTreesJSON, &gkt)
	if err != nil {
		// Message names the trees file explicitly so failures aren't
		// misattributed to gatekeeper.json (previously a copy-paste of the
		// error above).
		log.Errorf("Error reading gatekeeper trees json: %v", err)
		os.Exit(1)
	}

	// Snapshot records live reads for later replay; the two are exclusive.
	if *snapshot != "" && *replay != "" {
		log.Errorf("Cannot use snapshot and replay flags at the same time.")
		os.Exit(1)
	}

	r := client.NewReader()
	if *snapshot != "" {
		r = client.NewSnapshot(r, *snapshot)
	}

	if *replay != "" {
		r = client.NewReplay(*replay)
	}

	a := analyzer.New(r, 5, 100)

	for masterURL, masterCfgs := range gk.Masters {
		if len(masterCfgs) != 1 {
			log.Errorf("Multiple configs for master: %s", masterURL)
		}
		masterName := masterFromURL(masterURL)
		a.MasterCfgs[masterName] = masterCfgs[0]
	}

	a.MasterOnly = *masterOnly
	a.BuilderOnly = *builderOnly
	a.BuildOnly = *buildOnly

	// Determine the set of trees to check: an explicit comma-separated list,
	// or every tree in the gatekeeper trees config.
	trees := map[string]bool{}
	if *treesOnly != "" {
		for _, treeOnly := range strings.Split(*treesOnly, ",") {
			trees[treeOnly] = true
		}
	} else {
		for treeName := range gkt {
			trees[treeName] = true
		}
	}

	for tree := range trees {
		if _, ok := gkt[tree]; !ok {
			log.Errorf("Unrecognized tree name: %s", tree)
			os.Exit(1)
		}
	}

	// This is the polling/analysis/alert posting function, which will run in a loop until
	// a timeout or max errors is reached.
	f := func(ctx context.Context) error {
		// Buffer both channels to the worker count so every goroutine can
		// complete its single send even when the collector below returns on
		// the first error; unbuffered channels would leave the remaining
		// workers blocked forever (a goroutine leak per cycle).
		done := make(chan interface{}, len(trees))
		errs := make(chan error, len(trees))
		for treeName := range trees {
			go func(tree string) {
				log.Infof("Checking tree: %s", tree)
				masterNames := []string{}
				t := gkt[tree]
				for _, url := range t.Masters {
					masterNames = append(masterNames, masterFromURL(url))
				}

				// TODO(seanmccullough): Plumb ctx through the rest of these calls.
				bes := fetchBuildExtracts(a.Reader, masterNames)
				log.Infof("Build Extracts read: %d", len(bes))

				alerts := &messages.Alerts{}
				for masterName, be := range bes {
					alerts.Alerts = append(alerts.Alerts, analyzeBuildExtract(a, masterName, be)...)
				}
				alerts.Timestamp = messages.TimeToEpochTime(time.Now())

				if *alertsBaseURL == "" {
					log.Infof("No data_url provided. Writing to %s-alerts.json", tree)

					abytes, err := json.MarshalIndent(alerts, "", "\t")
					if err != nil {
						log.Errorf("Couldn't marshal alerts json: %v", err)
						errs <- err
						return
					}

					if err := ioutil.WriteFile(fmt.Sprintf("%s-alerts.json", tree), abytes, 0644); err != nil {
						log.Errorf("Couldn't write to alerts.json: %v", err)
						errs <- err
						return
					}
				} else {
					alertsURL := fmt.Sprintf("%s/%s", *alertsBaseURL, tree)
					w := client.NewWriter(alertsURL)
					log.Infof("Posting alerts to %s", alertsURL)
					err := w.PostAlerts(alerts)
					if err != nil {
						log.Errorf("Couldn't post alerts: %v", err)
						errs <- err
						return
					}
				}

				log.Infof("Filtered failures: %v", filteredFailures)
				a.Reader.DumpStats()
				log.Infof("Elapsed time: %v", time.Since(start))
				done <- nil
			}(treeName)
		}

		// Collect one completion or error per tree; bail on the first error
		// (the buffered channels above keep the other workers from leaking).
		for range trees {
			select {
			case err := <-errs:
				return err
			case <-done:
			}
		}
		return nil
	}

	ctx, cancel := context.WithTimeout(context.Background(), duration)
	// NOTE(review): if logging.Set returns a derived context, the result is
	// being discarded here — confirm against the logging package's API.
	logging.Set(ctx, log)
	defer cancel()
	loopResults := looper.Run(ctx, f, cycle, *maxErrs, clock.GetSystemClock())

	if !loopResults.Success {
		log.Errorf("Failed to run loop, %v errors", loopResults.Errs)
		os.Exit(1)
	}
}
Example #7
0
// builderAlerts returns alerts for a single builder on the given master,
// along with any errors hit while inspecting it. It checks for hung,
// offline, and over-queued-idle builders based on the builder's state and
// the time since its latest build step updated, then appends step-failure
// alerts for the recent completed builds.
//
// TODO: also check the build slaves to see if there are alerts for currently running builds that
// haven't shown up in CBE yet.
func (a *Analyzer) builderAlerts(masterName string, builderName string, b *messages.Builder) ([]messages.Alert, []error) {
	if len(b.CachedBuilds) == 0 {
		// TODO: Make an alert for this?
		return nil, []error{errNoRecentBuilds}
	}

	// Cap the examined builds at MaxRecentBuilds, keeping the front of the
	// sorted list.
	recentBuildIDs := b.CachedBuilds
	// Should be a *reverse* sort.
	// NOTE(review): assumes buildNums' Less orders newest-first so the
	// truncation below keeps the most recent builds — confirm against the
	// buildNums implementation.
	sort.Sort(buildNums(recentBuildIDs))
	if len(recentBuildIDs) > a.MaxRecentBuilds {
		recentBuildIDs = recentBuildIDs[:a.MaxRecentBuilds]
	}

	alerts, errs := []messages.Alert{}, []error{}

	lastBuild, lastCompletedBuild, err := a.lastBuilds(masterName, builderName, recentBuildIDs)
	if err != nil {
		errs = append(errs, err)
		return nil, errs
	}

	// Examining only the latest build is probably suboptimal since if it's still in progress it might
	// not have hit a step that is going to fail and has failed repeatedly for the last few builds.
	// AKA "Reliable failures".  TODO: Identify "Reliable failures"
	lastStep, lastUpdated, err := a.latestBuildStep(lastBuild)
	if err != nil {
		errs = append(errs, fmt.Errorf("Couldn't get latest build step for %s.%s: %v", masterName, builderName, err))
		return alerts, errs
	}
	// Time since the latest step update; compared against the per-state
	// thresholds below.
	elapsed := a.Now().Sub(lastUpdated.Time())
	// Links shared by all builder-level alerts generated here.
	links := []messages.Link{
		{"Builder", client.BuilderURL(masterName, builderName)},
		{"Last build", client.BuildURL(masterName, builderName, lastBuild.Number)},
		{"Last build step", client.StepURL(masterName, builderName, lastStep, lastBuild.Number)},
	}

	// State-specific alerting: hung while building, offline too long, or
	// idle with too many pending builds.
	switch b.State {
	case messages.StateBuilding:
		// Skip the hung check when the last "step" is the whole completed
		// run, since that means the build itself finished.
		if elapsed > a.HungBuilderThresh && lastStep != StepCompletedRun {
			alerts = append(alerts, messages.Alert{
				Key:      fmt.Sprintf("%s.%s.hung", masterName, builderName),
				Title:    fmt.Sprintf("%s.%s is hung in step %s.", masterName, builderName, lastStep),
				Body:     fmt.Sprintf("%s.%s has been building for %v (last step update %s), past the alerting threshold of %v", masterName, builderName, elapsed, lastUpdated.Time(), a.HungBuilderThresh),
				Severity: hungBuilderSev,
				Time:     messages.TimeToEpochTime(a.Now()),
				Links:    links,
			})
			// Note, just because it's building doesn't mean it's in a good state. If the last N builds
			// all failed (for some large N) then this might still be alertable.
		}
	case messages.StateOffline:
		if elapsed > a.OfflineBuilderThresh {
			alerts = append(alerts, messages.Alert{
				Key:      fmt.Sprintf("%s.%s.offline", masterName, builderName),
				Title:    fmt.Sprintf("%s.%s is offline.", masterName, builderName),
				Body:     fmt.Sprintf("%s.%s has been offline for %v (last step update %s %v), past the alerting threshold of %v", masterName, builderName, elapsed, lastUpdated.Time(), float64(lastUpdated), a.OfflineBuilderThresh),
				Severity: offlineBuilderSev,
				Time:     messages.TimeToEpochTime(a.Now()),
				Links:    links,
			})
		}
	case messages.StateIdle:
		if b.PendingBuilds > a.IdleBuilderCountThresh {
			alerts = append(alerts, messages.Alert{
				Key:      fmt.Sprintf("%s.%s.idle", masterName, builderName),
				Title:    fmt.Sprintf("%s.%s is idle with too many pending builds.", masterName, builderName),
				Body:     fmt.Sprintf("%s.%s is idle with %d pending builds, past the alerting threshold of %d", masterName, builderName, b.PendingBuilds, a.IdleBuilderCountThresh),
				Severity: idleBuilderSev,
				Time:     messages.TimeToEpochTime(a.Now()),
				Links:    links,
			})
		}
	default:
		log.Errorf("Unknown %s.%s builder state: %s", masterName, builderName, b.State)
	}

	// Check for alerts on the most recent complete build
	log.Infof("Checking %d most recent builds for alertable step failures: %s/%s", len(recentBuildIDs), masterName, builderName)
	as, es := a.builderStepAlerts(masterName, builderName, []int64{lastCompletedBuild.Number})

	// Only if the latest completed build produced step alerts do we widen
	// the query to every recent build from that one onward.
	if len(as) > 0 {
		// Find the index of the last completed build within recentBuildIDs
		// so the slice below starts at it.
		mostRecentComplete := 0
		for i, id := range recentBuildIDs {
			if id == lastCompletedBuild.Number {
				mostRecentComplete = i
			}
		}
		as, es = a.builderStepAlerts(masterName, builderName, recentBuildIDs[mostRecentComplete:])
		alerts = append(alerts, as...)
		errs = append(errs, es...)
	}

	return alerts, errs
}