func buildsJsonHandler(w http.ResponseWriter, r *http.Request) { defer timer.New("buildsHandler").Stop() w.Header().Set("Content-Type", "application/json") start, err := getIntParam("start", r) if err != nil { util.ReportError(w, r, err, fmt.Sprintf("Invalid value for parameter \"start\": %v", err)) return } end, err := getIntParam("end", r) if err != nil { util.ReportError(w, r, err, fmt.Sprintf("Invalid value for parameter \"end\": %v", err)) return } var startTime time.Time var endTime time.Time if end == nil { endTime = time.Now() } else { endTime = time.Unix(int64(*end), 0) } if start == nil { startTime = endTime.AddDate(0, 0, -1) } else { startTime = time.Unix(int64(*start), 0) } // Fetch the builds. builds, err := buildbot.GetBuildsFromDateRange(startTime, endTime) if err != nil { util.ReportError(w, r, err, fmt.Sprintf("Failed to load builds: %v", err)) return } // Shrink the builds. // TODO(borenet): Can we share build-shrinking code with the main status // page? // TinyBuildStep is a struct containing a small subset of a BuildStep's fields. type TinyBuildStep struct { Name string Started float64 Finished float64 Results int } // TinyBuild is a struct containing a small subset of a Build's fields. type TinyBuild struct { Builder string BuildSlave string Master string Number int Properties [][]interface{} `json:"properties"` Started float64 Finished float64 Results int Steps []*TinyBuildStep } rv := make([]*TinyBuild, 0, len(builds)) for _, b := range builds { steps := make([]*TinyBuildStep, 0, len(b.Steps)) for _, s := range b.Steps { steps = append(steps, &TinyBuildStep{ Name: s.Name, Started: s.Started, Finished: s.Finished, Results: s.Results, }) } rv = append(rv, &TinyBuild{ Builder: b.Builder, BuildSlave: b.BuildSlave, Master: b.Master, Number: b.Number, Properties: b.Properties, Started: b.Started, Finished: b.Finished, Results: b.Results, Steps: steps, }) } defer timer.New("buildsHandler_encode").Stop() if err := json.NewEncoder(w).Encode(rv); err != nil { glog.Errorf("Failed to write or encode output: %s", err) return } }
func StartAlertRoutines(am *alerting.AlertManager, tickInterval time.Duration, c *influxdb.Client) { emailAction, err := alerting.ParseAction("Email([email protected])") if err != nil { glog.Fatal(err) } actions := []alerting.Action{emailAction} // Disconnected buildslaves. go func() { seriesTmpl := "buildbot.buildslaves.%s.connected" re := regexp.MustCompile("[^A-Za-z0-9]+") for _ = range time.Tick(tickInterval) { glog.Info("Loading buildslave data.") slaves, err := buildbot.GetBuildSlaves() if err != nil { glog.Error(err) continue } for masterName, m := range slaves { for _, s := range m { if util.In(s.Name, BUILDSLAVE_OFFLINE_BLACKLIST) { continue } v := int64(0) if s.Connected { v = int64(1) } metric := fmt.Sprintf(seriesTmpl, re.ReplaceAllString(s.Name, "_")) metrics.GetOrRegisterGauge(metric, metrics.DefaultRegistry).Update(v) if !s.Connected { // This buildslave is offline. Figure out which one it is. if err := am.AddAlert(&alerting.Alert{ Name: fmt.Sprintf("Buildslave %s offline", s.Name), Category: alerting.INFRA_ALERT, Message: fmt.Sprintf(BUILDSLAVE_OFFLINE, s.Name, masterName, s.Name, s.Name, s.Name), Nag: int64(time.Hour), AutoDismiss: int64(2 * tickInterval), Actions: actions, }); err != nil { glog.Error(err) } } } } } }() // AutoRoll failure. go func() { getDepsRollStatus := func() (*autoroller.AutoRollStatus, error) { resp, err := http.Get(autoroll.AUTOROLL_STATUS_URL) if err != nil { return nil, err } defer util.Close(resp.Body) var status autoroller.AutoRollStatus if err := json.NewDecoder(resp.Body).Decode(&status); err != nil { return nil, err } return &status, nil } for _ = range time.Tick(time.Minute) { glog.Infof("Searching for DEPS rolls.") status, err := getDepsRollStatus() if err != nil { util.LogErr(fmt.Errorf("Failed to search for DEPS rolls: %v", err)) continue } activeAlert := am.ActiveAlert(AUTOROLL_ALERT_NAME) if status.LastRoll != nil { if status.LastRoll.Closed { if status.LastRoll.Succeeded() { if activeAlert != 0 { msg := fmt.Sprintf("Subsequent roll succeeded: %s/%d", autoroll.RIETVELD_URL, status.LastRoll.Issue) if err := am.Dismiss(activeAlert, alerting.USER_ALERTSERVER, msg); err != nil { util.LogErr(err) } } } else if status.LastRoll.Failed() { if err := am.AddAlert(&alerting.Alert{ Name: AUTOROLL_ALERT_NAME, Message: fmt.Sprintf("DEPS roll failed: %s/%d", autoroll.RIETVELD_URL, status.LastRoll.Issue), Nag: int64(3 * time.Hour), Actions: actions, }); err != nil { util.LogErr(err) } } } } } }() // Android device disconnects, hung buildslaves. go func() { // These builders are frequently slow. Ignore them when looking for hung buildslaves. hungSlavesIgnore := []string{ "Housekeeper-Nightly-RecreateSKPs_Canary", "Housekeeper-Weekly-RecreateSKPs", "Linux Builder", "Mac Builder", "Test-Ubuntu-GCC-GCE-CPU-AVX2-x86_64-Release-Valgrind", "Test-Ubuntu-GCC-ShuttleA-GPU-GTX550Ti-x86_64-Release-Valgrind", "Win Builder", } hangTimePeriod := 3 * time.Hour for _ = range time.Tick(tickInterval) { glog.Infof("Searching for hung buildslaves and disconnected Android devices.") builds, err := buildbot.GetUnfinishedBuilds() if err != nil { glog.Error(err) continue } for _, b := range builds { // Disconnected Android device? disconnectedAndroid := false if strings.Contains(b.Builder, "Android") && !strings.Contains(b.Builder, "Build") { for _, s := range b.Steps { if strings.Contains(s.Name, "wait for device") { // If "wait for device" has been running for 10 minutes, the device is probably offline. if s.Finished == 0 && time.Since(time.Unix(int64(s.Started), 0)) > 10*time.Minute { if err := am.AddAlert(&alerting.Alert{ Name: fmt.Sprintf("Android device disconnected (%s)", b.BuildSlave), Category: alerting.INFRA_ALERT, Message: fmt.Sprintf(ANDROID_DISCONNECT, b.BuildSlave, b.Master, b.Builder, b.Number, b.BuildSlave, b.BuildSlave), Nag: int64(3 * time.Hour), Actions: actions, }); err != nil { glog.Error(err) } disconnectedAndroid = true } } } } if !disconnectedAndroid && !util.ContainsAny(b.Builder, hungSlavesIgnore) { // Hung buildslave? for _, s := range b.Steps { if s.Name == "steps" { continue } // If the step has been running for over an hour, it's probably hung. if s.Finished == 0 && time.Since(time.Unix(int64(s.Started), 0)) > hangTimePeriod { if err := am.AddAlert(&alerting.Alert{ Name: fmt.Sprintf("Possibly hung buildslave (%s)", b.BuildSlave), Category: alerting.INFRA_ALERT, Message: fmt.Sprintf(HUNG_BUILDSLAVE, b.BuildSlave, hangTimePeriod.String(), b.Master, b.Builder, b.Number, b.BuildSlave, b.BuildSlave), Nag: int64(time.Hour), Actions: actions, AutoDismiss: int64(10 * tickInterval), }); err != nil { glog.Error(err) } } } } } } }() // Failed update_scripts. go func() { lastSearch := time.Now() for _ = range time.Tick(tickInterval) { glog.Infof("Searching for builds which failed update_scripts.") currentSearch := time.Now() builds, err := buildbot.GetBuildsFromDateRange(lastSearch, currentSearch) lastSearch = currentSearch if err != nil { glog.Error(err) continue } for _, b := range builds { for _, s := range b.Steps { if s.Name == "update_scripts" { if s.Results != 0 { if err := am.AddAlert(&alerting.Alert{ Name: "update_scripts failed", Category: alerting.INFRA_ALERT, Message: fmt.Sprintf(UPDATE_SCRIPTS, b.Builder, b.Master, b.Builder, b.Number, b.Builder, b.BuildSlave), Actions: actions, }); err != nil { glog.Error(err) } } break } } } } }() }