func EmitRunOnceStates(datadogClient *datadog.Client, store *etcdstoreadapter.ETCDStoreAdapter, etcdMachines []string) { for { now := time.Now().Unix() all, err := store.ListRecursively("/v1/run_once") if err != nil { log.Println("failed to get all RunOnces:", err) time.Sleep(1 * time.Second) continue } metrics := []datadog.Metric{} for i, etcdMachine := range etcdMachines { stats := map[string]int{} resp, err := http.Get(urljoiner.Join(etcdMachine, "/v2/stats/store")) if err != nil { log.Println("failed to fetch stats:", err) continue } data, _ := ioutil.ReadAll(resp.Body) resp.Body.Close() json.Unmarshal(data, &stats) metrics = append(metrics, datadog.Metric{ Metric: fmt.Sprintf("etcd_watchers_%d", i), Points: []datadog.DataPoint{ datadog.DataPoint( [2]float64{ float64(now), float64(stats["watchers"]), }, ), }, }) } for _, state := range []string{"pending", "claimed", "running", "completed", "resolving"} { runOnces, found := all.Lookup(state) if !found { log.Println("failed to find RunOnces in", state, "state") time.Sleep(1 * time.Second) continue } metrics = append(metrics, datadog.Metric{ Metric: "diego_runonce_" + state, Points: []datadog.DataPoint{ datadog.DataPoint( [2]float64{ float64(now), float64(len(runOnces.ChildNodes)), }, ), }, }) } executors, err := store.ListRecursively("/v1/executor") if err != nil { log.Println("failed to get all Executors:", err) time.Sleep(1 * time.Second) continue } metrics = append(metrics, datadog.Metric{ Metric: "executors_maintaining_presence", Points: []datadog.DataPoint{ datadog.DataPoint( [2]float64{ float64(now), float64(len(executors.ChildNodes)), }, ), }, }) err = datadogClient.PostMetrics(metrics) if err != nil { log.Println("failed to post metrics:", err) } time.Sleep(1 * time.Second) } }
func RunonceStampede(bbs *bbs.BBS, datadogClient *datadog.Client, runOnce *models.RunOnce, runOnceCount int) { completed, stop, errs := bbs.WatchForCompletedRunOnce() startAll := time.Now() if datadogClient != nil { event, err := datadogClient.PostEvent(&datadog.Event{ Title: "diego_runonce_stampede_start", Text: "started the stampede", Tags: []string{fmt.Sprintf("count:%d", runOnceCount)}, }) log.Println("posted start event:", event, err) defer func() { event, err := datadogClient.PostEvent(&datadog.Event{ Title: "diego_runonce_stampede_stop", Text: "stopped the stampede", Tags: []string{ fmt.Sprintf("count:%d", runOnceCount), fmt.Sprintf("duration:%s", time.Since(startAll)), }, }) log.Println("posted stop event:", event, err) }() } startTimes := make(chan runOnceTime, runOnceCount) go func() { for i := 0; i < runOnceCount; i++ { go createRunOnce(runOnce, startTimes, bbs) } }() seenRunOnces := 0 runOnceStartTimes := make(map[string]time.Time) waitGroup := &sync.WaitGroup{} timer := time.After(100 * time.Minute) OUTER: for { if seenRunOnces == runOnceCount { timer = time.After(30 * time.Second) } select { case startTime := <-startTimes: runOnceStartTimes[startTime.guid] = startTime.startTime case completedRunOnce := <-completed: startedAt, found := runOnceStartTimes[completedRunOnce.Guid] if !found { continue } log.Println("done:", seenRunOnces, RunOnceResult{ Guid: completedRunOnce.Guid, Duration: time.Since(startedAt), Failed: completedRunOnce.Failed, }) seenRunOnces++ waitGroup.Add(1) go func() { log.Println("deleting", completedRunOnce.Guid) err := bbs.ResolveRunOnce(completedRunOnce) if err != nil { log.Println("failed to resolve run once:", completedRunOnce.Guid, err) } else { log.Println("deleted:", completedRunOnce.Guid) } waitGroup.Done() }() case err := <-errs: log.Println("watch error:", err) case <-timer: break OUTER } } waitGroup.Wait() close(stop) }