func monitorETCD(etcdAdapter *etcdstoreadapter.ETCDStoreAdapter) { out, err := os.Create(filepath.Join(outDir, "etcdstats.log")) if err != nil { logger.Fatal("etcd.log.creation.failure", err) } cleanup.Register(func() { out.Sync() }) go func() { ticker := time.NewTicker(time.Second) for { <-ticker.C t := time.Now() logger.Info("fetch.etcd.runonce.data") runOnceNodes, err := etcdAdapter.ListRecursively(Bbs.TaskSchemaRoot) if err != nil { logger.Info("fetch.etcd.runOnceNodes.error", err) } executorNode, err := etcdAdapter.ListRecursively(Bbs.ExecutorSchemaRoot) if err != nil { logger.Info("fetch.etcd.executorNode.error", err) } readTime := time.Since(t) d := etcdData{ Time: float64(time.Now().UnixNano()) / 1e9, RunningByExecutor: map[string]int{}, PresentExecutors: len(executorNode.ChildNodes), ReadTime: float64(readTime) / 1e9, } for _, node := range runOnceNodes.ChildNodes { runOnce, err := models.NewTaskFromJSON(node.Value) if err != nil { logger.Error("etcd.decode.runonce", err) continue } switch runOnce.State { case models.TaskStatePending: d.Pending++ case models.TaskStateClaimed: d.Claimed++ case models.TaskStateRunning: d.Running++ d.RunningByExecutor[runOnce.ExecutorID]++ case models.TaskStateCompleted: d.Completed++ } } logger.Info("fetched.etcd.runonce.data", time.Since(t), d.String()) out.Write(d.toJson()) out.Write([]byte("\n")) } }() }
func (h *Handler) syncTable(etcd *etcdstoreadapter.ETCDStoreAdapter, syncInterval time.Duration) { for { allNodes, _ := etcd.ListRecursively("/v1/routes") newTable := map[string]Route{} fanouts, _ := allNodes.Lookup("fanout") roundRobins, _ := allNodes.Lookup("round-robin") for _, host := range fanouts.ChildNodes { if len(host.ChildNodes) == 0 { continue } hostname := path.Base(host.Key) route := newTable[hostname] route.Dispatch = Fanout for _, endpoint := range host.ChildNodes { route.Endpoints = append( route.Endpoints, &Endpoint{Addr: path.Base(endpoint.Key)}, ) log.Println("registering", hostname, endpoint.Key) } newTable[hostname] = route } for _, host := range roundRobins.ChildNodes { if len(host.ChildNodes) == 0 { continue } hostname := path.Base(host.Key) route := newTable[hostname] route.Dispatch = RoundRobin for _, endpoint := range host.ChildNodes { route.Endpoints = append( route.Endpoints, &Endpoint{Addr: path.Base(endpoint.Key)}, ) log.Println("registering", hostname, endpoint.Key) } newTable[hostname] = route } h.Lock() h.table = newTable h.Unlock() time.Sleep(syncInterval) } }
func registerHandler(etcdAdapter *etcdstoreadapter.ETCDStoreAdapter, addr string, ready chan<- bool) error { node := storeadapter.StoreNode{ Key: "/v1/routes/round-robin/executor/" + addr, TTL: 60, } status, clearNode, err := etcdAdapter.MaintainNode(node) if err != nil { return err } tasks.Add(1) go func() { for { select { case locked, ok := <-status: if locked && ready != nil { ready <- true ready = nil } if !locked && ok { tasks.Done() logger.Fatal("maintain.route.fatal", map[string]interface{}{}) } if !ok { tasks.Done() return } case <-stop: close(clearNode) for _ = range status { } tasks.Done() return } } }() return nil }
func EmitRunOnceStates(datadogClient *datadog.Client, store *etcdstoreadapter.ETCDStoreAdapter, etcdMachines []string) { for { now := time.Now().Unix() all, err := store.ListRecursively("/v1/run_once") if err != nil { log.Println("failed to get all RunOnces:", err) time.Sleep(1 * time.Second) continue } metrics := []datadog.Metric{} for i, etcdMachine := range etcdMachines { stats := map[string]int{} resp, err := http.Get(urljoiner.Join(etcdMachine, "/v2/stats/store")) if err != nil { log.Println("failed to fetch stats:", err) continue } data, _ := ioutil.ReadAll(resp.Body) resp.Body.Close() json.Unmarshal(data, &stats) metrics = append(metrics, datadog.Metric{ Metric: fmt.Sprintf("etcd_watchers_%d", i), Points: []datadog.DataPoint{ datadog.DataPoint( [2]float64{ float64(now), float64(stats["watchers"]), }, ), }, }) } for _, state := range []string{"pending", "claimed", "running", "completed", "resolving"} { runOnces, found := all.Lookup(state) if !found { log.Println("failed to find RunOnces in", state, "state") time.Sleep(1 * time.Second) continue } metrics = append(metrics, datadog.Metric{ Metric: "diego_runonce_" + state, Points: []datadog.DataPoint{ datadog.DataPoint( [2]float64{ float64(now), float64(len(runOnces.ChildNodes)), }, ), }, }) } executors, err := store.ListRecursively("/v1/executor") if err != nil { log.Println("failed to get all Executors:", err) time.Sleep(1 * time.Second) continue } metrics = append(metrics, datadog.Metric{ Metric: "executors_maintaining_presence", Points: []datadog.DataPoint{ datadog.DataPoint( [2]float64{ float64(now), float64(len(executors.ChildNodes)), }, ), }, }) err = datadogClient.PostMetrics(metrics) if err != nil { log.Println("failed to post metrics:", err) } time.Sleep(1 * time.Second) } }