func monitorETCD(etcdAdapter *etcdstoreadapter.ETCDStoreAdapter) { out, err := os.Create(filepath.Join(outDir, "etcdstats.log")) if err != nil { logger.Fatal("etcd.log.creation.failure", err) } cleanup.Register(func() { out.Sync() }) go func() { ticker := time.NewTicker(time.Second) for { <-ticker.C t := time.Now() logger.Info("fetch.etcd.runonce.data") runOnceNodes, err := etcdAdapter.ListRecursively(Bbs.TaskSchemaRoot) if err != nil { logger.Info("fetch.etcd.runOnceNodes.error", err) } executorNode, err := etcdAdapter.ListRecursively(Bbs.ExecutorSchemaRoot) if err != nil { logger.Info("fetch.etcd.executorNode.error", err) } readTime := time.Since(t) d := etcdData{ Time: float64(time.Now().UnixNano()) / 1e9, RunningByExecutor: map[string]int{}, PresentExecutors: len(executorNode.ChildNodes), ReadTime: float64(readTime) / 1e9, } for _, node := range runOnceNodes.ChildNodes { runOnce, err := models.NewTaskFromJSON(node.Value) if err != nil { logger.Error("etcd.decode.runonce", err) continue } switch runOnce.State { case models.TaskStatePending: d.Pending++ case models.TaskStateClaimed: d.Claimed++ case models.TaskStateRunning: d.Running++ d.RunningByExecutor[runOnce.ExecutorID]++ case models.TaskStateCompleted: d.Completed++ } } logger.Info("fetched.etcd.runonce.data", time.Since(t), d.String()) out.Write(d.toJson()) out.Write([]byte("\n")) } }() }
func writeInfo() { data := fmt.Sprintf(`{ "run_onces": %d, "run_once_memory": %d } `, nTasks, taskMemory) logger.Info("simulator.running", data) ioutil.WriteFile(filepath.Join(outDir, "info.json"), []byte(data), 0644) }
func runSimulation(natsClient yagnats.NATSClient) { simulationLock = &sync.Mutex{} simulationWait = &sync.WaitGroup{} taskTracker = map[string]*taskData{} msg := stagingMessage{ Count: nTasks, MemoryMB: taskMemory, } payload, err := json.Marshal(msg) if err != nil { panic(err) } t := time.Now() logger.Info("simulation.start", nTasks) simulationWait.Add(nTasks) _, err = natsClient.Subscribe("info.stager.*.staging-request.desire", func(msg *yagnats.Message) { var desiredLog struct { Timestamp time.Time `json:"_timestamp"` Task models.Task `json:"task"` } err := json.Unmarshal(msg.Payload, &desiredLog) if err != nil { panic(err) } registerDesired(desiredLog.Task.Guid, desiredLog.Timestamp) }) _, err = natsClient.Subscribe("error.>", func(msg *yagnats.Message) { var errorLog struct { Timestamp time.Time `json:"_timestamp"` Error string `json:"error"` } err := json.Unmarshal(msg.Payload, &errorLog) if err != nil { panic(err) } registerError(msg.Subject+": "+errorLog.Error, errorLog.Timestamp) }) _, err = natsClient.Subscribe("fatal.>", func(msg *yagnats.Message) { var errorLog struct { Timestamp time.Time `json:"_timestamp"` Error string `json:"error"` } err := json.Unmarshal(msg.Payload, &errorLog) if err != nil { panic(err) } registerError(msg.Subject+": "+errorLog.Error, errorLog.Timestamp) }) executorIndexes := map[string]int{} _, err = natsClient.Subscribe("completed-task", func(msg *yagnats.Message) { defer func() { e := recover() if e != nil { logger.Error("RECOVERED PANIC:", e) } }() var task *models.Task err := json.Unmarshal(msg.Payload, &task) if err != nil { panic(err) } simulationLock.Lock() index, ok := executorIndexes[task.ExecutorID] if !ok { index = len(executorIndexes) + 1 executorIndexes[task.ExecutorID] = index } data, ok := taskTracker[task.Guid] if !ok { logger.Error("uknown.runonce.completed", task.Guid, "executor", task.ExecutorID) simulationLock.Unlock() return } data.CompletionTime = float64(time.Now().UnixNano()) / 1e9 logger.Info("runonce.completed", task.Guid, "executor", task.ExecutorID, "duration", data.CompletionTime-data.DesiredTime) data.ExecutorIndex = index data.NumCompletions++ simulationLock.Unlock() simulationWait.Done() }) if err != nil { panic(err) } err = natsClient.PublishWithReplyTo("stage", "completed-task", payload) if err != nil { panic(err) } cleanup.Register(func() { dt := time.Since(t) logger.Info("simulation.end", nTasks, "runtime", dt) simulationResult(dt) simulationErrors() }) simulationWait.Wait() }