func main() { flag.Parse() //make the out dir logger.Component = "SIMULATOR" if outDir == "" { logger.Fatal("out.dir.unspecified") } err := os.MkdirAll(outDir, 0777) if err != nil { logger.Fatal("out.dir.creation.failed", err) } //set up logging outputFile, err := os.Create(filepath.Join(outDir, "simulator.log")) if err != nil { logger.Fatal("failed.to.create.simulator.log", err) } logger.Writer = io.MultiWriter(os.Stdout, outputFile) cleanup.Register(func() { outputFile.Sync() }) //compile the executor logger.Info("compiling.executor") output, err := exec.Command("go", "install", "github.com/cloudfoundry-incubator/simulator/game_executor").CombinedOutput() if err != nil { logger.Fatal("failed.to.compile.executor", string(output)) } //write info to the output dir writeInfo() //start etcd logger.Info("starting.etcd", etcdNodes) etcd = etcdstorerunner.NewETCDClusterRunner(4001, etcdNodes) etcd.Start() //set up the bbs pool := workerpool.NewWorkerPool(50) etcdAdapter = etcdstoreadapter.NewETCDStoreAdapter(etcd.NodeURLS(), pool) etcdAdapter.Connect() bbs = Bbs.New(etcdAdapter, timeprovider.NewTimeProvider()) //monitor etcd monitorETCD() //start executors startExecutors() cleanup.Register(func() { logger.Info("stopping.etcd", etcdNodes) etcd.Stop() }) //run the simulator runSimulation() cleanup.Exit(0) }
func monitorETCD(etcdAdapter *etcdstoreadapter.ETCDStoreAdapter) { out, err := os.Create(filepath.Join(outDir, "etcdstats.log")) if err != nil { logger.Fatal("etcd.log.creation.failure", err) } cleanup.Register(func() { out.Sync() }) go func() { ticker := time.NewTicker(time.Second) for { <-ticker.C t := time.Now() logger.Info("fetch.etcd.runonce.data") runOnceNodes, err := etcdAdapter.ListRecursively(Bbs.TaskSchemaRoot) if err != nil { logger.Info("fetch.etcd.runOnceNodes.error", err) } executorNode, err := etcdAdapter.ListRecursively(Bbs.ExecutorSchemaRoot) if err != nil { logger.Info("fetch.etcd.executorNode.error", err) } readTime := time.Since(t) d := etcdData{ Time: float64(time.Now().UnixNano()) / 1e9, RunningByExecutor: map[string]int{}, PresentExecutors: len(executorNode.ChildNodes), ReadTime: float64(readTime) / 1e9, } for _, node := range runOnceNodes.ChildNodes { runOnce, err := models.NewTaskFromJSON(node.Value) if err != nil { logger.Error("etcd.decode.runonce", err) continue } switch runOnce.State { case models.TaskStatePending: d.Pending++ case models.TaskStateClaimed: d.Claimed++ case models.TaskStateRunning: d.Running++ d.RunningByExecutor[runOnce.ExecutorID]++ case models.TaskStateCompleted: d.Completed++ } } logger.Info("fetched.etcd.runonce.data", time.Since(t), d.String()) out.Write(d.toJson()) out.Write([]byte("\n")) } }() }
func startAndMonitorExecutor(index int, output *os.File, ready *sync.WaitGroup) { cmd := commandForExecutor(index, output) logger.Info("starting.executor", index) cmd.Start() time.Sleep(100 * time.Millisecond) //give it a second... ready.Done() shuttingDown := false cleanup.Register(func() { shuttingDown = true if cmd.Process != nil { cmd.Process.Kill() } }) restartCount := 0 for { err := cmd.Wait() logger.Error("executor.exited", index, err) if shuttingDown { return } restartCount++ logger.Info("restarting.executor", index, restartCount) cmd = commandForExecutor(index, output) cmd.Start() } }
func monitorETCD() { outputFile, err := os.Create(filepath.Join(outDir, "etcdstats.log")) if err != nil { logger.Fatal("etcd.log.creation.failure", err) } cleanup.Register(func() { outputFile.Sync() }) go monitorRunOnces(outputFile) }
func startExecutors() { executorOutput, err := os.Create(filepath.Join(outDir, "executors.log")) if err != nil { logger.Fatal("executor.output.file.create.failed", err) } cleanup.Register(func() { executorOutput.Sync() }) logger.Info("starting.all.executors", nExecutors) allExecutorsStarted := &sync.WaitGroup{} for index := 1; index <= nExecutors; index++ { allExecutorsStarted.Add(1) go startAndMonitorExecutor(index, executorOutput, allExecutorsStarted) } allExecutorsStarted.Wait() logger.Info("started.all.executors", nExecutors) }
func main() { flag.Parse() cleanup.Register(func() { logger.Info("executor.shuttingdown") close(stop) tasks.Wait() logger.Info("executor.shutdown") }) logger.Component = fmt.Sprintf("EXECUTOR %s", *executorID) lock = &sync.Mutex{} currentMemory = *maxMemory etcdAdapter := etcdstoreadapter.NewETCDStoreAdapter( strings.Split(*etcdCluster, ","), workerpool.NewWorkerPool(10), ) err := etcdAdapter.Connect() if err != nil { logger.Fatal("etcd.connect.fatal", err) } tasks = &sync.WaitGroup{} stop = make(chan bool) bbs := Bbs.New(etcdAdapter, timeprovider.NewTimeProvider()) ready := make(chan bool, 1) err = maintainPresence(bbs, ready) if err != nil { logger.Fatal("executor.initializing-presence.failed", err) } go handleRunOnces(bbs) go convergeRunOnces(bbs) <-ready logger.Info("executor.up") select {} }
func main() { flag.Parse() runtime.GOMAXPROCS(runtime.NumCPU()) //make the out dir logger.Component = "SIMULATOR" if outDir == "" { logger.Fatal("out.dir.unspecified") } err := os.MkdirAll(outDir, 0777) if err != nil { logger.Fatal("out.dir.creation.failed", err) } //set up logging outputFile, err := os.Create(filepath.Join(outDir, "simulator.log")) if err != nil { logger.Fatal("failed.to.create.simulator.log", err) } logger.Writer = io.MultiWriter(os.Stdout, outputFile) cleanup.Register(func() { outputFile.Sync() }) //start etcd natsClient := yagnats.NewClient() natsMembers := []yagnats.ConnectionProvider{} for _, addr := range strings.Split(*natsAddresses, ",") { natsMembers = append( natsMembers, &yagnats.ConnectionInfo{addr, *natsUsername, *natsPassword}, ) } natsInfo := &yagnats.ConnectionCluster{Members: natsMembers} err = natsClient.Connect(natsInfo) if err != nil { logger.Fatal("could not connect to nats:", err) } logger.Component = "simulator" etcdAdapter := etcdstoreadapter.NewETCDStoreAdapter( strings.Split(*etcdCluster, ","), workerpool.NewWorkerPool(10), ) err = etcdAdapter.Connect() if err != nil { logger.Fatal("etcd.connect-failed", map[string]interface{}{ "error": err.Error(), }) } //write info to the output dir writeInfo() //monitor etcd monitorETCD(etcdAdapter) //run the simulator runSimulation(natsClient) cleanup.Exit(0) }
func main() { var err error runtime.GOMAXPROCS(runtime.NumCPU()) rand.Seed(time.Now().UnixNano()) flag.Parse() executorUUID, err := uuid.NewV4() if err != nil { log.Fatalln("could not generate guid:", err) } executorID = executorUUID.String() cleanup.Register(func() { once.Do(func() { logger.Info("shutting-down", map[string]interface{}{}) close(stop) tasks.Wait() logger.Info("shutdown", map[string]interface{}{}) }) }) natsClient := yagnats.NewClient() natsMembers := []yagnats.ConnectionProvider{} for _, addr := range strings.Split(*natsAddresses, ",") { natsMembers = append( natsMembers, &yagnats.ConnectionInfo{addr, *natsUsername, *natsPassword}, ) } natsInfo := &yagnats.ConnectionCluster{Members: natsMembers} err = logger.Connect(natsInfo) if err != nil { log.Fatalln("could not connect logger:", err) } err = natsClient.Connect(natsInfo) if err != nil { log.Fatalln("could not connect to nats:", err) } logger.Component = fmt.Sprintf("executor.%s", executorID) etcdAdapter := etcdstoreadapter.NewETCDStoreAdapter( strings.Split(*etcdCluster, ","), workerpool.NewWorkerPool(10), ) err = etcdAdapter.Connect() if err != nil { logger.Fatal("etcd.connect-failed", map[string]interface{}{ "error": err.Error(), }) } bbs := bbs.New(bbs.NewHurlerKicker(*hurlerAddress), etcdAdapter, timeprovider.NewTimeProvider()) ready := make(chan bool, 1) err = maintainPresence(bbs, ready) if err != nil { logger.Fatal("initializing-presence", map[string]interface{}{ "error": err.Error(), }) } err = registerHandler(etcdAdapter, *listenAddr, ready) if err != nil { logger.Fatal("initializing-route", map[string]interface{}{ "error": err.Error(), }) } go handleTasks(bbs, *listenAddr) go convergeTasks(bbs) <-ready <-ready logger.Info("up", map[string]interface{}{ "executor": executorID, }) select {} }
func runSimulation(natsClient yagnats.NATSClient) { simulationLock = &sync.Mutex{} simulationWait = &sync.WaitGroup{} taskTracker = map[string]*taskData{} msg := stagingMessage{ Count: nTasks, MemoryMB: taskMemory, } payload, err := json.Marshal(msg) if err != nil { panic(err) } t := time.Now() logger.Info("simulation.start", nTasks) simulationWait.Add(nTasks) _, err = natsClient.Subscribe("info.stager.*.staging-request.desire", func(msg *yagnats.Message) { var desiredLog struct { Timestamp time.Time `json:"_timestamp"` Task models.Task `json:"task"` } err := json.Unmarshal(msg.Payload, &desiredLog) if err != nil { panic(err) } registerDesired(desiredLog.Task.Guid, desiredLog.Timestamp) }) _, err = natsClient.Subscribe("error.>", func(msg *yagnats.Message) { var errorLog struct { Timestamp time.Time `json:"_timestamp"` Error string `json:"error"` } err := json.Unmarshal(msg.Payload, &errorLog) if err != nil { panic(err) } registerError(msg.Subject+": "+errorLog.Error, errorLog.Timestamp) }) _, err = natsClient.Subscribe("fatal.>", func(msg *yagnats.Message) { var errorLog struct { Timestamp time.Time `json:"_timestamp"` Error string `json:"error"` } err := json.Unmarshal(msg.Payload, &errorLog) if err != nil { panic(err) } registerError(msg.Subject+": "+errorLog.Error, errorLog.Timestamp) }) executorIndexes := map[string]int{} _, err = natsClient.Subscribe("completed-task", func(msg *yagnats.Message) { defer func() { e := recover() if e != nil { logger.Error("RECOVERED PANIC:", e) } }() var task *models.Task err := json.Unmarshal(msg.Payload, &task) if err != nil { panic(err) } simulationLock.Lock() index, ok := executorIndexes[task.ExecutorID] if !ok { index = len(executorIndexes) + 1 executorIndexes[task.ExecutorID] = index } data, ok := taskTracker[task.Guid] if !ok { logger.Error("uknown.runonce.completed", task.Guid, "executor", task.ExecutorID) simulationLock.Unlock() return } data.CompletionTime = float64(time.Now().UnixNano()) / 1e9 logger.Info("runonce.completed", task.Guid, "executor", task.ExecutorID, "duration", data.CompletionTime-data.DesiredTime) data.ExecutorIndex = index data.NumCompletions++ simulationLock.Unlock() simulationWait.Done() }) if err != nil { panic(err) } err = natsClient.PublishWithReplyTo("stage", "completed-task", payload) if err != nil { panic(err) } cleanup.Register(func() { dt := time.Since(t) logger.Info("simulation.end", nTasks, "runtime", dt) simulationResult(dt) simulationErrors() }) simulationWait.Wait() }