func main() {
	flag.Parse()

	//make the out dir
	logger.Component = "SIMULATOR"
	if outDir == "" {
		logger.Fatal("out.dir.unspecified")
	}
	err := os.MkdirAll(outDir, 0777)
	if err != nil {
		logger.Fatal("out.dir.creation.failed", err)
	}

	//set up logging
	outputFile, err := os.Create(filepath.Join(outDir, "simulator.log"))
	if err != nil {
		logger.Fatal("failed.to.create.simulator.log", err)
	}
	logger.Writer = io.MultiWriter(os.Stdout, outputFile)
	cleanup.Register(func() {
		outputFile.Sync()
	})

	//compile the executor
	logger.Info("compiling.executor")
	output, err := exec.Command("go", "install", "github.com/cloudfoundry-incubator/simulator/game_executor").CombinedOutput()
	if err != nil {
		logger.Fatal("failed.to.compile.executor", string(output))
	}

	//write info to the output dir
	writeInfo()

	//start etcd
	logger.Info("starting.etcd", etcdNodes)
	etcd = etcdstorerunner.NewETCDClusterRunner(4001, etcdNodes)
	etcd.Start()

	//set up the bbs
	pool := workerpool.NewWorkerPool(50)
	etcdAdapter = etcdstoreadapter.NewETCDStoreAdapter(etcd.NodeURLS(), pool)
	err = etcdAdapter.Connect() //don't silently ignore a failed etcd connection
	if err != nil {
		logger.Fatal("etcd.connect.fatal", err)
	}
	bbs = Bbs.New(etcdAdapter, timeprovider.NewTimeProvider())

	//monitor etcd
	monitorETCD()

	//start executors
	startExecutors()

	cleanup.Register(func() {
		logger.Info("stopping.etcd", etcdNodes)
		etcd.Stop()
	})

	//run the simulator
	runSimulation()

	cleanup.Exit(0)
}
func startAndMonitorExecutor(index int, output *os.File, ready *sync.WaitGroup) {
	cmd := commandForExecutor(index, output)

	logger.Info("starting.executor", index)
	cmd.Start()
	time.Sleep(100 * time.Millisecond) //give it a moment to come up...
	ready.Done()

	shuttingDown := false
	cleanup.Register(func() {
		shuttingDown = true
		if cmd.Process != nil {
			cmd.Process.Kill()
		}
	})

	restartCount := 0
	for {
		err := cmd.Wait()
		logger.Error("executor.exited", index, err)
		if shuttingDown {
			return
		}

		restartCount++
		logger.Info("restarting.executor", index, restartCount)
		cmd = commandForExecutor(index, output)
		cmd.Start()
	}
}
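// Note: commandForExecutor is defined elsewhere in this package. From its use
// above it presumably builds an exec.Cmd that runs the game_executor binary
// installed by main, passing the executor's index through its flags and wiring
// stdout/stderr to the shared executors.log file; that description is an
// assumption based on this call site, not on the implementation.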
func convergeRunOnces(bbs Bbs.ExecutorBBS) {
	statusChannel, releaseLock, err := bbs.MaintainConvergeLock(*convergenceInterval, *executorID)
	if err != nil {
		logger.Fatal("executor.converge-lock.acquire-failed", err)
	}

	tasks.Add(1)

	for {
		select {
		case locked, ok := <-statusChannel:
			if !ok {
				tasks.Done()
				return
			}

			if locked {
				t := time.Now()
				logger.Info("converging")
				bbs.ConvergeRunOnce(*timeToClaimRunOnce)
				logger.Info("converged", time.Since(t))
			} else {
				logger.Error("lost.convergence.lock")
			}
		case <-stop:
			//stop is a closed channel, so this case would otherwise fire on
			//every loop iteration: release the lock exactly once and return
			releaseLock <- nil
			tasks.Done()
			return
		}
	}
}
func desireAllRunOnces() {
	logger.Info("desiring.runonces", nRunOnces)
	dt := over / time.Duration(nRunOnces)
	allDesired := &sync.WaitGroup{}

	for index := 1; index <= nRunOnces; index++ {
		allDesired.Add(1)

		runOnce := models.RunOnce{
			Guid:     fmt.Sprintf("%d", index),
			MemoryMB: runOnceMemory,
		}

		innerIndex := index //capture a fresh copy of the loop variable for the goroutine
		go func() {
			defer allDesired.Done()
			logger.Info("desiring.runonce", innerIndex)
			registerDesired(innerIndex)
			err := bbs.DesireRunOnce(&runOnce)
			if err != nil {
				logger.Error("desire.runonce.failed", innerIndex, err)
			}
			logger.Info("desired.runonce", innerIndex)
		}()

		time.Sleep(dt)
	}

	allDesired.Wait()
	logger.Info("all.runonces.desired")
}
func handleCompletedRunOnce(runOnce *models.RunOnce) {
	simulationLock.Lock()

	data, ok := runOnceTracker[runOnce.Guid]
	if !ok {
		logger.Error("unknown.runonce.completed", runOnce.Guid, "executor", runOnce.ExecutorID)
		simulationLock.Unlock()
		return
	}

	data.CompletionTime = float64(time.Now().UnixNano()) / 1e9
	logger.Info("runonce.completed", runOnce.Guid, "executor", runOnce.ExecutorID, "duration", data.CompletionTime-data.DesiredTime)
	data.ExecutorIndex, _ = strconv.Atoi(runOnce.ExecutorID)
	data.NumCompletions++
	numCompletions := data.NumCompletions

	simulationLock.Unlock()

	//only resolve (and count toward the simulation) the first completion of a given RunOnce
	if numCompletions == 1 {
		defer simulationWait.Done()

		logger.Info("runonce.resolve", runOnce.Guid)
		err := bbs.ResolveRunOnce(runOnce)
		if err != nil {
			logger.Error("runonce.resolve.error", runOnce.Guid, err)
			return
		}
		logger.Info("runonce.resolved", runOnce.Guid)
	}
}
func handleRunOnces(bbs Bbs.ExecutorBBS) {
	tasks.Add(1)

	for {
		logger.Info("watch.desired")
		runOnces, stopWatching, errors := bbs.WatchForDesiredRunOnce()

	INNER:
		for {
			select {
			case runOnce, ok := <-runOnces:
				if !ok {
					logger.Info("watch.desired.closed")
					break INNER
				}

				tasks.Add(1)
				go func() {
					handleRunOnce(bbs, runOnce)
					tasks.Done()
				}()
			case err, ok := <-errors:
				if ok && err != nil {
					logger.Error("watch.desired.error", err)
				}
				break INNER
			case <-stop:
				stopWatching <- true
				tasks.Done()
				//return for good: without this, the outer loop would re-establish
				//the watch and the closed stop channel would fire again
				return
			}
		}
	}
}
func handleRunOnce(bbs Bbs.ExecutorBBS, runOnce *models.RunOnce) {
	//hesitate
	logger.Info("handling.runonce", runOnce.Guid)
	sleepForARandomInterval("sleep.claim", 0, 100)

	//reserve memory
	ok := reserveMemory(runOnce.MemoryMB)
	if !ok {
		logger.Info("reserve.memory.failed", runOnce.Guid)
		return
	}
	defer releaseMemory(runOnce.MemoryMB)

	//mark claimed
	logger.Info("claiming.runonce", runOnce.Guid)
	err := bbs.ClaimRunOnce(runOnce, *executorID)
	if err != nil {
		logger.Info("claim.runonce.failed", runOnce.Guid, err)
		return
	}
	logger.Info("claimed.runonce", runOnce.Guid)

	//create container
	sleepForContainerCreationInterval()

	//mark started
	logger.Info("starting.runonce", runOnce.Guid)
	err = bbs.StartRunOnce(runOnce, "container")
	if err != nil {
		logger.Error("start.runonce.failed", runOnce.Guid, err)
		return
	}
	logger.Info("started.runonce", runOnce.Guid)

	//run
	sleepForRunInterval()

	//mark completed
	logger.Info("completing.runonce", runOnce.Guid)
	err = bbs.CompleteRunOnce(runOnce, false, "", "")
	if err != nil {
		logger.Error("complete.runonce.failed", runOnce.Guid, err)
		return
	}
	logger.Info("completed.runonce", runOnce.Guid)
}
func monitorRunOnces(out io.Writer) {
	go func() {
		ticker := time.NewTicker(time.Second)
		for {
			<-ticker.C

			t := time.Now()
			logger.Info("fetch.etcd.runonce.data")
			runOnceNodes, err := etcdAdapter.ListRecursively(Bbs.RunOnceSchemaRoot)
			if err != nil {
				logger.Info("fetch.etcd.runOnceNodes.error", err)
			}

			executorNode, err := etcdAdapter.ListRecursively(Bbs.ExecutorSchemaRoot)
			if err != nil {
				logger.Info("fetch.etcd.executorNode.error", err)
			}
			readTime := time.Since(t)

			d := etcdData{
				Time:              float64(time.Now().UnixNano()) / 1e9,
				RunningByExecutor: map[string]int{},
				PresentExecutors:  len(executorNode.ChildNodes),
				ReadTime:          float64(readTime) / 1e9,
			}

			for _, node := range runOnceNodes.ChildNodes {
				runOnce, err := models.NewRunOnceFromJSON(node.Value)
				if err != nil {
					logger.Error("etcd.decode.runonce", err)
					continue
				}

				switch runOnce.State {
				case models.RunOnceStatePending:
					d.Pending++
				case models.RunOnceStateClaimed:
					d.Claimed++
				case models.RunOnceStateRunning:
					d.Running++
					d.RunningByExecutor[runOnce.ExecutorID]++
				case models.RunOnceStateCompleted:
					d.Completed++
				}
			}

			logger.Info("fetched.etcd.runonce.data", time.Since(t), d.String())

			//emit one JSON snapshot per line for later analysis
			out.Write(d.toJson())
			out.Write([]byte("\n"))
		}
	}()
}
func runSimulation() {
	simulationLock = &sync.Mutex{}
	simulationWait = &sync.WaitGroup{}
	runOnceTracker = map[string]*runOnceData{}

	t := time.Now()
	logger.Info("simulation.start", nRunOnces)

	simulationWait.Add(nRunOnces)

	go watchForCompletedRunOnces()
	desireAllRunOnces()
	simulationWait.Wait()

	dt := time.Since(t)
	logger.Info("simulation.end", nRunOnces, "runtime", dt)

	simulationResult(dt)
}
func startExecutors() {
	executorOutput, err := os.Create(filepath.Join(outDir, "executors.log"))
	if err != nil {
		logger.Fatal("executor.output.file.create.failed", err)
	}
	cleanup.Register(func() {
		executorOutput.Sync()
	})

	logger.Info("starting.all.executors", nExecutors)
	allExecutorsStarted := &sync.WaitGroup{}
	for index := 1; index <= nExecutors; index++ {
		allExecutorsStarted.Add(1)
		go startAndMonitorExecutor(index, executorOutput, allExecutorsStarted)
	}
	allExecutorsStarted.Wait()
	logger.Info("started.all.executors", nExecutors)
}
func main() {
	flag.Parse()

	cleanup.Register(func() {
		logger.Info("executor.shuttingdown")
		close(stop)
		tasks.Wait()
		logger.Info("executor.shutdown")
	})

	logger.Component = fmt.Sprintf("EXECUTOR %s", *executorID)

	lock = &sync.Mutex{}
	currentMemory = *maxMemory

	etcdAdapter := etcdstoreadapter.NewETCDStoreAdapter(
		strings.Split(*etcdCluster, ","),
		workerpool.NewWorkerPool(10),
	)

	err := etcdAdapter.Connect()
	if err != nil {
		logger.Fatal("etcd.connect.fatal", err)
	}

	tasks = &sync.WaitGroup{}
	stop = make(chan bool)

	bbs := Bbs.New(etcdAdapter, timeprovider.NewTimeProvider())

	ready := make(chan bool, 1)
	err = maintainPresence(bbs, ready)
	if err != nil {
		logger.Fatal("executor.initializing-presence.failed", err)
	}

	go handleRunOnces(bbs)
	go convergeRunOnces(bbs)

	<-ready
	logger.Info("executor.up")

	//block forever; shutdown happens via the registered cleanup handler
	select {}
}
func watchForCompletedRunOnces() {
	for {
		logger.Info("watch.completed")
		runOnces, _, errs := bbs.WatchForCompletedRunOnce()

	waitForRunOnce:
		for {
			select {
			case runOnce, ok := <-runOnces:
				if !ok {
					logger.Info("watch.completed.closed")
					break waitForRunOnce
				}

				go handleCompletedRunOnce(runOnce)
			case err, ok := <-errs:
				if ok && err != nil {
					logger.Info("watch.completed.error", err)
				}
				break waitForRunOnce
			}
		}
	}
}
func writeInfo() {
	//quote the keys so that info.json actually holds valid JSON
	data := fmt.Sprintf(`{
  "etcd_nodes": %d,
  "executors": %d,
  "run_onces": %d,
  "executor_available_memory": %d,
  "run_once_memory": %d,
  "over": %.4f
}
`, etcdNodes, nExecutors, nRunOnces, executorMemory, runOnceMemory, float64(over)/float64(time.Second))

	logger.Info("simulator.running", data)
	ioutil.WriteFile(filepath.Join(outDir, "info.json"), []byte(data), 0777)
}
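// For illustration only: a run with one etcd node, 10 executors, and 100
// RunOnces spread over 10 seconds would produce an info.json like the
// following (these numbers are made up, not measured):
//
//	{
//	  "etcd_nodes": 1,
//	  "executors": 10,
//	  "run_onces": 100,
//	  "executor_available_memory": 1000,
//	  "run_once_memory": 100,
//	  "over": 10.0000
//	}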
func sleepForARandomInterval(reason string, minSleepTime, maxSleepTime int) {
	//pick a duration in [minSleepTime, maxSleepTime) milliseconds; note that
	//Intn panics when its argument is 0, so callers must pass maxSleepTime > minSleepTime
	interval := RAND.Intn(maxSleepTime-minSleepTime) + minSleepTime
	logger.Info(reason, fmt.Sprintf("%dms", interval))
	time.Sleep(time.Duration(interval) * time.Millisecond)
}