func handleCompletedRunOnce(runOnce *models.RunOnce) {
	simulationLock.Lock()

	data, ok := runOnceTracker[runOnce.Guid]
	if !ok {
		logger.Error("unknown.runonce.completed", runOnce.Guid, "executor", runOnce.ExecutorID)
		simulationLock.Unlock()
		return
	}

	data.CompletionTime = float64(time.Now().UnixNano()) / 1e9
	logger.Info("runonce.completed", runOnce.Guid, "executor", runOnce.ExecutorID, "duration", data.CompletionTime-data.DesiredTime)

	data.ExecutorIndex, _ = strconv.Atoi(runOnce.ExecutorID)
	data.NumCompletions++
	numCompletions := data.NumCompletions

	simulationLock.Unlock()

	// Only the first completion resolves the RunOnce and releases the wait group.
	if numCompletions == 1 {
		defer simulationWait.Done()

		logger.Info("runonce.resolve", runOnce.Guid)
		err := bbs.ResolveRunOnce(runOnce)
		if err != nil {
			logger.Error("runonce.resolve.error", runOnce.Guid, err)
			return
		}
		logger.Info("runonce.resolved", runOnce.Guid)
	}
}
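// The tracker-entry type behind runOnceTracker isn't shown in this section.
// The following is a minimal sketch inferred from the fields that
// handleCompletedRunOnce touches; the name runOnceData is hypothetical and the
// real definition may carry more fields. Note that the map must hold pointers
// (e.g. map[string]*runOnceData) for the mutations above to stick.
type runOnceData struct {
	DesiredTime    float64
	CompletionTime float64
	ExecutorIndex  int
	NumCompletions int
}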
func handleRunOnce(bbs Bbs.ExecutorBBS, runOnce *models.RunOnce) {
	// hesitate
	logger.Info("handling.runonce", runOnce.Guid)
	sleepForARandomInterval("sleep.claim", 0, 100)

	// reserve memory
	ok := reserveMemory(runOnce.MemoryMB)
	if !ok {
		logger.Info("reserve.memory.failed", runOnce.Guid)
		return
	}
	defer releaseMemory(runOnce.MemoryMB)

	// mark claimed
	logger.Info("claiming.runonce", runOnce.Guid)
	err := bbs.ClaimRunOnce(runOnce, *executorID)
	if err != nil {
		logger.Info("claim.runonce.failed", runOnce.Guid, err)
		return
	}
	logger.Info("claimed.runonce", runOnce.Guid)

	// create container
	sleepForContainerCreationInterval()

	// mark started
	logger.Info("starting.runonce", runOnce.Guid)
	err = bbs.StartRunOnce(runOnce, "container")
	if err != nil {
		logger.Error("start.runonce.failed", runOnce.Guid, err)
		return
	}
	logger.Info("started.runonce", runOnce.Guid)

	// run
	sleepForRunInterval()

	// mark completed
	logger.Info("completing.runonce", runOnce.Guid)
	err = bbs.CompleteRunOnce(runOnce, false, "", "")
	if err != nil {
		logger.Error("complete.runonce.failed", runOnce.Guid, err)
		return
	}
	logger.Info("completed.runonce", runOnce.Guid)
}
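// sleepForARandomInterval isn't defined in this section. A plausible sketch,
// assuming the two integers are a millisecond range and the string is a log
// message (requires "math/rand"); the real helper may differ:
func sleepForARandomInterval(reason string, minMS, maxMS int) {
	interval := time.Duration(minMS+rand.Intn(maxMS-minMS)) * time.Millisecond
	logger.Info(reason, interval.String())
	time.Sleep(interval)
}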
func startAndMonitorExecutor(index int, output *os.File, ready *sync.WaitGroup) {
	cmd := commandForExecutor(index, output)

	logger.Info("starting.executor", index)
	cmd.Start()
	time.Sleep(100 * time.Millisecond) // give it a second...
	ready.Done()

	shuttingDown := false
	cleanup.Register(func() {
		shuttingDown = true
		if cmd.Process != nil {
			cmd.Process.Kill()
		}
	})

	restartCount := 0
	for {
		err := cmd.Wait()
		logger.Error("executor.exited", index, err)
		if shuttingDown {
			return
		}

		restartCount++
		logger.Info("restarting.executor", index, restartCount)
		cmd = commandForExecutor(index, output)
		cmd.Start()
	}
}
func desireAllRunOnces() {
	logger.Info("desiring.runonces", nRunOnces)

	dt := over / time.Duration(nRunOnces)
	allDesired := &sync.WaitGroup{}

	for index := 1; index <= nRunOnces; index++ {
		allDesired.Add(1)

		runOnce := models.RunOnce{
			Guid:     fmt.Sprintf("%d", index),
			MemoryMB: runOnceMemory,
		}

		innerIndex := index
		go func() {
			defer allDesired.Done()

			logger.Info("desiring.runonce", innerIndex)
			registerDesired(innerIndex)
			err := bbs.DesireRunOnce(&runOnce)
			if err != nil {
				logger.Error("desire.runonce.failed", innerIndex, err)
			}
			logger.Info("desired.runonce", innerIndex)
		}()

		time.Sleep(dt)
	}

	allDesired.Wait()
	logger.Info("all.runonces.desired")
}
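// registerDesired isn't shown here. A rough sketch of what it presumably does,
// seeding the tracker entry that handleCompletedRunOnce later reads (runOnceData
// is the hypothetical type sketched above); the real helper may also do other
// bookkeeping, such as incrementing simulationWait:
func registerDesired(index int) {
	simulationLock.Lock()
	defer simulationLock.Unlock()

	runOnceTracker[fmt.Sprintf("%d", index)] = &runOnceData{
		DesiredTime: float64(time.Now().UnixNano()) / 1e9,
	}
}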
func convergeRunOnces(bbs Bbs.ExecutorBBS) {
	statusChannel, releaseLock, err := bbs.MaintainConvergeLock(*convergenceInterval, *executorID)
	if err != nil {
		logger.Fatal("executor.converge-lock.acquire-failed", err)
	}

	tasks.Add(1)

	for {
		select {
		case locked, ok := <-statusChannel:
			if !ok {
				tasks.Done()
				return
			}

			if locked {
				t := time.Now()
				logger.Info("converging")
				bbs.ConvergeRunOnce(*timeToClaimRunOnce)
				logger.Info("converged", time.Since(t))
			} else {
				logger.Error("lost.convergence.lock")
			}
		case <-stop:
			releaseLock <- nil
		}
	}
}
func handleRunOnces(bbs Bbs.ExecutorBBS) {
	tasks.Add(1)

	for {
		logger.Info("watch.desired")
		runOnces, stopWatching, errors := bbs.WatchForDesiredRunOnce()

	INNER:
		for {
			select {
			case runOnce, ok := <-runOnces:
				if !ok {
					logger.Info("watch.desired.closed")
					break INNER
				}

				tasks.Add(1)
				go func() {
					handleRunOnce(bbs, runOnce)
					tasks.Done()
				}()
			case err, ok := <-errors:
				if ok && err != nil {
					logger.Error("watch.desired.error", err)
				}
				break INNER
			case <-stop:
				// shut down: stop the watch, balance the tasks.Add above, and exit
				// so we don't call tasks.Done more than once
				stopWatching <- true
				tasks.Done()
				return
			}
		}
	}
}
func (d *etcdData) toJson() []byte {
	data, err := json.Marshal(d)
	if err != nil {
		logger.Error("etcd.marshal.etcdData.failed", err)
	}
	return data
}
func releaseMemory(memory int) {
	lock.Lock()
	defer lock.Unlock()

	currentMemory = currentMemory + memory
	if currentMemory > *maxMemory {
		logger.Error("bookkeeping.fail", "current memory exceeds original max memory... how?")
		currentMemory = *maxMemory
	}
}
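// reserveMemory (used by handleRunOnce) isn't shown in this section. A minimal
// sketch that mirrors the bookkeeping in releaseMemory, treating currentMemory
// as the pool of free memory in MB; the real implementation may differ:
func reserveMemory(memory int) bool {
	lock.Lock()
	defer lock.Unlock()

	if currentMemory < memory {
		return false
	}
	currentMemory -= memory
	return true
}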
func monitorRunOnces(out io.Writer) {
	go func() {
		ticker := time.NewTicker(time.Second)
		for {
			<-ticker.C

			t := time.Now()
			logger.Info("fetch.etcd.runonce.data")

			runOnceNodes, err := etcdAdapter.ListRecursively(Bbs.RunOnceSchemaRoot)
			if err != nil {
				logger.Info("fetch.etcd.runOnceNodes.error", err)
			}

			executorNode, err := etcdAdapter.ListRecursively(Bbs.ExecutorSchemaRoot)
			if err != nil {
				logger.Info("fetch.etcd.executorNode.error", err)
			}

			readTime := time.Since(t)

			d := etcdData{
				Time:              float64(time.Now().UnixNano()) / 1e9,
				RunningByExecutor: map[string]int{},
				PresentExecutors:  len(executorNode.ChildNodes),
				ReadTime:          float64(readTime) / 1e9,
			}

			for _, node := range runOnceNodes.ChildNodes {
				runOnce, err := models.NewRunOnceFromJSON(node.Value)
				if err != nil {
					logger.Error("etcd.decode.runonce", err)
					continue
				}

				switch runOnce.State {
				case models.RunOnceStatePending:
					d.Pending++
				case models.RunOnceStateClaimed:
					d.Claimed++
				case models.RunOnceStateRunning:
					d.Running++
					d.RunningByExecutor[runOnce.ExecutorID]++
				case models.RunOnceStateCompleted:
					d.Completed++
				}
			}

			logger.Info("fetched.etcd.runonce.data", time.Since(t), d.String())

			out.Write(d.toJson())
			out.Write([]byte("\n"))
		}
	}()
}
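// The etcdData struct is defined elsewhere; this sketch lists only the fields
// used by monitorRunOnces and toJson above, with assumed JSON tags. The real
// type also has a String() method, which isn't reproduced here.
type etcdData struct {
	Time              float64        `json:"time"`
	ReadTime          float64        `json:"read_time"`
	Pending           int            `json:"pending"`
	Claimed           int            `json:"claimed"`
	Running           int            `json:"running"`
	Completed         int            `json:"completed"`
	PresentExecutors  int            `json:"present_executors"`
	RunningByExecutor map[string]int `json:"running_by_executor"`
}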