func monitorETCD(etcdAdapter *etcdstoreadapter.ETCDStoreAdapter) {
	out, err := os.Create(filepath.Join(outDir, "etcdstats.log"))
	if err != nil {
		logger.Fatal("etcd.log.creation.failure", err)
	}
	cleanup.Register(func() {
		out.Sync()
	})

	go func() {
		ticker := time.NewTicker(time.Second)
		for {
			<-ticker.C
			t := time.Now()

			logger.Info("fetch.etcd.runonce.data")
			runOnceNodes, err := etcdAdapter.ListRecursively(Bbs.TaskSchemaRoot)
			if err != nil {
				logger.Info("fetch.etcd.runOnceNodes.error", err)
			}

			executorNode, err := etcdAdapter.ListRecursively(Bbs.ExecutorSchemaRoot)
			if err != nil {
				logger.Info("fetch.etcd.executorNode.error", err)
			}
			readTime := time.Since(t)

			d := etcdData{
				Time:              float64(time.Now().UnixNano()) / 1e9,
				RunningByExecutor: map[string]int{},
				PresentExecutors:  len(executorNode.ChildNodes),
				ReadTime:          float64(readTime) / 1e9,
			}

			for _, node := range runOnceNodes.ChildNodes {
				runOnce, err := models.NewTaskFromJSON(node.Value)
				if err != nil {
					logger.Error("etcd.decode.runonce", err)
					continue
				}

				switch runOnce.State {
				case models.TaskStatePending:
					d.Pending++
				case models.TaskStateClaimed:
					d.Claimed++
				case models.TaskStateRunning:
					d.Running++
					d.RunningByExecutor[runOnce.ExecutorID]++
				case models.TaskStateCompleted:
					d.Completed++
				}
			}

			logger.Info("fetched.etcd.runonce.data", time.Since(t), d.String())

			out.Write(d.toJson())
			out.Write([]byte("\n"))
		}
	}()
}
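monitorETCD serializes one etcdData record per tick. The etcdData type is not part of this excerpt; the sketch below is an assumption reconstructed from the fields the loop fills in, with hypothetical JSON tags and a toJson helper that simply wraps encoding/json.

// Assumed shape of etcdData, reconstructed from the fields monitorETCD populates.
// The real definition is not shown in the excerpt above.
type etcdData struct {
	Time              float64        `json:"time"`
	Pending           int            `json:"pending"`
	Claimed           int            `json:"claimed"`
	Running           int            `json:"running"`
	Completed         int            `json:"completed"`
	PresentExecutors  int            `json:"present_executors"`
	RunningByExecutor map[string]int `json:"running_by_executor"`
	ReadTime          float64        `json:"read_time"`
}

// toJson and String are hypothetical helpers matching how monitorETCD uses them.
func (d etcdData) toJson() []byte {
	b, _ := json.Marshal(d) // error ignored for brevity in this sketch
	return b
}

func (d etcdData) String() string {
	return string(d.toJson())
}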
func getAllTasks(store storeadapter.StoreAdapter, state models.TaskState) ([]*models.Task, error) {
	node, err := store.ListRecursively(TaskSchemaRoot)
	if err == storeadapter.ErrorKeyNotFound {
		return []*models.Task{}, nil
	}

	if err != nil {
		return []*models.Task{}, err
	}

	tasks := []*models.Task{}
	for _, node := range node.ChildNodes {
		task, err := models.NewTaskFromJSON(node.Value)
		if err != nil {
			steno.NewLogger("bbs").Errorf("cannot parse task JSON for key %s: %s", node.Key, err.Error())
		} else if task.State == state {
			tasks = append(tasks, &task)
		}
	}

	return tasks, nil
}
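getAllTasks reads the whole task subtree and filters it by state, treating a missing key as an empty result. Callers would typically wrap it once per state; the wrappers below are an illustrative sketch, not necessarily the package's actual exported API.

// Illustrative per-state wrappers (assumed, not shown in the excerpt).
func (self *executorBBS) GetAllPendingTasks() ([]*models.Task, error) {
	return getAllTasks(self.store, models.TaskStatePending)
}

func (self *executorBBS) GetAllCompletedTasks() ([]*models.Task, error) {
	return getAllTasks(self.store, models.TaskStateCompleted)
}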
// ConvergeTasks is run by *one* executor every X seconds (it doesn't really matter what X is; pick something performant)
// Converge will:
// 1. Kick (by setting) any run-onces that are still pending
// 2. Kick (by setting) any run-onces that are completed
// 3. Demote to pending any claimed run-onces that have been claimed for > 30s
// 4. Demote to completed any resolving run-onces that have been resolving for > 30s
// 5. Mark as failed any run-onces that have been in the pending state for > timeToClaim
// 6. Mark as failed any claimed or running run-onces whose executor has stopped maintaining presence
func (self *executorBBS) ConvergeTasks(timeToClaim time.Duration) {
	taskState, err := self.store.ListRecursively(TaskSchemaRoot)
	if err != nil {
		return
	}

	executorState, err := self.store.ListRecursively(ExecutorSchemaRoot)
	if err == storeadapter.ErrorKeyNotFound {
		executorState = storeadapter.StoreNode{}
	} else if err != nil {
		return
	}

	logger := gosteno.NewLogger("bbs")
	logError := func(task models.Task, message string) {
		logger.Errord(map[string]interface{}{
			"runonce": task,
		}, message)
	}

	keysToDelete := []string{}

	unclaimedTimeoutBoundary := self.timeProvider.Time().Add(-timeToClaim).UnixNano()

	tasksToCAS := [][]models.Task{}
	scheduleForCAS := func(oldTask, newTask models.Task) {
		tasksToCAS = append(tasksToCAS, []models.Task{
			oldTask, newTask,
		})
	}

	for _, node := range taskState.ChildNodes {
		task, err := models.NewTaskFromJSON(node.Value)
		if err != nil {
			logger.Errord(map[string]interface{}{
				"key":   node.Key,
				"value": string(node.Value),
			}, "runonce.converge.json-parse-failure")
			keysToDelete = append(keysToDelete, node.Key)
			continue
		}

		switch task.State {
		case models.TaskStatePending:
			if task.CreatedAt <= unclaimedTimeoutBoundary {
				logError(task, "runonce.converge.failed-to-claim")
				scheduleForCAS(task, markTaskFailed(task, "not claimed within time limit"))
			} else {
				go self.kicker.Desire(&task)
			}
		case models.TaskStateClaimed:
			claimedTooLong := self.timeProvider.Time().Sub(time.Unix(0, task.UpdatedAt)) >= 30*time.Second
			_, executorIsAlive := executorState.Lookup(task.ExecutorID)

			if !executorIsAlive {
				logError(task, "runonce.converge.executor-disappeared")
				scheduleForCAS(task, markTaskFailed(task, "executor disappeared before completion"))
			} else if claimedTooLong {
				logError(task, "runonce.converge.failed-to-start")
				scheduleForCAS(task, demoteToPending(task))
			}
		case models.TaskStateRunning:
			_, executorIsAlive := executorState.Lookup(task.ExecutorID)

			if !executorIsAlive {
				logError(task, "runonce.converge.executor-disappeared")
				scheduleForCAS(task, markTaskFailed(task, "executor disappeared before completion"))
			}
		case models.TaskStateCompleted:
			go self.kicker.Complete(&task)
		case models.TaskStateResolving:
			resolvingTooLong := self.timeProvider.Time().Sub(time.Unix(0, task.UpdatedAt)) >= 30*time.Second

			if resolvingTooLong {
				logError(task, "runonce.converge.failed-to-resolve")
				scheduleForCAS(task, demoteToCompleted(task))
			}
		}
	}

	self.batchCompareAndSwapTasks(tasksToCAS, logger)
	self.store.Delete(keysToDelete...)
}
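ConvergeTasks never mutates a task in place; it schedules compare-and-swaps from the observed value to a new value produced by three small helpers that are not shown above. A minimal sketch of what those helpers would look like, assuming the Task model carries State, Failed, and FailureReason fields, follows; the real implementations may differ.

// Sketches of the helpers referenced above (assumed; not part of this excerpt).
func markTaskFailed(task models.Task, reason string) models.Task {
	task.State = models.TaskStateCompleted
	task.Failed = true
	task.FailureReason = reason
	return task
}

func demoteToPending(task models.Task) models.Task {
	task.State = models.TaskStatePending
	return task
}

func demoteToCompleted(task models.Task) models.Task {
	task.State = models.TaskStateCompleted
	return task
}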