Example 1
func monitorETCD(etcdAdapter *etcdstoreadapter.ETCDStoreAdapter) {
	out, err := os.Create(filepath.Join(outDir, "etcdstats.log"))
	if err != nil {
		logger.Fatal("etcd.log.creation.failure", err)
	}

	cleanup.Register(func() {
		out.Sync()
	})

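	// Poll etcd once per second and append one JSON-encoded sample per line to etcdstats.log.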
	go func() {
		ticker := time.NewTicker(time.Second)
		for {
			<-ticker.C
			t := time.Now()
			logger.Info("fetch.etcd.runonce.data")
			runOnceNodes, err := etcdAdapter.ListRecursively(Bbs.TaskSchemaRoot)
			if err != nil {
				logger.Info("fetch.etcd.runOnceNodes.error", err)
			}

			executorNode, err := etcdAdapter.ListRecursively(Bbs.ExecutorSchemaRoot)
			if err != nil {
				logger.Info("fetch.etcd.executorNode.error", err)
			}
			readTime := time.Since(t)

			d := etcdData{
				Time:              float64(time.Now().UnixNano()) / 1e9,
				RunningByExecutor: map[string]int{},
				PresentExecutors:  len(executorNode.ChildNodes),
				ReadTime:          float64(readTime) / 1e9,
			}

			for _, node := range runOnceNodes.ChildNodes {
				runOnce, err := models.NewTaskFromJSON(node.Value)
				if err != nil {
					logger.Error("etcd.decode.runonce", err)
					continue
				}

				switch runOnce.State {
				case models.TaskStatePending:
					d.Pending++
				case models.TaskStateClaimed:
					d.Claimed++
				case models.TaskStateRunning:
					d.Running++
					d.RunningByExecutor[runOnce.ExecutorID]++
				case models.TaskStateCompleted:
					d.Completed++
				}
			}

			logger.Info("fetched.etcd.runonce.data", time.Since(t), d.String())
			out.Write(d.toJson())
			out.Write([]byte("\n"))
		}
	}()
}
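
The etcdData sample type and its toJson/String helpers are not shown above. Below is a minimal sketch consistent with how monitorETCD uses them; the JSON tags and the encoding/json dependency are assumptions, not the original definitions.

// Sketch (assumed, not from the original source): the etcdData record used by
// monitorETCD above. Field and method names mirror the call sites; the JSON
// tags and the encoding/json dependency are assumptions.
type etcdData struct {
	Time              float64        `json:"time"`
	ReadTime          float64        `json:"read_time"`
	Pending           int            `json:"pending"`
	Claimed           int            `json:"claimed"`
	Running           int            `json:"running"`
	Completed         int            `json:"completed"`
	PresentExecutors  int            `json:"present_executors"`
	RunningByExecutor map[string]int `json:"running_by_executor"`
}

// toJson serializes one sample; monitorETCD writes one sample per line.
func (d etcdData) toJson() []byte {
	b, err := json.Marshal(d)
	if err != nil {
		return []byte("{}")
	}
	return b
}

// String produces the human-readable form used in the log line.
func (d etcdData) String() string {
	return string(d.toJson())
}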
Example 2
func getAllTasks(store storeadapter.StoreAdapter, state models.TaskState) ([]*models.Task, error) {
	node, err := store.ListRecursively(TaskSchemaRoot)
	if err == storeadapter.ErrorKeyNotFound {
		return []*models.Task{}, nil
	}

	if err != nil {
		return []*models.Task{}, err
	}

	tasks := []*models.Task{}
	for _, node := range node.ChildNodes {
		task, err := models.NewTaskFromJSON(node.Value)
		if err != nil {
			steno.NewLogger("bbs").Errorf("cannot parse task JSON for key %s: %s", node.Key, err.Error())
		} else if task.State == state {
			tasks = append(tasks, &task)
		}
	}

	return tasks, nil
}
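
As a usage sketch, a hypothetical helper built on getAllTasks might count the tasks in a given state; countPendingTasks below is not part of the original code.

// countPendingTasks is a hypothetical usage sketch: it reuses getAllTasks to
// report how many tasks are currently pending in the store.
func countPendingTasks(store storeadapter.StoreAdapter) (int, error) {
	pending, err := getAllTasks(store, models.TaskStatePending)
	if err != nil {
		return 0, err
	}
	return len(pending), nil
}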
Example 3
// ConvergeTasks is run by *one* executor every X seconds (the exact value of X doesn't matter much; pick something performant).
// ConvergeTasks will:
// 1. Kick (by setting) any run-onces that are still pending
// 2. Kick (by setting) any run-onces that are completed
// 3. Demote to pending any claimed run-onces that have been claimed for > 30s
// 4. Demote to completed any resolving run-onces that have been resolving for > 30s
// 5. Mark as failed any run-onces that have been in the pending state for > timeToClaim
// 6. Mark as failed any claimed or running run-onces whose executor has stopped maintaining presence
func (self *executorBBS) ConvergeTasks(timeToClaim time.Duration) {
	taskState, err := self.store.ListRecursively(TaskSchemaRoot)
	if err != nil {
		return
	}

	executorState, err := self.store.ListRecursively(ExecutorSchemaRoot)
	if err == storeadapter.ErrorKeyNotFound {
		executorState = storeadapter.StoreNode{}
	} else if err != nil {
		return
	}

	logger := gosteno.NewLogger("bbs")
	logError := func(task models.Task, message string) {
		logger.Errord(map[string]interface{}{
			"runonce": task,
		}, message)
	}

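	// Entries whose JSON cannot be parsed are collected for deletion; all other
	// repairs are scheduled as old/new pairs for a compare-and-swap at the end.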
	keysToDelete := []string{}
	unclaimedTimeoutBoundary := self.timeProvider.Time().Add(-timeToClaim).UnixNano()

	tasksToCAS := [][]models.Task{}
	scheduleForCAS := func(oldTask, newTask models.Task) {
		tasksToCAS = append(tasksToCAS, []models.Task{
			oldTask,
			newTask,
		})
	}

	for _, node := range taskState.ChildNodes {
		task, err := models.NewTaskFromJSON(node.Value)
		if err != nil {
			logger.Errord(map[string]interface{}{
				"key":   node.Key,
				"value": string(node.Value),
			}, "runonce.converge.json-parse-failure")
			keysToDelete = append(keysToDelete, node.Key)
			continue
		}

		switch task.State {
		case models.TaskStatePending:
			if task.CreatedAt <= unclaimedTimeoutBoundary {
				logError(task, "runonce.converge.failed-to-claim")
				scheduleForCAS(task, markTaskFailed(task, "not claimed within time limit"))
			} else {
				go self.kicker.Desire(&task)
			}
		case models.TaskStateClaimed:
			claimedTooLong := self.timeProvider.Time().Sub(time.Unix(0, task.UpdatedAt)) >= 30*time.Second
			_, executorIsAlive := executorState.Lookup(task.ExecutorID)

			if !executorIsAlive {
				logError(task, "runonce.converge.executor-disappeared")
				scheduleForCAS(task, markTaskFailed(task, "executor disappeared before completion"))
			} else if claimedTooLong {
				logError(task, "runonce.converge.failed-to-start")
				scheduleForCAS(task, demoteToPending(task))
			}
		case models.TaskStateRunning:
			_, executorIsAlive := executorState.Lookup(task.ExecutorID)

			if !executorIsAlive {
				logError(task, "runonce.converge.executor-disappeared")
				scheduleForCAS(task, markTaskFailed(task, "executor disappeared before completion"))
			}
		case models.TaskStateCompleted:
			go self.kicker.Complete(&task)
		case models.TaskStateResolving:
			resolvingTooLong := self.timeProvider.Time().Sub(time.Unix(0, task.UpdatedAt)) >= 30*time.Second

			if resolvingTooLong {
				logError(task, "runonce.converge.failed-to-resolve")
				scheduleForCAS(task, demoteToCompleted(task))
			}
		}
	}

	self.batchCompareAndSwapTasks(tasksToCAS, logger)
	self.store.Delete(keysToDelete...)
}
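
ConvergeTasks relies on markTaskFailed, demoteToPending, and demoteToCompleted, which are not shown. The sketches below are inferred from the call sites above; the Failed and FailureReason field names are assumptions, not the original definitions.

// Sketches (assumed from how ConvergeTasks calls them, not the original code).

// markTaskFailed returns a copy of the task moved to the completed state with
// a failure reason attached.
func markTaskFailed(task models.Task, reason string) models.Task {
	task.State = models.TaskStateCompleted
	task.Failed = true
	task.FailureReason = reason
	return task
}

// demoteToPending returns a copy of the task moved back to pending so it can
// be claimed again.
func demoteToPending(task models.Task) models.Task {
	task.State = models.TaskStatePending
	return task
}

// demoteToCompleted returns a copy of the task moved back to completed so it
// can be resolved again.
func demoteToCompleted(task models.Task) models.Task {
	task.State = models.TaskStateCompleted
	return task
}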