Example #1
0
func (s *Supervisor) FailUnfinishedTasks() error {
	tasks, err := s.Database.GetAllTasks(
		&db.TaskFilter{
			ForStatus: db.RunningStatus,
		},
	)
	if err != nil {
		return fmt.Errorf("Failed to sweep database of running tasks: %s", err)
	}

	now := time.Now()
	for _, task := range tasks {
		log.Warnf("Found task %s in 'running' state at startup; setting to 'failed'", task.UUID)
		if err := s.Database.FailTask(task.UUID, now); err != nil {
			return fmt.Errorf("Failed to sweep database of running tasks [%s]: %s", task.UUID, err)
		}
		if task.Op == db.BackupOperation && task.ArchiveUUID != nil {
			archive, err := s.Database.GetArchive(task.ArchiveUUID)
			if err != nil {
				log.Warnf("Unable to retrieve archive %s (for task %s) from the database: %s",
					task.ArchiveUUID, task.UUID, err)
				continue
			}
			log.Warnf("Found archive %s for task %s, purging", archive.UUID, task.UUID)
			task, err := s.Database.CreatePurgeTask("", archive, s.PurgeAgent)
			if err != nil {
				log.Errorf("Failed to purge archive %s (for task %s, which was running at boot): %s",
					archive.UUID, task.UUID, err)
			} else {
				s.ScheduleTask(task)
			}
		}
	}

	return nil
}
Example #2
0
func (s *Supervisor) ReschedulePendingTasks() error {
	tasks, err := s.Database.GetAllTasks(
		&db.TaskFilter{
			ForStatus: db.PendingStatus,
		},
	)
	if err != nil {
		return fmt.Errorf("Failed to sweep database of pending tasks: %s", err)
	}

	for _, task := range tasks {
		log.Warnf("Found task %s in 'pending' state at startup; rescheduling", task.UUID)
		s.ScheduleTask(task)
	}

	return nil
}
Example #3
0
func (s *Supervisor) Run() error {
	if err := s.Database.Connect(); err != nil {
		return fmt.Errorf("failed to connect to %s database at %s: %s\n",
			s.Database.Driver, s.Database.DSN, err)
	}

	if err := s.Database.CheckCurrentSchema(); err != nil {
		return fmt.Errorf("database failed schema version check: %s\n", err)
	}

	if err := s.Resync(); err != nil {
		return err
	}
	if err := s.FailUnfinishedTasks(); err != nil {
		return err
	}
	if err := s.ReschedulePendingTasks(); err != nil {
		return err
	}

	for {
		select {
		case <-s.resync:
			if err := s.Resync(); err != nil {
				log.Errorf("resync error: %s", err)
			}

		case <-s.purge.C:
			s.PurgeArchives()

		case <-s.tick.C:
			s.CheckSchedule()

			// see if any tasks have been running past the timeout period
			if len(s.runq) > 0 {
				ok := true
				lst := make([]*db.Task, 0)
				now := timestamp.Now()

				for _, runtask := range s.runq {
					if now.After(runtask.TimeoutAt) {
						s.Database.CancelTask(runtask.UUID, now.Time())
						log.Errorf("shield timed out task '%s' after running for %v", runtask.UUID, s.Timeout)
						ok = false

					} else {
						lst = append(lst, runtask)
					}
				}

				if !ok {
					s.runq = lst
				}
			}

			// see if we have anything in the schedule queue
		SchedQueue:
			for len(s.schedq) > 0 {
				select {
				case s.workers <- s.schedq[0]:
					s.Database.StartTask(s.schedq[0].UUID, time.Now())
					s.schedq[0].Attempts++
					log.Infof("sent a task to a worker")
					s.runq = append(s.runq, s.schedq[0])
					log.Debugf("added task to the runq")
					s.schedq = s.schedq[1:]
				default:
					break SchedQueue
				}
			}

		case adhoc := <-s.adhoc:
			s.ScheduleAdhoc(adhoc)

		case u := <-s.updates:
			switch u.Op {
			case STOPPED:
				log.Infof("  %s: job stopped at %s", u.Task, u.StoppedAt)
				s.RemoveTaskFromRunq(u.Task)
				if err := s.Database.CompleteTask(u.Task, u.StoppedAt); err != nil {
					log.Errorf("  %s: !! failed to update database - %s", u.Task, err)
				}

			case FAILED:
				log.Warnf("  %s: task failed!", u.Task)
				s.RemoveTaskFromRunq(u.Task)
				if err := s.Database.FailTask(u.Task, u.StoppedAt); err != nil {
					log.Errorf("  %s: !! failed to update database - %s", u.Task, err)
				}

			case OUTPUT:
				log.Infof("  %s> %s", u.Task, strings.Trim(u.Output, "\n"))
				if err := s.Database.UpdateTaskLog(u.Task, u.Output); err != nil {
					log.Errorf("  %s: !! failed to update database - %s", u.Task, err)
				}

			case RESTORE_KEY:
				log.Infof("  %s: restore key is %s", u.Task, u.Output)
				if id, err := s.Database.CreateTaskArchive(u.Task, u.Output, time.Now()); err != nil {
					log.Errorf("  %s: !! failed to update database - %s", u.Task, err)
				} else {
					if !u.TaskSuccess {
						s.Database.InvalidateArchive(id)
					}
				}

			case PURGE_ARCHIVE:
				log.Infof("  %s: archive %s purged from storage", u.Task, u.Archive)
				if err := s.Database.PurgeArchive(u.Archive); err != nil {
					log.Errorf("  %s: !! failed to update database - %s", u.Task, err)
				}

			default:
				log.Errorf("  %s: !! unrecognized op type", u.Task)
			}
		}
	}
}