func (s *Supervisor) FailUnfinishedTasks() error { tasks, err := s.Database.GetAllTasks( &db.TaskFilter{ ForStatus: db.RunningStatus, }, ) if err != nil { return fmt.Errorf("Failed to sweep database of running tasks: %s", err) } now := time.Now() for _, task := range tasks { log.Warnf("Found task %s in 'running' state at startup; setting to 'failed'", task.UUID) if err := s.Database.FailTask(task.UUID, now); err != nil { return fmt.Errorf("Failed to sweep database of running tasks [%s]: %s", task.UUID, err) } if task.Op == db.BackupOperation && task.ArchiveUUID != nil { archive, err := s.Database.GetArchive(task.ArchiveUUID) if err != nil { log.Warnf("Unable to retrieve archive %s (for task %s) from the database: %s", task.ArchiveUUID, task.UUID, err) continue } log.Warnf("Found archive %s for task %s, purging", archive.UUID, task.UUID) task, err := s.Database.CreatePurgeTask("", archive, s.PurgeAgent) if err != nil { log.Errorf("Failed to purge archive %s (for task %s, which was running at boot): %s", archive.UUID, task.UUID, err) } else { s.ScheduleTask(task) } } } return nil }
func (s *Supervisor) ReschedulePendingTasks() error { tasks, err := s.Database.GetAllTasks( &db.TaskFilter{ ForStatus: db.PendingStatus, }, ) if err != nil { return fmt.Errorf("Failed to sweep database of pending tasks: %s", err) } for _, task := range tasks { log.Warnf("Found task %s in 'pending' state at startup; rescheduling", task.UUID) s.ScheduleTask(task) } return nil }
func (s *Supervisor) Run() error { if err := s.Database.Connect(); err != nil { return fmt.Errorf("failed to connect to %s database at %s: %s\n", s.Database.Driver, s.Database.DSN, err) } if err := s.Database.CheckCurrentSchema(); err != nil { return fmt.Errorf("database failed schema version check: %s\n", err) } if err := s.Resync(); err != nil { return err } if err := s.FailUnfinishedTasks(); err != nil { return err } if err := s.ReschedulePendingTasks(); err != nil { return err } for { select { case <-s.resync: if err := s.Resync(); err != nil { log.Errorf("resync error: %s", err) } case <-s.purge.C: s.PurgeArchives() case <-s.tick.C: s.CheckSchedule() // see if any tasks have been running past the timeout period if len(s.runq) > 0 { ok := true lst := make([]*db.Task, 0) now := timestamp.Now() for _, runtask := range s.runq { if now.After(runtask.TimeoutAt) { s.Database.CancelTask(runtask.UUID, now.Time()) log.Errorf("shield timed out task '%s' after running for %v", runtask.UUID, s.Timeout) ok = false } else { lst = append(lst, runtask) } } if !ok { s.runq = lst } } // see if we have anything in the schedule queue SchedQueue: for len(s.schedq) > 0 { select { case s.workers <- s.schedq[0]: s.Database.StartTask(s.schedq[0].UUID, time.Now()) s.schedq[0].Attempts++ log.Infof("sent a task to a worker") s.runq = append(s.runq, s.schedq[0]) log.Debugf("added task to the runq") s.schedq = s.schedq[1:] default: break SchedQueue } } case adhoc := <-s.adhoc: s.ScheduleAdhoc(adhoc) case u := <-s.updates: switch u.Op { case STOPPED: log.Infof(" %s: job stopped at %s", u.Task, u.StoppedAt) s.RemoveTaskFromRunq(u.Task) if err := s.Database.CompleteTask(u.Task, u.StoppedAt); err != nil { log.Errorf(" %s: !! failed to update database - %s", u.Task, err) } case FAILED: log.Warnf(" %s: task failed!", u.Task) s.RemoveTaskFromRunq(u.Task) if err := s.Database.FailTask(u.Task, u.StoppedAt); err != nil { log.Errorf(" %s: !! failed to update database - %s", u.Task, err) } case OUTPUT: log.Infof(" %s> %s", u.Task, strings.Trim(u.Output, "\n")) if err := s.Database.UpdateTaskLog(u.Task, u.Output); err != nil { log.Errorf(" %s: !! failed to update database - %s", u.Task, err) } case RESTORE_KEY: log.Infof(" %s: restore key is %s", u.Task, u.Output) if id, err := s.Database.CreateTaskArchive(u.Task, u.Output, time.Now()); err != nil { log.Errorf(" %s: !! failed to update database - %s", u.Task, err) } else { if !u.TaskSuccess { s.Database.InvalidateArchive(id) } } case PURGE_ARCHIVE: log.Infof(" %s: archive %s purged from storage", u.Task, u.Archive) if err := s.Database.PurgeArchive(u.Archive); err != nil { log.Errorf(" %s: !! failed to update database - %s", u.Task, err) } default: log.Errorf(" %s: !! unrecognized op type", u.Task) } } } }