// Setup initializes the package-level state from the parsed config and starts the
// background threads: throttling, host updates, the periodic run queue select and
// the deleted-rows check.
func Setup(config common.FullConfig) {
	isDevelServer = config.GetIsDevel()

	dispatchThreads.v = make(map[string]map[string]*DispatcherData)
	launcherThreads.v = make(map[string]*LauncherData)
	killerThreads.v = make(map[string]map[string]bool)

	throttle.c = make(chan bool, THROTTLE_CHAN_CAPACITY)
	throttle.setIntervalCh = make(chan time.Duration, 1)
	go throttleThread()

	rusageInfo.groupsMaxParrots = make(map[string]uint64)
	rusageInfo.groupHosts = make(map[string][]string)
	rusageInfo.hostsInfo = make(map[string]*ServerInfo)
	rusageInfo.loadEstimate = make(map[string]*ScriptRusageEntry)
	rusageInfo.timetableRusage = make(map[uint64]*ScriptRusageEntry)
	rusageInfo.groupIdx = make(map[string]uint64)

	def := config.GetDefault()
	defaultParasiteMemory = def.GetParasiteMemory()
	defaultMinIdleCpu = def.GetMinIdleCpu()
	defaultMinMemory = def.GetMinMemory()
	defaultMinMemoryRatio = def.GetMinMemoryRatio()
	defaultMaxMemory = def.GetMaxMemory()
	defaultRusage = def.GetRusage()

	cycleMs = config.GetCycleMs()
	ttReloadIntervalMs = config.GetFullTimetableReloadIntervalMs()
	autoIncrementIncrement = getAutoIncrementIncrement()

	launcherConf := config.GetLauncher()
	hostSuffix = launcherConf.GetHostSuffix()
	basePath = launcherConf.GetBasePath()

	if launcherConf.DeveloperPath != nil {
		haveDeveloper = true
		developerPath = launcherConf.GetDeveloperPath()
		log.Printf("We have developer dir: %s", developerPath)
	}

	log.Printf("Updating hosts")
	updateHosts()

	log.Printf("Launching update hosts thread")
	go updateHostsThread()

	log.Printf("Clearing old heartbeats")
	if err := clearOldHeartbeats(); err != nil {
		log.Fatalf("Could not clear old heartbeats: %s", err.Error())
	}

	log.Printf("Launching periodic run queue select thread")
	go runqueuePeriodicSelectThread()
	go forceCheckDeletedThread()
}
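// A minimal usage sketch (an assumption, not part of the original source): Setup is
// expected to run once at process start, after the protobuf config has been parsed
// and before the dispatch/launch cycle begins. The config-loading helper name below
// is a hypothetical placeholder for illustration only.
//
//	config := parseFullConfig("scheduler.conf") // hypothetical helper returning common.FullConfig
//	Setup(config)
//	// ... start the main dispatch/launch cycle here ...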
func (d *DispatcherData) checkZero(src string) {
	if len(d.addedMap) == 0 && d.waitingList.Len() == 0 {
		log.Debugf("No rows left in class=%s, location=%s (%s)", d.className, d.location, src)
		trigger(d.zeroTTCh, "zerott")

		if d.killRequest != nil {
			log.Printf("Killed all jobs in class=%s, location=%s, waiting on continue channel", d.className, d.location)
			d.killRequest.ResCh <- nil
			d.killRequest = nil
			log.Printf("Can continue dispatching in class=%s, location=%s", d.className, d.location)
		}
	}
}
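// For reference, a minimal sketch of what the trigger helper used above is assumed
// to do (the real implementation lives elsewhere in this package, and zeroTTCh is
// assumed to be a buffered chan bool): a non-blocking send on a notification channel,
// so repeated checkZero calls never block when a notification is already pending.
//
//	func trigger(ch chan bool, name string) {
//	    select {
//	    case ch <- true:
//	    default: // a notification is already pending; dropping this one is fine
//	    }
//	}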
// updateStatus moves a run queue entry between the per-status maps and stamps the
// timestamp column that corresponds to the new status.
func (d *LauncherData) updateStatus(row *RunQueueEntry, newStatus string) {
	if row.RunStatus == newStatus {
		return
	}

	if LAUNCHER_DB_DEBUG {
		log.Printf("Updating status of row #%d (%s) from %s to %s", row.Id, row.ClassName, row.RunStatus, newStatus)
	}

	if old := d.getMapByStatus(row.RunStatus); old != nil {
		delete(old, row.Id)
	}

	if m := d.getMapByStatus(newStatus); m != nil {
		m[row.Id] = row
	}

	row.RunStatus = newStatus
	now := time.Now().Unix()

	switch newStatus {
	case RUN_STATUS_WAITING:
		row.waiting_ts.Valid = true
		row.waiting_ts.Int64 = now
	case RUN_STATUS_INIT:
		row.init_ts.Valid = true
		row.init_ts.Int64 = now
	case RUN_STATUS_RUNNING:
		row.running_ts.Valid = true
		row.running_ts.Int64 = now
	case RUN_STATUS_FINISHED:
		// a finished row also stamps running_ts; there is no dedicated finished_ts handling here
		row.running_ts.Valid = true
		row.running_ts.Int64 = now
	}
}
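// Illustrative call sequence (hypothetical, for documentation only — the actual
// transition order is driven by the launcher loop elsewhere in this package): each
// transition stamps the matching *_ts column on the row, and a repeated status is
// a no-op thanks to the early return above.
//
//	d.updateStatus(row, RUN_STATUS_INIT)     // stamps init_ts
//	d.updateStatus(row, RUN_STATUS_RUNNING)  // stamps running_ts
//	d.updateStatus(row, RUN_STATUS_FINISHED) // also stamps running_ts
//	d.updateStatus(row, RUN_STATUS_FINISHED) // no-op: status is unchanged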
// addToMaps registers a freshly selected run queue row in its per-status map and in
// the allMap index, skipping rows with broken settings or rows that were already added.
func (d *LauncherData) addToMaps(row *RunQueueEntry) {
	if row.settings == nil {
		buf := make([]byte, 5000)
		n := runtime.Stack(buf, false)
		log.Warningf("Incorrect row in run queue (addToMaps), settings are invalid: %+v\ntrace:%s", row, buf[0:n])
		return
	}

	if m := d.getMapByStatus(row.RunStatus); m != nil {
		if LAUNCHER_DB_DEBUG {
			log.Printf("RQ row from db: id=%d, class=%s, job_data=%s, hostname=%s", row.Id, row.ClassName, row.JobData, row.hostname)
		}

		if d.allMap[row.Id] != nil {
			log.Warnf("Trying to add already added into run_queue (all map): %+v", row)
			return
		}
		if m[row.Id] != nil {
			log.Warnf("Trying to add already added into run_queue (own map): %+v", row)
			return
		}

		m[row.Id] = row
		d.allMap[row.Id] = row
	} else {
		log.Warnf("Broken run status: %+v", row)
	}
}
func printFreeResources(infoMap map[string]*ServerInfo) {
	spisok := make([]string, 0, len(infoMap))
	for hostname := range infoMap {
		spisok = append(spisok, hostname)
	}
	sort.Strings(spisok)

	for _, hostname := range spisok {
		row := infoMap[hostname]
		log.Printf("freeresources: host: %-5s serverInfo:%s", hostname, row)
	}
}
// loadSettingsFromRows collects the settings ids referenced by the freshly selected
// scripts and job info rows and loads the ones that are not yet cached in allSettings.
func loadSettingsFromRows(jiRows map[string]map[string]*JobInfoEntry, scripts map[string]*ScriptEntry) error {
	newIds := make(map[uint64]bool)
	jiRowsCnt := 0

	func() {
		allSettingsMutex.Lock()
		defer allSettingsMutex.Unlock()

		for _, row := range scripts {
			if _, ok := allSettings[row.settings_id]; !ok {
				newIds[row.settings_id] = true
			}
		}

		for _, rows := range jiRows {
			for _, row := range rows {
				jiRowsCnt++
				if _, ok := allSettings[row.settings_id]; !ok {
					newIds[row.settings_id] = true
				}
			}
		}
	}()

	if len(newIds) > 0 {
		loadNewIdsTs := time.Now().UnixNano()
		err := loadNewIds(newIds)
		if err != nil {
			return err
		}
		log.Printf("Loaded %d new ids for %.5f sec", len(newIds), float64(time.Now().UnixNano()-loadNewIdsTs)/1e9)
	}

	log.Debugln(" Selected", len(scripts), "rows from scripts")
	log.Debugln(" Selected", jiRowsCnt, "rows from job info")

	return nil
}
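// The anonymous func above is simply a way to scope allSettingsMutex with defer so the
// lock is released before the potentially slow loadNewIds call. A generic sketch of
// the idiom (names here are illustrative, not from this package):
//
//	func() {
//	    mu.Lock()
//	    defer mu.Unlock()
//	    // ... inspect or modify shared state ...
//	}()
//	// ... slow work continues here without holding mu ...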
// startKilling sends a kill request to every dispatcher thread of the given class
// that is not already being killed, remembering which locations have been notified.
func startKilling(className string) {
	killerThreads.Lock()
	defer killerThreads.Unlock()

	dispatchThreads.Lock()
	defer dispatchThreads.Unlock()

	locs, ok := dispatchThreads.v[className]
	if !ok {
		return
	}

	killMap := killerThreads.v[className]
	if killMap == nil {
		killMap = make(map[string]bool)
		killerThreads.v[className] = killMap
	}

	for loc, dt := range locs {
		if killMap[loc] {
			continue
		}

		req := &KillRequest{
			ResCh: make(chan error, 1),
		}

		log.Printf("Sending kill request to class=%s, location=%s", className, loc)

		select {
		case dt.killRequestCh <- req:
			killMap[loc] = true
		default:
			log.Warnf("Could not send kill request to class=%s, location=%s, kill channel was busy", className, loc)
		}
	}
}
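// A rough sketch (an assumption; the real dispatcher loop lives elsewhere in this
// package) of how the kill request sent above is expected to be consumed: the
// dispatcher remembers it, stops launching new jobs, and checkZero answers on
// req.ResCh once addedMap and waitingList are empty.
//
//	select {
//	case req := <-d.killRequestCh:
//	    d.killRequest = req
//	    d.checkZero("killRequest")
//	// ... other dispatcher events ...
//	}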
// generateJobs creates new timetable and job info entries for className according to
// its settings, honoring the kill/pause/run flags.
//
// haveTTRows must be nil if there are no timetable entries for any location;
// otherwise it must contain only true entries, i.e. map["location"] => true.
// The job generation logic could probably be simplified; it is just the way it is.
func generateJobs(tx *db.LazyTrx, className string, settings *ScriptSettings, jiRows map[string]*JobInfoEntry, haveTTRows map[string]bool, flags *FlagEntry) (add_to_timetable []*TimetableEntry, err error) {
	if haveTTRows != nil && len(haveTTRows) == 0 {
		haveTTRows = nil
	}

	now := time.Now().Unix()

	add_to_timetable = make([]*TimetableEntry, 0)
	add_job_info := make([]*JobInfoEntry, 0)
	set_finish_jobs := make([]string, 0)
	set_init_jobs := make([]string, 0)
	set_jobs_generated_js := make([]string, 0)
	prepare_next_generation := make([]NextGenParams, 0)

	have_finish_jobs := settings.jobs.Have_finish_jobs
	is_any := (settings.location_type == LOCATION_TYPE_ANY)
	is_temporary := settings.jobs.Temporary
	temporary_can_run := false

	if flags != nil {
		if flags.kill_requested_ts.Valid {
			is_done := (haveTTRows == nil)
			if is_done {
				log.Printf("Class %s is done, all is ok", className)

				if !flags.killed_ts.Valid {
					tx.AddCommitCallback(func() { continueDispatchAfterKill(className) })
					if err = setKilledFlag(tx, className); err != nil {
						return
					}

					if err = prepareNextGeneration(tx, have_finish_jobs, className, settings); err != nil {
						return
					}
				}
			} else {
				log.Printf("Class %s is not done", className)

				startKilling(className)

				// not the best place to put it, but it works
				if err = setMaxFinishedTs(tx, className, flags.kill_request_employee_id.Int64, flags.kill_requested_ts.Int64); err != nil {
					return
				}
			}

			return
		}

		// Stop generating new job generations when we are on pause
		if flags.pause_requested_ts.Valid {
			is_done := generationFinished(className, haveTTRows, jiRows, settings)

			if is_done && !flags.paused_ts.Valid {
				if err = setPausedFlag(tx, className); err != nil {
					return
				}

				flags.paused_ts = sql.NullInt64{Int64: now, Valid: true}
			}

			if !is_any || flags.paused_ts.Valid {
				return
			}
		}

		if is_temporary && flags.run_requested_ts.Valid && is_any {
			// We accepted the run request, which means that we already generated jobs
			if flags.run_accepted_ts.Valid {
				if generationFinished(className, haveTTRows, jiRows, settings) {
					if err = resetRunRequest(tx, className); err != nil {
						return
					}

					if err = prepareNextGeneration(tx, have_finish_jobs, className, settings); err != nil {
						return
					}

					return
				}
			} else {
				if err = setRunAccepted(tx, className); err != nil {
					return
				}
			}

			temporary_can_run = true
		}
	}

	if (is_temporary && !temporary_can_run) || settings.jobs.Type == JOBS_TYPE_NONE {
		return
	}

	locations := make([]string, 0)

	if !is_any {
		all_locations := getLocations(settings)
		timetable_locations := make(map[string]bool)

		if haveTTRows != nil {
			for location := range haveTTRows {
				timetable_locations[location] = true
			}
		}

		// there can be failed hosts that are still running: we must really compare host names, not just counts
		for _, loc := range all_locations {
			if _, ok := timetable_locations[loc]; !ok {
				locations = append(locations, loc)
			}
		}

		if len(locations) == 0 {
			return
		}
	} else {
		if haveTTRows != nil && len(haveTTRows) > 0 {
			return
		}

		locations = getLocations(settings)
	}

	tt_location_type := LOCATION_TYPE_EACH
	if is_any {
		tt_location_type = LOCATION_TYPE_ANY
	}

	for _, location := range locations {
		job_info_key, gliErr := getLocationIdx(tt_location_type, location)
		if gliErr != nil {
			log.Warningf("Error getting location index for %s for location_type %s and location %s: %s", className, tt_location_type, location, gliErr.Error())
			continue
		}

		var row *JobInfoEntry

		if jiRows == nil || jiRows[job_info_key] == nil {
			row = &JobInfoEntry{
				generation_id:        0,
				class_name:           className,
				location:             job_info_key,
				next_generate_job_ts: sql.NullInt64{Int64: int64(getNextJobGenerateTs(className, true, 0, settings)), Valid: true},
				settings_id:          settings.id,
			}

			add_job_info = append(add_job_info, row)
		} else {
			row = jiRows[job_info_key]
		}

		tt_row := &TimetableEntry{
			class_name:            className,
			default_retry:         settings.retry_job,
			repeat:                settings.repeat_job,
			method:                METHOD_RUN,
			finished_successfully: 0,
			generation_id:         sql.NullInt64{Int64: int64(row.generation_id), Valid: true},
			settings_id:           row.settings_id,
			location:              location,
			created:               uint64(now),
		}

		tt_row.NextLaunchTs.Valid = true
		tt_row.NextLaunchTs.Int64 = now

		if row.jobs_generated_ts.Valid || row.init_jobs_ts.Valid {
			if have_finish_jobs && !row.finish_jobs_ts.Valid {
				set_finish_jobs = append(set_finish_jobs, job_info_key)

				tt_row.JobData = `"finishJobs"`
				tt_row.method = METHOD_FINISH_JOBS
				tt_row.default_retry = settings.retry_job

				add_to_timetable = append(add_to_timetable, tt_row)
			} else {
				prepare_next_generation = append(prepare_next_generation, NextGenParams{Location: job_info_key, JobInfo: row})
			}

			continue
		} else if row.next_generate_job_ts.Int64 > now {
			continue
		}

		if settings.jobs.Type == JOBS_TYPE_CUSTOM {
			set_init_jobs = append(set_init_jobs, job_info_key)

			tt_row.JobData = `"initJobs"`
			tt_row.method = METHOD_INIT_JOBS
			tt_row.default_retry = uint32(settings.retry.Int64)

			add_to_timetable = append(add_to_timetable, tt_row)
			continue
		}

		jobs, mjlErr := makeJobsList(settings.jobs, settings.instance_count, className)
		if mjlErr != nil {
			log.Warningf("Error generating jobs for %+v with instance_count=%d and jobs=%s: %s", className, settings.instance_count, settings.jobs, mjlErr.Error())
			continue
		}

		for _, job := range jobs {
			tt_row_copy := new(TimetableEntry)
			*tt_row_copy = *tt_row
			tt_row_copy.JobData = job
			add_to_timetable = append(add_to_timetable, tt_row_copy)
		}

		set_jobs_generated_js = append(set_jobs_generated_js, job_info_key)
	}

	if err = addJobInfo(tx, add_job_info); err != nil {
		return
	}

	if err = setFinishJobsTs(tx, className, set_finish_jobs); err != nil {
		return
	}

	if err = batchPrepareNextGeneration(tx, have_finish_jobs, className, prepare_next_generation, settings); err != nil {
		return
	}

	if err = setInitJobsTs(tx, className, set_init_jobs); err != nil {
		return
	}

	if err = setJobsGeneratedTs(tx, className, set_jobs_generated_js); err != nil {
		return
	}

	if err = addToTimetable(tx, add_to_timetable); err != nil {
		return
	}

	return
}
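// buildHaveTTRows is a hypothetical helper (not present in the original source) that
// illustrates the haveTTRows contract documented above generateJobs: callers pass nil
// when the class has no timetable rows at all, and otherwise a set-like map containing
// only true values keyed by location. The ttRows parameter shape is an assumption made
// for this sketch.
func buildHaveTTRows(ttRows map[string][]*TimetableEntry) map[string]bool {
	have := make(map[string]bool, len(ttRows))
	for location, rows := range ttRows {
		if len(rows) > 0 {
			have[location] = true
		}
	}
	if len(have) == 0 {
		return nil
	}
	return have
}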