func (d *LauncherData) persistFinishAndNotify(row *RunQueueEntry, success bool, prevStatus string) error {
	location, err := getLocationIdx(row.settings.location_type, d.hostname)
	if err != nil {
		log.Warningf("Could not get location idx for row %+v, settings: %+v, reason: %s", row, row.settings, err.Error())
		return err
	}

	if row.timetable_id.Int64 == 0 {
		err := db.DoInLazyTransaction(func(tx *db.LazyTrx) error {
			return deleteFromRunQueue(tx, []uint64{row.Id}, prevStatus)
		})

		if err != nil {
			log.Warningf("Could not delete incorrectly finished run queue entry in %+v: %s", row, err.Error())
			return err
		}
	} else {
		if err := notifyTTFinished(row.ClassName, location, uint64(row.timetable_id.Int64), row.Id, success, true, prevStatus); err != nil {
			log.Warningf("Could not notify about timetable finish: %s", err.Error())
			return err
		}
	}

	return nil
}
func (d *LauncherData) processWaitingInInit() {
	for run_id, cachedRow := range d.initMap {
		_, rawResp, err := d.call(&badoo_phproxyd.RequestCheck{Hash: proto.Uint64(run_id)})

		if shouldHaveFinished(cachedRow) {
			d.logIncorrectFinish(nil, cachedRow, RUN_STATUS_INIT)
			continue
		}

		if err != nil {
			continue
		}

		row, err := getRunningInfo(run_id)
		if err != nil {
			log.Warnf("Could not get running info from DB in method processWaitingInInit for hostname=%s, class=%s, run id=%d, err: %s", d.hostname, cachedRow.ClassName, run_id, err.Error())
			continue
		}

		if cachedRow.settings_id != row.settings_id {
			log.Warnf("Broken row in cache or db for id=%d, settings_id is different (cache=%d, db=%d)", run_id, cachedRow.settings_id, row.settings_id)
			continue
		}

		row.settings = cachedRow.settings

		switch resp := rawResp.(type) {
		case *badoo_phproxyd.ResponseCheck:
			d.logIncorrectFinish(resp, row, row.RunStatus)
		case *badoo_phproxyd.ResponseGeneric:
			result := -badoo_phproxyd.Errno(resp.GetErrorCode())

			if result == badoo_phproxyd.Errno_ERRNO_NOT_FOUND || result == badoo_phproxyd.Errno_ERRNO_ALREADY_RUNNING {
				// be lenient when the server is so slow that it does not launch the script within 0.5 sec:
				// only act once INIT_TIMEOUT_SEC has passed since should_init_ts
				if time.Now().Unix()-INIT_TIMEOUT_SEC > row.should_init_ts.Int64 {
					action := KILL_ACTION_NO_ACTION
					if result == badoo_phproxyd.Errno_ERRNO_NOT_FOUND {
						action = KILL_ACTION_LOG_SCRIPT_FINISH_INIT
					}

					d.terminate(row, action)
				} else if result == badoo_phproxyd.Errno_ERRNO_NOT_FOUND {
					d.terminate(row, KILL_ACTION_SET_WAITING)
				}
			} else if result == badoo_phproxyd.Errno_ERRNO_FAILED_FINISHED {
				log.Warningf("Script %s finished with failure at %s", row.ClassName, d.hostname)
				d.logIncorrectFinish(nil, row, RUN_STATUS_INIT)
			} else if result == badoo_phproxyd.Errno_ERRNO_WAIT_FOR_FREE {
				log.Warningf("Waiting in init: lost results for %s at %s", row.ClassName, d.hostname)
				d.logIncorrectFinish(nil, row, RUN_STATUS_INIT)
			} else {
				log.Warningf("Unexpected return code %d (%s) for check request for class=%s, hostname=%s", result, result, row.ClassName, d.hostname)
			}
		default:
			log.Warningf("Received unexpected result from phproxyd at %s: type %T, result: %v", d.hostname, rawResp, rawResp)
		}
	}
}
func (d *LauncherData) processAllRows(rows []*RunQueueEntry) {
	for _, row := range rows {
		if row.settings == nil {
			log.Warningf("Incorrect row in run queue, settings are invalid: %+v", row)
			continue
		}

		el, ok := d.allMap[row.Id]
		if !ok {
			if d.deletedIds[row.Id] == 0 {
				d.addToMaps(row)
			}

			continue
		}

		if statusPriority[row.RunStatus] > statusPriority[el.RunStatus] ||
			row.RunStatus == RUN_STATUS_WAITING && el.RunStatus == RUN_STATUS_INIT && row.init_attempts > el.init_attempts {

			d.updateStatus(el, row.RunStatus)
			*el = *row
		}

		// external kill request has come
		if row.max_finished_ts.Valid && row.max_finished_ts.Int64 < el.max_finished_ts.Int64 {
			el.max_finished_ts = row.max_finished_ts
			el.stopped_employee_id = row.stopped_employee_id
		}
	}

	for runId, refCount := range d.deletedIds {
		if refCount--; refCount <= 0 {
			delete(d.deletedIds, runId)
		} else {
			d.deletedIds[runId] = refCount
		}
	}
}
func (d *LauncherData) acceptWaiting(jobs []*RunQueueEntry) {
	idsToSelect := make([]uint64, 0)
	settingsMap := make(map[uint64]*ScriptSettings)

	for _, row := range jobs {
		if row.settings == nil {
			log.Warningf("Incorrect row in run queue (waitingCh), settings are invalid: %+v", row)
			continue
		}

		if d.allMap[row.Id] != nil {
			continue
		}

		settingsMap[row.Id] = row.settings
		idsToSelect = append(idsToSelect, row.Id)
	}

	rqs, err := getRunningInfos(idsToSelect)
	if err != nil {
		log.Warnf("acceptWaiting could not select run_queue entries: %s", err.Error())
		return
	}

	for _, row := range rqs {
		row.settings = settingsMap[row.Id]
		d.addToMaps(row)
	}

	d.processWaiting()
}
func (d *DispatcherData) processNewJobsEv(ev *NewJobs) {
	ids := make([]uint64, 0, len(ev.rows))
	settingsMap := make(map[uint64]*ScriptSettings)

	for _, row := range ev.rows {
		if row.settings == nil {
			log.Warningf("Incorrect row in timetable (processNewJobsEv), settings are invalid: %+v", row)
			continue
		}

		if d.waitingMap[row.id] != nil || d.addedMap[row.id] != nil {
			continue
		}

		settingsMap[row.id] = row.settings
		ids = append(ids, row.id)
	}

	rows, err := selectTimetableByIds(ids)
	if err != nil {
		log.Warnf("Could not select tt_entries for ids %+v, err: %s", ids, err.Error())
		return
	}

	for _, row := range rows {
		row.settings = settingsMap[row.id]
	}

	d.acceptNewJobs(rows)
}
func (d *LauncherData) addToMaps(row *RunQueueEntry) {
	if row.settings == nil {
		buf := make([]byte, 5000)
		n := runtime.Stack(buf, false)
		log.Warningf("Incorrect row in run queue (addToMaps), settings are invalid: %+v\ntrace:%s", row, buf[0:n])
		return
	}

	if m := d.getMapByStatus(row.RunStatus); m != nil {
		if LAUNCHER_DB_DEBUG {
			log.Printf("RQ row from db: id=%d, class=%s, job_data=%s, hostname=%s", row.Id, row.ClassName, row.JobData, row.hostname)
		}

		if d.allMap[row.Id] != nil {
			log.Warnf("Trying to add a run_queue row that was already added (all map): %+v", row)
			return
		}

		if m[row.Id] != nil {
			log.Warnf("Trying to add a run_queue row that was already added (own map): %+v", row)
			return
		}

		m[row.Id] = row
		d.allMap[row.Id] = row
	} else {
		log.Warnf("Broken run status: %+v", row)
	}
}
func (d *LauncherData) call(req proto.Message) (msg_id uint32, resp proto.Message, err error) {
	client := gpbrpc.NewClient(d.hostname+hostSuffix, &badoo_phproxyd.Gpbrpc, &gpbrpc.GpbsCodec, time.Second, time.Second)
	defer client.Close()

	msg_id, resp, err = client.Call(req)
	if err != nil {
		log.Warningf("Call failed for host %s for message %+v, got error: %s", d.hostname, req, err.Error())
	}

	return
}
func GenerateJobsCycle() {
	hostname, err := os.Hostname()
	if err != nil {
		log.Fatalf("Could not get hostname: %s", err.Error())
	}

	log.Print("Initial select from RunQueue and starting launcher goroutines")
	if err := selectRQAndNotify(); err != nil {
		log.Fatalf("Could not do initial select from run queue: %s", err.Error())
	}

	log.Print("Initial select from Timetable")
	ttRows, err := selectTimetable()
	if err != nil {
		log.Fatalf("Could not do initial select from timetable: %s", err.Error())
	}

	log.Print("Starting jobgen goroutines")
	if err := notifyForFullTTSelect(ttRows, false); err != nil {
		log.Fatalf("Could not notify about timetable: %s", err.Error())
	}

	for {
		res, err := db.LockCycle(getLockName(), hostname)
		if err != nil || !res {
			if err == nil {
				log.Println("Could not get lock, another host holds it? Retrying in 10 seconds")
			} else {
				log.Warningf("Could not get lock, got DB error: %s", err.Error())
			}

			time.Sleep(time.Second * 10)
			continue
		}

		// timer := pinba.TimerStart(map[string]string{"group": "jobgenerator"})
		startTs := time.Now().UnixNano()

		db.LogCycleStart(CYCLE_CLASS_NAME, hostname, 0)
		log.Debug("Cycle started")
		success := doCycle()
		log.Debug("Cycle finished")

		successInt := 1
		if !success {
			successInt = 0
		}
		db.LogCycleStop(CYCLE_CLASS_NAME, hostname, 0, successInt)

		passedMs := int64((time.Now().UnixNano() - startTs) / 1e6)
		if passedMs < cycleMs {
			time.Sleep(time.Duration(cycleMs-passedMs) * time.Millisecond)
		}
	}
}
func runqueuePeriodicSelectThread() {
	timer := time.Tick(time.Millisecond * time.Duration(ttReloadIntervalMs))

	for {
		<-timer

		if err := selectRQAndNotify(); err != nil {
			log.Warningf("Could not perform periodic run queue select: %s", err.Error())
		}
	}
}
// select everything from timetable, notify dispatcher threads and start them if needed
func notifyForFullTTSelect(classLocTTRows map[string]map[string][]*TimetableEntry, isExisting bool) error {
	settingsIds := make(map[uint64]bool)
	for _, locTTRows := range classLocTTRows {
		for _, ttRows := range locTTRows {
			for _, row := range ttRows {
				settingsIds[row.settings_id] = true
			}
		}
	}

	err := loadNewIds(settingsIds)
	if err != nil {
		return err
	}

	for className, locTTRows := range classLocTTRows {
		// TODO: ensure that "location" is always the same for "any" type of script, otherwise goroutine will receive
		// multiple notifications
		for location, ttRows := range locTTRows {
			anyRows := make([]*TimetableEntry, 0, len(ttRows))
			eachRows := make([]*TimetableEntry, 0, len(ttRows))

			allSettingsMutex.Lock()
			for _, row := range ttRows {
				row.settings = allSettings[row.settings_id]
			}
			allSettingsMutex.Unlock()

			for _, row := range ttRows {
				if row.settings == nil {
					log.Warningf("Incorrect row in timetable, settings are invalid: %+v", row)
					continue
				}

				if row.settings.location_type == LOCATION_TYPE_EACH {
					eachRows = append(eachRows, row)
				} else {
					anyRows = append(anyRows, row)
				}
			}

			if len(anyRows) > 0 {
				notifyAboutNewTTRows(className, DEFAULT_LOCATION_IDX, anyRows, isExisting)
			}

			if len(eachRows) > 0 {
				notifyAboutNewTTRows(className, location, eachRows, isExisting)
			}
		}
	}

	return nil
}
func (d *LauncherData) processRunningTooLong() {
	now := time.Now().Unix()

	for run_id, row := range d.runningMap {
		if !row.max_finished_ts.Valid || row.max_finished_ts.Int64 >= now {
			continue
		}

		if shouldHaveFinished(row) {
			d.logIncorrectFinish(nil, row, RUN_STATUS_RUNNING)
			continue
		}

		_, rawResp, err := d.call(&badoo_phproxyd.RequestCheck{Hash: proto.Uint64(run_id)})
		if err != nil {
			log.Warningf("Could not call check at hostname %s: %s", d.hostname, err.Error())
			continue
		}

		switch resp := rawResp.(type) {
		case *badoo_phproxyd.ResponseCheck:
			d.logIncorrectFinish(resp, row, RUN_STATUS_RUNNING)
		case *badoo_phproxyd.ResponseGeneric:
			code := -badoo_phproxyd.Errno(resp.GetErrorCode())

			if code == badoo_phproxyd.Errno_ERRNO_ALREADY_RUNNING {
				d.terminate(row, KILL_ACTION_NO_ACTION)
			} else if code == badoo_phproxyd.Errno_ERRNO_WAIT_FOR_FREE {
				d.logIncorrectFinish(nil, row, RUN_STATUS_RUNNING)
			} else if code == badoo_phproxyd.Errno_ERRNO_NOT_FOUND {
				d.terminate(row, KILL_ACTION_LOG_SCRIPT_FINISH_RUNNING)
			} else {
				log.Warningf("Unexpected error code %d (%s) from phproxyd at %s (process running too long)", code, resp.GetErrorText(), d.hostname)
			}
		default:
			log.Warningf("Received unexpected result from phproxyd at %s: type %T, result: %v", d.hostname, rawResp, rawResp)
		}
	}
}
func (p *LazyTrx) Exec(queryTpl string, args ...interface{}) (sql.Result, error) {
	q, err := p.prepareFirstQuery(queryTpl, args...)
	if err != nil {
		return nil, err
	}

	res, err := p.tx.Exec(q)
	if err != nil {
		trace := make([]byte, 8192)
		n := runtime.Stack(trace, false)
		log.Warningf("Failed SQL query:\n'%s',\n\nReason: '%s',\n\nStack trace: %s\n", q, err.Error(), trace[0:n])
	}

	return res, err
}
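// For orientation, a minimal usage sketch of the lazy-transaction API as it appears
// elsewhere in this file (DoInLazyTransaction in persistFinishAndNotify and redispatch,
// explicit Begin/Commit/Rollback in doCycle). The function, the query and the table name
// "example_log" are hypothetical; the templating done by prepareFirstQuery is not shown.
func cleanupExampleLog() error {
	// DoInLazyTransaction runs the callback inside a LazyTrx; based on its usage in this
	// file, it presumably commits on a nil return and rolls back on error.
	return db.DoInLazyTransaction(func(tx *db.LazyTrx) error {
		// Exec builds the final query via prepareFirstQuery and runs it on the
		// underlying sql.Tx; failures are logged with a stack trace by Exec itself.
		_, err := tx.Exec("DELETE FROM example_log WHERE finished = 1")
		return err
	})
}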
func (d *DispatcherData) acceptNewJobs(jobs []*TimetableEntry) {
	for _, row := range jobs {
		if row.settings == nil {
			buf := make([]byte, 5000)
			n := runtime.Stack(buf, false)
			log.Warningf("Incorrect row in timetable (acceptNewJobs), settings are invalid: %+v\ntrace:%s", row, buf[0:n])
			continue
		}

		// bring in-memory state up to date with what is already recorded in the DB
		if row.added_to_queue_ts.Valid {
			d.addToAdded(row)

			if row.finished_ts.Valid && row.finish_count > 0 {
				row.finish_count--
				d.processFinished(&FinishEvent{timetable_id: row.id, success: row.finished_successfully != 0, errorCh: make(chan error, 1)})
			}
		} else {
			d.addToWaiting(row)
		}
	}

	trigger(d.redispatchCh, "redispatch")
}
func doCycle() bool {
	var (
		jiRows         map[string]map[string]*JobInfoEntry
		scripts        map[string]*ScriptEntry
		flags          map[string]*FlagEntry
		scriptsRusage  map[string]*ScriptRusageEntry
		classLocTTRows map[string]map[string][]*TimetableEntry
	)

	unifiedStartTs := time.Now().UnixNano()

	startTs := time.Now().UnixNano()
	err := loadFullState(
		&LoadStateFunc{name: "Scripts", fun: func() (err error) { scripts, err = getGroupedScriptsForPlatform(); return }},
		&LoadStateFunc{name: "JobInfo", fun: func() (err error) { jiRows, err = getGroupedJobInfo(); return }},
		&LoadStateFunc{name: "Flags", fun: func() (err error) { flags, err = getFlags(); return }},
		&LoadStateFunc{name: "ScriptsRusage", fun: func() (err error) { scriptsRusage, err = getScriptRusageStats(); return }},
		&LoadStateFunc{name: "ScriptTimetable", fun: func() (err error) { classLocTTRows, err = selectTimetable(); return }})

	if err != nil {
		log.Errorf("Failed to select state in doCycle: %s", err.Error())
		return false
	}

	log.Debugf("Loaded for %.5f sec", float64(time.Now().UnixNano()-startTs)/1e9)

	startTs = time.Now().UnixNano()
	err = loadSettingsFromRows(jiRows, scripts)
	if err != nil {
		log.Errorf("Could not load settings from rows: %s", err.Error())
		return false
	}

	func() {
		allSettingsMutex.Lock()
		defer allSettingsMutex.Unlock()

		for _, row := range scripts {
			row.settings = allSettings[row.settings_id]
		}
	}()

	scriptsMap.Lock()
	scriptsMap.v = scripts
	scriptsMap.Unlock()

	log.Debugf("  Selected %d rows from flags", len(flags))
	log.Debugf("  Selected %d rows from scripts rusage", len(scriptsRusage))
	log.Debugf("Load settings for %.5f sec", float64(time.Now().UnixNano()-startTs)/1e9)

	startTs = time.Now().UnixNano()

	// We should not try to generate jobs for scripts that are not present in the Script table,
	// but we should not forget their settings (e.g. the last generation_id) either.
	for class_name := range jiRows {
		if _, ok := scripts[class_name]; !ok {
			delete(jiRows, class_name)
		}
	}

	log.Debugf("Selected all for %.5f sec", float64(time.Now().UnixNano()-unifiedStartTs)/1e9)

	startTs = time.Now().UnixNano()
	updateLoadEstimates()
	log.Debugf("Load estimates updated for %.5f sec", float64(time.Now().UnixNano()-startTs)/1e9)

	func() {
		rusageInfo.Lock()
		defer rusageInfo.Unlock()
		log.Debugf("Group hosts: %+v", rusageInfo.groupHosts)
	}()

	startTs = time.Now().UnixNano()

	failedLocationsMutex.Lock()
	failedLocations = make(map[string]bool)
	failedLocationsMutex.Unlock()

	success := true

	if len(scripts) > 0 {
		throttle.setIntervalCh <- time.Second / time.Duration(len(scripts))
	}

	trigger(throttle.c, "throttle, start of cycle")

	for className, script := range scripts {
		<-throttle.c

		tx := new(db.LazyTrx)
		err := tx.Begin()
		if err != nil {
			log.Errorf("Could not start transaction in job generate: %s", err.Error())
			success = false
			continue
		}

		have := make(map[string]bool)
		locTtRows := classLocTTRows[className]
		if locTtRows != nil {
			for rawLoc, v := range locTtRows {
				loc, err := getLocationIdx(script.settings.location_type, rawLoc)
				if err != nil {
					log.Warningf("Broken settings for class %s: %s", className, err.Error())
					loc = rawLoc
				}
				if len(v) > 0 {
					have[loc] = true
				}
			}
		}

		add_to_timetable, err := generateJobs(tx, className, script.settings, jiRows[className], have, flags[className])
		if err != nil {
			log.Errorf("Could not generate jobs for class %s: %s", className, err.Error())
			tx.Rollback()
			success = false
			continue
		}

		err = tx.Commit()
		if err != nil {
			log.Errorf("Could not commit generated jobs for class %s: %s", className, err.Error())
			success = false
			continue
		}

		per_location := make(map[string][]*TimetableEntry)

		for _, row := range add_to_timetable {
			allSettingsMutex.Lock()
			row.settings = allSettings[row.settings_id]
			allSettingsMutex.Unlock()

			if row.settings == nil {
				log.Warningf("Internal inconsistency error: invalid settings for generated row: %+v", row)
				continue
			}

			key := DEFAULT_LOCATION_IDX
			if row.settings.location_type == LOCATION_TYPE_EACH {
				key = row.location
			}

			if _, ok := per_location[key]; !ok {
				per_location[key] = make([]*TimetableEntry, 0)
			}

			per_location[key] = append(per_location[key], row)
		}

		for location, rows := range per_location {
			notifyAboutNewTTRows(className, location, rows, true)
		}
	}

	notifyForFullTTSelect(classLocTTRows, true)

	log.Debugf("Processed %d classes for %.5f sec", len(scripts), float64(time.Now().UnixNano()-startTs)/1e9)
	log.Debugf("Total %.5f sec", float64(time.Now().UnixNano()-unifiedStartTs)/1e9)

	return success
}
// haveTTRows must be nil if there are no timetable entries for any location;
// otherwise it must contain only true entries like map["location"] => true.
// Job generation could probably be simplified, it is just the way it is.
func generateJobs(tx *db.LazyTrx, className string, settings *ScriptSettings, jiRows map[string]*JobInfoEntry, haveTTRows map[string]bool, flags *FlagEntry) (add_to_timetable []*TimetableEntry, err error) {
	if haveTTRows != nil && len(haveTTRows) == 0 {
		haveTTRows = nil
	}

	now := time.Now().Unix()

	add_to_timetable = make([]*TimetableEntry, 0)
	add_job_info := make([]*JobInfoEntry, 0)
	set_finish_jobs := make([]string, 0)
	set_init_jobs := make([]string, 0)
	set_jobs_generated_js := make([]string, 0)
	prepare_next_generation := make([]NextGenParams, 0)

	have_finish_jobs := settings.jobs.Have_finish_jobs
	is_any := (settings.location_type == LOCATION_TYPE_ANY)
	is_temporary := settings.jobs.Temporary
	temporary_can_run := false

	if flags != nil {
		if flags.kill_requested_ts.Valid {
			is_done := (haveTTRows == nil)
			if is_done {
				log.Printf("Class %s is done, all is ok", className)

				if !flags.killed_ts.Valid {
					tx.AddCommitCallback(func() { continueDispatchAfterKill(className) })
					if err = setKilledFlag(tx, className); err != nil {
						return
					}

					if err = prepareNextGeneration(tx, have_finish_jobs, className, settings); err != nil {
						return
					}
				}
			} else {
				log.Printf("Class %s is not done", className)

				startKilling(className) // not the best place to put it, but it works

				if err = setMaxFinishedTs(tx, className, flags.kill_request_employee_id.Int64, flags.kill_requested_ts.Int64); err != nil {
					return
				}
			}

			return
		}

		// Stop generating new job generations when we are on pause
		if flags.pause_requested_ts.Valid {
			is_done := generationFinished(className, haveTTRows, jiRows, settings)

			if is_done && !flags.paused_ts.Valid {
				if err = setPausedFlag(tx, className); err != nil {
					return
				}

				flags.paused_ts = sql.NullInt64{Int64: now, Valid: true}
			}

			if !is_any || flags.paused_ts.Valid {
				return
			}
		}

		if is_temporary && flags.run_requested_ts.Valid && is_any {
			// We accepted the run request, which means that we have already generated jobs
			if flags.run_accepted_ts.Valid {
				if generationFinished(className, haveTTRows, jiRows, settings) {
					if err = resetRunRequest(tx, className); err != nil {
						return
					}

					if err = prepareNextGeneration(tx, have_finish_jobs, className, settings); err != nil {
						return
					}

					return
				}
			} else {
				if err = setRunAccepted(tx, className); err != nil {
					return
				}
			}

			temporary_can_run = true
		}
	}

	if is_temporary && !temporary_can_run || settings.jobs.Type == JOBS_TYPE_NONE {
		return
	}

	locations := make([]string, 0)

	if !is_any {
		all_locations := getLocations(settings)
		timetable_locations := make(map[string]bool)

		if haveTTRows != nil {
			for location := range haveTTRows {
				timetable_locations[location] = true
			}
		}

		// there can be failed hosts that are still running: we must really compare host names, not just counts
		for _, loc := range all_locations {
			if _, ok := timetable_locations[loc]; !ok {
				locations = append(locations, loc)
			}
		}

		if len(locations) == 0 {
			return
		}
	} else {
		if haveTTRows != nil && len(haveTTRows) > 0 {
			return
		}

		locations = getLocations(settings)
	}

	tt_location_type := LOCATION_TYPE_EACH
	if is_any {
		tt_location_type = LOCATION_TYPE_ANY
	}

	for _, location := range locations {
		job_info_key, gliErr := getLocationIdx(tt_location_type, location)
		if gliErr != nil {
			log.Warningf("Error getting location index for %s for location_type %s and location %s: %s", className, tt_location_type, location, gliErr.Error())
			continue
		}

		var row *JobInfoEntry

		if jiRows == nil || jiRows[job_info_key] == nil {
			row = &JobInfoEntry{
				generation_id:        0,
				class_name:           className,
				location:             job_info_key,
				next_generate_job_ts: sql.NullInt64{Int64: int64(getNextJobGenerateTs(className, true, 0, settings)), Valid: true},
				settings_id:          settings.id,
			}

			add_job_info = append(add_job_info, row)
		} else {
			row = jiRows[job_info_key]
		}

		tt_row := &TimetableEntry{
			class_name:            className,
			default_retry:         settings.retry_job,
			repeat:                settings.repeat_job,
			method:                METHOD_RUN,
			finished_successfully: 0,
			generation_id:         sql.NullInt64{Int64: int64(row.generation_id), Valid: true},
			settings_id:           row.settings_id,
			location:              location,
			created:               uint64(now),
		}

		tt_row.NextLaunchTs.Valid = true
		tt_row.NextLaunchTs.Int64 = now

		if row.jobs_generated_ts.Valid || row.init_jobs_ts.Valid {
			if have_finish_jobs && !row.finish_jobs_ts.Valid {
				set_finish_jobs = append(set_finish_jobs, job_info_key)

				tt_row.JobData = `"finishJobs"`
				tt_row.method = METHOD_FINISH_JOBS
				tt_row.default_retry = settings.retry_job

				add_to_timetable = append(add_to_timetable, tt_row)
			} else {
				prepare_next_generation = append(prepare_next_generation, NextGenParams{Location: job_info_key, JobInfo: row})
			}

			continue
		} else if row.next_generate_job_ts.Int64 > now {
			continue
		}

		if settings.jobs.Type == JOBS_TYPE_CUSTOM {
			set_init_jobs = append(set_init_jobs, job_info_key)

			tt_row.JobData = `"initJobs"`
			tt_row.method = METHOD_INIT_JOBS
			tt_row.default_retry = uint32(settings.retry.Int64)

			add_to_timetable = append(add_to_timetable, tt_row)
			continue
		}

		jobs, mjlErr := makeJobsList(settings.jobs, settings.instance_count, className)
		if mjlErr != nil {
			log.Warningf("Error generating jobs for %+v with instance_count=%d and jobs=%s: %s", className, settings.instance_count, settings.jobs, mjlErr.Error())
			continue
		}

		for _, job := range jobs {
			tt_row_copy := new(TimetableEntry)
			*tt_row_copy = *tt_row
			tt_row_copy.JobData = job
			add_to_timetable = append(add_to_timetable, tt_row_copy)
		}

		set_jobs_generated_js = append(set_jobs_generated_js, job_info_key)
	}

	if err = addJobInfo(tx, add_job_info); err != nil {
		return
	}

	if err = setFinishJobsTs(tx, className, set_finish_jobs); err != nil {
		return
	}

	if err = batchPrepareNextGeneration(tx, have_finish_jobs, className, prepare_next_generation, settings); err != nil {
		return
	}

	if err = setInitJobsTs(tx, className, set_init_jobs); err != nil {
		return
	}

	if err = setJobsGeneratedTs(tx, className, set_jobs_generated_js); err != nil {
		return
	}

	if err = addToTimetable(tx, add_to_timetable); err != nil {
		return
	}

	return
}
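// To make the haveTTRows contract described above concrete, a small hypothetical
// illustration; the location strings are made-up examples and this helper does not
// exist in the real codebase.
func exampleHaveTTRows(hasAnyTimetableRows bool) map[string]bool {
	if !hasAnyTimetableRows {
		return nil // nil means: no timetable entries exist for any location
	}

	// otherwise: only `true` entries, keyed by location that already has timetable rows
	return map[string]bool{
		"192.168.0.1": true,
		"192.168.0.2": true,
	}
}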
func (d *DispatcherData) redispatch() {
	returnToWaitingList := make([]*TimetableEntry, 0)
	defer func() {
		for _, row := range returnToWaitingList {
			d.addToWaiting(row)
		}
	}()

	now := uint64(time.Now().Unix())

	newRqList := make([]*RunQueueEntry, 0)
	toDeleteFromWaitingList := make([]*TimetableEntry, 0)

	for l := d.waitingList.Len(); l > 0; l-- {
		row := heap.Pop(&d.waitingList).(*TimetableEntry)
		delete(d.waitingMap, row.id)

		if d.killRequest != nil {
			toDeleteFromWaitingList = append(toDeleteFromWaitingList, row)
			continue
		}

		if uint64(row.NextLaunchTs.Int64) > now {
			d.tickRedispatchCh = time.After(time.Second * time.Duration(uint64(row.NextLaunchTs.Int64)-now))
			returnToWaitingList = append(returnToWaitingList, row)
			break
		}

		if len(d.addedMap) >= row.settings.instance_count {
			returnToWaitingList = append(returnToWaitingList, row)
			break
		}

		if _, ok := d.addedJobData[row.JobData]; ok {
			if !row.reportedDup {
				log.Warningf("Duplicate job %s for class %s and location %s", row.JobData, d.className, row.location)
				row.reportedDup = true
			}

			returnToWaitingList = append(returnToWaitingList, row)
			continue
		}

		if row.method == METHOD_RUN && row.settings.ttl > 0 && now > row.created+uint64(row.settings.ttl) {
			if row.finish_count == 0 {
				log.Warningf("Job expired before being run even once: job %s for class %s and location %s", row.JobData, d.className, row.location)
			}

			toDeleteFromWaitingList = append(toDeleteFromWaitingList, row)
			continue
		}

		// do not try to dispatch next ones if selectHostname failed, and do not forget to return the row as well
		hostname, err := selectHostname(row.location, row.settings.location_type, d.rusage.cpu_usage, d.rusage.max_memory)
		if err != nil {
			logFailedLocation(row.settings, row.location, err.Error())
			d.tickRedispatchCh = time.After(time.Second)
			returnToWaitingList = append(returnToWaitingList, row)
			break
		} else {
			settings := row.settings
			if settings.location_type == LOCATION_TYPE_ANY && (settings.developer.String != "") && (settings.developer.String != "wwwrun") && ((now - uint64(settings.created)) <= DEVELOPER_CUSTOM_PATH_TIMEOUT) {
				hostname = DEVELOPER_DEBUG_HOSTNAME
			}

			log.Debugln("Selected ", hostname, " for ", row.location, " (loc_type=", settings.location_type, ")")
		}

		nullNow := sql.NullInt64{Valid: true, Int64: int64(now)}

		queueRow := &RunQueueEntry{
			ClassName:      d.className,
			timetable_id:   sql.NullInt64{Valid: true, Int64: int64(row.id)},
			generation_id:  row.generation_id,
			hostname:       hostname,
			hostname_idx:   getHostnameIdx(hostname),
			JobData:        row.JobData,
			method:         row.method,
			created:        nullNow,
			RunStatus:      RUN_STATUS_WAITING,
			waiting_ts:     nullNow,
			should_init_ts: nullNow,
			token:          row.token,
			retry_attempt:  row.retry_count,
			settings_id:    row.settings_id,
			settings:       row.settings,
		}

		newRqList = append(newRqList, queueRow)

		row.added_to_queue_ts.Valid = true
		row.added_to_queue_ts.Int64 = int64(now)

		d.addToAdded(row)
	}

	err := db.DoInLazyTransaction(func(tx *db.LazyTrx) error {
		return addToQueueAndDeleteExpired(tx, newRqList, toDeleteFromWaitingList)
	})

	if err == nil {
		for _, row := range toDeleteFromWaitingList {
			d.deletedIds[row.id] = DELETE_IDS_KEEP_GENERATIONS
		}

		// all rows can expire by TTL in this loop, so check whether that is the case and notify the job generator about it
		if len(toDeleteFromWaitingList) > 0 {
			d.checkZero("redispatch")
		}

		if len(newRqList) > 0 {
			perHost := make(map[string][]*RunQueueEntry)
			for _, row := range newRqList {
				perHost[row.hostname] = append(perHost[row.hostname], row)
			}

			for hostname, rows := range perHost {
				notifyAboutNewRQRows(hostname, rows, false)
			}
		}

		return
	}

	// restore internal structures back in case of error
	d.tickRedispatchCh = time.After(time.Second)

	log.Warnf("Could not add run queue entries for class %s and location %s to the database: %s", d.className, d.location, err.Error())

	for _, rqRow := range newRqList {
		row, ok := d.addedMap[uint64(rqRow.timetable_id.Int64)]
		if ok {
			row.added_to_queue_ts.Valid = false
			row.added_to_queue_ts.Int64 = 0
			row.id = uint64(rqRow.timetable_id.Int64)

			d.removeFromAdded(row)
			d.addToWaiting(row)
		} else {
			log.Warnf("Internal consistency error: could not find row with timetable id %d", rqRow.timetable_id.Int64)
		}
	}
}
// processFinished:
// 1. sends an error to ev.errorCh (nil if all is ok)
// 2. restores state upon failure
func (d *DispatcherData) processFinished(ev *FinishEvent) {
	var err error
	defer func() { ev.errorCh <- err }()

	row, ok := d.addedMap[ev.timetable_id]
	if !ok {
		if rowWaiting, ok := d.waitingMap[ev.timetable_id]; ok {
			log.Warningf("Got 'finished' event about waiting timetable_id: %d, class=%s, location=%s, row=%+v", ev.timetable_id, d.className, d.location, rowWaiting)
			err = fmt.Errorf("timetable id is waiting: %d, class=%s, location=%s", ev.timetable_id, d.className, d.location)
		} else {
			log.Warningf("Got 'finished' event about unknown timetable_id: %d, class=%s, location=%s", ev.timetable_id, d.className, d.location)
			err = fmt.Errorf("unknown timetable id: %d, class=%s, location=%s", ev.timetable_id, d.className, d.location)
		}

		return
	}

	now := uint64(time.Now().Unix())

	// restore everything in case of error
	rowCopy := *row
	defer func() {
		if err != nil {
			log.Warnf("Restoring old tt row (error: %s) from %+v => %+v", err.Error(), row, rowCopy)
			*row = rowCopy
		} else {
			// TODO: update rusage estimation
		}
	}()

	if !ev.isInitial {
		if ev.success {
			row.finished_successfully = 1
		} else {
			row.finished_successfully = 0
		}

		row.finish_count++

		if !ev.success {
			row.retry_count++
		} else {
			row.retry_count = 0
		}
	}

	row.NextLaunchTs.Valid = false
	row.NextLaunchTs.Int64 = 0

	var ttl uint32
	if row.method == METHOD_RUN {
		ttl = row.settings.ttl
	}

	// we should not delete entries that have ttl > 0 and have hit max retries, because the "repeat" field still applies
	shouldDelete := d.killRequest != nil ||
		(ttl == 0 && (ev.success || row.retry_count >= row.settings.max_retries)) ||
		(ttl > 0 && now > row.created+uint64(ttl))

	cb := func(tx *db.LazyTrx) error {
		var err error

		if ev.run_id != 0 {
			if ev.deleteRq {
				err = deleteFromRunQueue(tx, []uint64{ev.run_id}, ev.prevStatus)
			} else {
				err = errors.New("unexpected deleteRq value")
			}

			if err != nil {
				return err
			}
		}

		if shouldDelete {
			return deleteAddedFromTimetable(tx, []uint64{ev.timetable_id})
		}

		return logTTFinish(tx, row, ev.havePrevFinishCount, ev.prevFinishCount)
	}

	if shouldDelete {
		if err = db.DoInLazyTransaction(cb); err == nil {
			if row.id != ev.timetable_id {
				log.Warnf("Inconsistency of addedMap[%d] = row = %+v", ev.timetable_id, row)
				row.id = ev.timetable_id
			}

			d.removeFromAdded(row)
			d.deletedIds[ev.timetable_id] = DELETE_IDS_KEEP_GENERATIONS

			d.checkZero("processFinished")
			trigger(d.redispatchCh, "redispatch")
		} else {
			log.Warnf("Could not process finished: %s", err.Error())
		}

		return
	}

	next_launch_ts := int64(now)

	if ev.success && row.added_to_queue_ts.Valid {
		next_launch_ts = row.added_to_queue_ts.Int64 + row.repeat.Int64
	} else if !ev.success {
		// back off on failures: while retry_count < 3 wait default_retry,
		// then 2x, 4x, capped at 8x default_retry
		if row.retry_count < 3 {
			next_launch_ts += int64(row.default_retry)
		} else {
			e := row.retry_count - 2
			if e >= 3 {
				e = 3
			}

			next_launch_ts += (1 << e) * int64(row.default_retry)
		}
	}

	row.NextLaunchTs.Valid = true
	row.NextLaunchTs.Int64 = next_launch_ts

	row.finished_ts.Valid = false
	row.finished_ts.Int64 = 0
	row.added_to_queue_ts.Valid = false
	row.added_to_queue_ts.Int64 = 0

	if err = db.DoInLazyTransaction(cb); err == nil {
		d.removeFromAdded(row)
		d.addToWaiting(row)
		trigger(d.redispatchCh, "redispatch")
	}
}
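// As a reference for the contract in the comment above, a minimal sketch of how a
// FinishEvent is produced and its error awaited, modeled on the inline call in
// acceptNewJobs. This helper is hypothetical; whether real callers invoke
// processFinished directly or deliver the event through a dispatcher channel is
// outside this excerpt.
func notifyFinishedExample(d *DispatcherData, ttID uint64, success bool) error {
	ev := &FinishEvent{
		timetable_id: ttID,
		success:      success,
		errorCh:      make(chan error, 1), // buffered, so the deferred send never blocks
	}

	d.processFinished(ev)

	// processFinished always sends exactly one value: nil on success, or the error
	// that caused the in-memory timetable row to be rolled back.
	return <-ev.errorCh
}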
func setLoadEstimates(rqRows map[string]map[string]map[string]*RunQueueEntry, scriptsRusage map[string]*ScriptRusageEntry) {
	rusageInfo.Lock()
	defer rusageInfo.Unlock()

	rusageInfo.timetableRusage = make(map[uint64]*ScriptRusageEntry)
	rusageInfo.loadEstimate = make(map[string]*ScriptRusageEntry)

	for className, locRows := range rqRows {
		for _, jobRows := range locRows {
			for _, row := range jobRows {
				if _, ok := scriptsRusage[className]; !ok {
					continue
				}

				rusageRow := scriptsRusage[className]
				host := row.hostname

				if _, ok := rusageInfo.loadEstimate[host]; !ok {
					rusageInfo.loadEstimate[host] = &ScriptRusageEntry{
						cpu_usage:  rusageRow.cpu_usage,
						max_memory: rusageRow.max_memory,
					}
				} else {
					rusageInfo.loadEstimate[host].cpu_usage += rusageRow.cpu_usage
					rusageInfo.loadEstimate[host].max_memory += rusageRow.max_memory
				}

				rusageRowCopy := new(ScriptRusageEntry)
				*rusageRowCopy = *rusageRow
				rusageRowCopy.host = host

				rusageInfo.timetableRusage[uint64(row.timetable_id.Int64)] = rusageRowCopy
			}
		}
	}

	// If our estimate is greater than the current server load, take our estimate instead.
	// This should prevent the new script framework from overloading the 'fast' servers with jobs.
	for host, info := range rusageInfo.loadEstimate {
		if _, ok := rusageInfo.hostsInfo[host]; !ok {
			continue
		}

		hi := rusageInfo.hostsInfo[host]
		hi.real_cpu_idle_cores = hi.cpu_idle_cores
		hi.real_mem_free = uint64(hi.mem_free.Int64)
		hi.real_mem_cached = uint64(hi.mem_cached.Int64)
		hi.real_swap_used = uint64(hi.swap_used.Int64)

		currentInfo := rusageInfo.hostsInfo[host]

		// cpu_parasite is relative to the number of cores
		idleCores := float64(currentInfo.cpu_cores)*(1.0-currentInfo.cpu_parasite.Float64) - float64(info.cpu_usage)

		parasiteMemory := getParasiteMemory(currentInfo)
		usedMemory := parasiteMemory + int64(info.max_memory)

		if usedMemory == 0 {
			log.Warningf("Used memory for %s is 0", host)
			continue
		}

		// max_memory can use swap, but parasite memory is resident
		swapRatio := float64(parasiteMemory) / float64(usedMemory)
		memFree := currentInfo.mem_total.Int64 - int64(usedMemory) - int64(float64(currentInfo.swap_used.Int64)*swapRatio)
		currentFreeMem := getFreeMem(currentInfo)

		if idleCores < currentInfo.cpu_idle_cores {
			if idleCores > 0 {
				hi.cpu_idle_cores = idleCores
			} else {
				hi.cpu_idle_cores = 0
			}
		}

		if memFree < currentFreeMem {
			// empty cached and swap, because we use estimated values for memory
			hi.swap_used.Int64 = 0
			hi.mem_cached.Int64 = 0

			if memFree > 0 {
				hi.mem_free.Int64 = memFree
			} else {
				hi.mem_free.Int64 = 0
			}
		}
	}
}
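// To illustrate the memory arithmetic in setLoadEstimates, a small worked example with
// made-up numbers (all values in MiB; units and magnitudes are illustrative only, and
// this function does not exist in the real codebase).
func exampleMemEstimate() {
	const (
		parasiteMemory = 2048.0  // resident memory used outside the framework
		estMaxMemory   = 6144.0  // summed max_memory estimate of queued jobs
		memTotal       = 32768.0 // mem_total of the host
		swapUsed       = 1024.0  // swap_used of the host
	)

	usedMemory := parasiteMemory + estMaxMemory           // 8192
	swapRatio := parasiteMemory / usedMemory               // 0.25: parasite (resident) share of used memory
	memFree := memTotal - usedMemory - swapUsed*swapRatio  // 32768 - 8192 - 256 = 24320

	// If memFree is below getFreeMem(currentInfo), the estimate wins:
	// mem_free is overwritten and mem_cached/swap_used are zeroed.
	_ = memFree
}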