Example #1
func (d *LauncherData) persistFinishAndNotify(row *RunQueueEntry, success bool, prevStatus string) error {
	location, err := getLocationIdx(row.settings.location_type, d.hostname)
	if err != nil {
		log.Warningf("Could not get location idx for row %+v, settings: %+v, reason: %s", row, row.settings, err.Error())
		return err
	}

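	// A zero timetable_id means there is no timetable entry to notify, so just drop the orphaned run queue entry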
	if row.timetable_id.Int64 == 0 {
		err := db.DoInLazyTransaction(func(tx *db.LazyTrx) error {
			return deleteFromRunQueue(tx, []uint64{row.Id}, prevStatus)
		})

		if err != nil {
			log.Warningf("Could not delete incorrectly finished run queue entry in %+v: %s", row, err.Error())
			return err
		}
	} else {
		if err := notifyTTFinished(row.ClassName, location, uint64(row.timetable_id.Int64), row.Id, success, true, prevStatus); err != nil {
			log.Warningf("Could not notify about timetable finish: %s", err.Error())
			return err
		}
	}

	return nil
}
Example #2
func (d *LauncherData) processWaitingInInit() {
	for run_id, cachedRow := range d.initMap {
		_, rawResp, err := d.call(&badoo_phproxyd.RequestCheck{Hash: proto.Uint64(run_id)})

		if shouldHaveFinished(cachedRow) {
			d.logIncorrectFinish(nil, cachedRow, RUN_STATUS_INIT)
			continue
		}

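		// call() already logged the error, so just retry on the next pass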
		if err != nil {
			continue
		}

		row, err := getRunningInfo(run_id)
		if err != nil {
			log.Warnf("Could not get running info from DB in method processWaitingInInit for hostname=%s, class=%s, run id=%d, err: %s", d.hostname, cachedRow.ClassName, run_id, err.Error())
			continue
		}

		if cachedRow.settings_id != row.settings_id {
			log.Warnf("Broken row in cache or db for id=%d, settings_id is different (cache=%d, db=%d)", run_id, cachedRow.settings_id, row.settings_id)
			continue
		}

		row.settings = cachedRow.settings

		switch resp := rawResp.(type) {
		case *badoo_phproxyd.ResponseCheck:
			d.logIncorrectFinish(resp, row, row.RunStatus)
		case *badoo_phproxyd.ResponseGeneric:
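			// the generic response carries a negated error code; flip the sign to compare against the Errno constants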
			result := -badoo_phproxyd.Errno(resp.GetErrorCode())

			if result == badoo_phproxyd.Errno_ERRNO_NOT_FOUND || result == badoo_phproxyd.Errno_ERRNO_ALREADY_RUNNING {
				// log intelligently when the server is so slow that it has not launched the script within 0.5 sec
				if time.Now().Unix()-INIT_TIMEOUT_SEC > row.should_init_ts.Int64 {
					action := KILL_ACTION_NO_ACTION
					if result == badoo_phproxyd.Errno_ERRNO_NOT_FOUND {
						action = KILL_ACTION_LOG_SCRIPT_FINISH_INIT
					}

					d.terminate(row, action)
				} else if result == badoo_phproxyd.Errno_ERRNO_NOT_FOUND {
					d.terminate(row, KILL_ACTION_SET_WAITING)
				}
			} else if result == badoo_phproxyd.Errno_ERRNO_FAILED_FINISHED {
				log.Warningf("Script %s finished with failure at %s", row.ClassName, d.hostname)
				d.logIncorrectFinish(nil, row, RUN_STATUS_INIT)
			} else if result == badoo_phproxyd.Errno_ERRNO_WAIT_FOR_FREE {
				log.Warningf("Waiting in init: Lost results for %s at %s", row.ClassName, d.hostname)
				d.logIncorrectFinish(nil, row, RUN_STATUS_INIT)
			} else {
				log.Warningf("Unexpected return code %d (%s) for check request for class=%s, hostname=%s", result, result, row.ClassName, d.hostname)
			}
		default:
			log.Warningf("Received unexpected result from phproxyd at %s: type %T, result: %v", d.hostname, rawResp, rawResp)
		}
	}
}
Example #3
func (d *LauncherData) processAllRows(rows []*RunQueueEntry) {
	for _, row := range rows {
		if row.settings == nil {
			log.Warningf("Incorrect row in run queue, settings are invalid: %+v", row)
			continue
		}

		el, ok := d.allMap[row.Id]
		if !ok {
			if d.deletedIds[row.Id] == 0 {
				d.addToMaps(row)
			}
			continue
		}

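		// Adopt the DB row when its status outranks the cached one, or when the entry went back to WAITING for another init attempt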
		if statusPriority[row.RunStatus] > statusPriority[el.RunStatus] || (row.RunStatus == RUN_STATUS_WAITING && el.RunStatus == RUN_STATUS_INIT && row.init_attempts > el.init_attempts) {
			d.updateStatus(el, row.RunStatus)
			*el = *row
		}

		// external kill request has come
		if row.max_finished_ts.Valid && row.max_finished_ts.Int64 < el.max_finished_ts.Int64 {
			el.max_finished_ts = row.max_finished_ts
			el.stopped_employee_id = row.stopped_employee_id
		}
	}

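	// Age out deletion markers: decrement each refcount and drop entries once they reach zero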
	for runId, refCount := range d.deletedIds {
		if refCount--; refCount <= 0 {
			delete(d.deletedIds, runId)
		} else {
			d.deletedIds[runId] = refCount
		}
	}
}
Example #4
func (d *LauncherData) acceptWaiting(jobs []*RunQueueEntry) {
	idsToSelect := make([]uint64, 0)
	settingsMap := make(map[uint64]*ScriptSettings)

	for _, row := range jobs {
		if row.settings == nil {
			log.Warningf("Incorrect row in run queue (waitingCh), settings are invalid: %+v", row)
			continue
		}

		if d.allMap[row.Id] != nil {
			continue
		}
		settingsMap[row.Id] = row.settings
		idsToSelect = append(idsToSelect, row.Id)
	}

	rqs, err := getRunningInfos(idsToSelect)
	if err != nil {
		log.Warnf("acceptWaiting could not select run_queue entries: %s", err.Error())
		return
	}
	for _, row := range rqs {
		row.settings = settingsMap[row.Id]
		d.addToMaps(row)
	}

	d.processWaiting()
}
Example #5
func (d *DispatcherData) processNewJobsEv(ev *NewJobs) {
	ids := make([]uint64, 0, len(ev.rows))
	settingsMap := make(map[uint64]*ScriptSettings)

	for _, row := range ev.rows {
		if row.settings == nil {
			log.Warningf("Incorrect row in timetable (processNewJobsEv), settings are invalid: %+v", row)
			continue
		}

		if d.waitingMap[row.id] != nil || d.addedMap[row.id] != nil {
			continue
		}

		settingsMap[row.id] = row.settings
		ids = append(ids, row.id)
	}

	rows, err := selectTimetableByIds(ids)
	if err != nil {
		log.Warnf("could not select tt_enties for ids:%+v err:%s", ids, err.Error())
		return
	}

	for _, row := range rows {
		row.settings = settingsMap[row.id]
	}

	d.acceptNewJobs(rows)
}
Example #6
func (d *LauncherData) addToMaps(row *RunQueueEntry) {
	if row.settings == nil {
		buf := make([]byte, 5000)
		n := runtime.Stack(buf, false)
		log.Warningf("Incorrect row in run queue (addToMaps), settings are invalid: %+v\ntrace:%s", row, buf[0:n])
		return
	}
	if m := d.getMapByStatus(row.RunStatus); m != nil {
		if LAUNCHER_DB_DEBUG {
			log.Printf("RQ row from db: id=%d, class=%s, job_data=%s, hostname=%s", row.Id, row.ClassName, row.JobData, row.hostname)
		}
		if d.allMap[row.Id] != nil {
			log.Warnf("Trying to add already added into run_queue (all map): %+v", row)
			return
		}
		if m[row.Id] != nil {
			log.Warnf("Trying to add already added into run_queue (own map): %+v", row)
			return
		}
		m[row.Id] = row
		d.allMap[row.Id] = row
	} else {
		log.Warnf("Broken run status: %+v", row)
	}
}
Example #7
func (d *LauncherData) call(req proto.Message) (msg_id uint32, resp proto.Message, err error) {
	client := gpbrpc.NewClient(d.hostname+hostSuffix, &badoo_phproxyd.Gpbrpc, &gpbrpc.GpbsCodec, time.Second, time.Second)
	defer client.Close()
	msg_id, resp, err = client.Call(req)
	if err != nil {
		log.Warningf("Call failed for host %s for message %+v, got error: %s", d.hostname, req, err.Error())
	}
	return
}
Example #8
func GenerateJobsCycle() {
	hostname, err := os.Hostname()
	if err != nil {
		log.Fatalf("Could not get hostname: %s", err.Error())
	}

	log.Print("Initial select from RunQueue and starting launcher goroutines")
	if err := selectRQAndNotify(); err != nil {
		log.Fatalf("Could not do initial select run queue: %s", err.Error())
	}

	log.Print("Initial select from Timetable")
	ttRows, err := selectTimetable()
	if err != nil {
		log.Fatalf("Could not do initial select timetable: %s", err.Error())
	}

	log.Print("Starting jobgen goroutines")
	if err := notifyForFullTTSelect(ttRows, false); err != nil {
		log.Fatalf("Could notify about timetable: %s", err.Error())
	}

	for {
		res, err := db.LockCycle(getLockName(), hostname)
		if err != nil || !res {
			if err == nil {
				log.Println("Could not get lock, another host holds it? Retrying in 10 seconds")
			} else {
				log.Warningf("Could not get lock, got DB error: ", err.Error())
			}

			time.Sleep(time.Second * 10)
			continue
		}

		// timer := pinba.TimerStart(map[string]string{"group": "jobgenerator"})
		startTs := time.Now().UnixNano()

		db.LogCycleStart(CYCLE_CLASS_NAME, hostname, 0)
		log.Debug("Cycle started")
		success := doCycle()
		log.Debug("Cycle finished")
		successInt := 1
		if !success {
			successInt = 0
		}
		db.LogCycleStop(CYCLE_CLASS_NAME, hostname, 0, successInt)

		passedMs := int64((time.Now().UnixNano() - startTs) / 1e6)

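		// Sleep off the remainder of the cycle interval so iterations start at a fixed cadence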
		if passedMs < cycleMs {
			time.Sleep(time.Duration(cycleMs-passedMs) * time.Millisecond)
		}
	}
}
Example #9
func runqueuePeriodicSelectThread() {
	timer := time.Tick(time.Millisecond * time.Duration(ttReloadIntervalMs))

	for {
		<-timer

		if err := selectRQAndNotify(); err != nil {
			log.Warningf("Could not perform periodic timetable select: %s", err.Error())
		}
	}
}
Example #10
// select everything from timetable, notify dispatcher threads and start them if needed
func notifyForFullTTSelect(classLocTTRows map[string]map[string][]*TimetableEntry, isExisting bool) error {
	settingsIds := make(map[uint64]bool)

	for _, locTTRows := range classLocTTRows {
		for _, ttRows := range locTTRows {
			for _, row := range ttRows {
				settingsIds[row.settings_id] = true
			}
		}
	}

	err := loadNewIds(settingsIds)
	if err != nil {
		return err
	}

	for className, locTTRows := range classLocTTRows {
		// TODO: ensure that "location" is always the same for the "any" type of script, otherwise the goroutine will
		// receive multiple notifications
		for location, ttRows := range locTTRows {
			anyRows := make([]*TimetableEntry, 0, len(ttRows))
			eachRows := make([]*TimetableEntry, 0, len(ttRows))

			allSettingsMutex.Lock()
			for _, row := range ttRows {
				row.settings = allSettings[row.settings_id]
			}
			allSettingsMutex.Unlock()

			for _, row := range ttRows {
				if row.settings == nil {
					log.Warningf("Incorrect row in timetable, settings are invalid: %+v", row)
					continue
				}

				if row.settings.location_type == LOCATION_TYPE_EACH {
					eachRows = append(eachRows, row)
				} else {
					anyRows = append(anyRows, row)
				}
			}

			if len(anyRows) > 0 {
				notifyAboutNewTTRows(className, DEFAULT_LOCATION_IDX, anyRows, isExisting)
			}

			if len(eachRows) > 0 {
				notifyAboutNewTTRows(className, location, eachRows, isExisting)
			}
		}
	}

	return nil
}
Example #11
func (d *LauncherData) processRunningTooLong() {
	now := time.Now().Unix()

	for run_id, row := range d.runningMap {
		if !row.max_finished_ts.Valid || row.max_finished_ts.Int64 >= now {
			continue
		}

		if shouldHaveFinished(row) {
			d.logIncorrectFinish(nil, row, RUN_STATUS_RUNNING)
			continue
		}

		_, rawResp, err := d.call(&badoo_phproxyd.RequestCheck{Hash: proto.Uint64(run_id)})
		if err != nil {
			log.Warningf("Could not call check at hostname %s: %s", d.hostname, err.Error())
			continue
		}

		switch resp := rawResp.(type) {
		case *badoo_phproxyd.ResponseCheck:
			d.logIncorrectFinish(resp, row, RUN_STATUS_RUNNING)
		case *badoo_phproxyd.ResponseGeneric:
			code := -badoo_phproxyd.Errno(resp.GetErrorCode())

			if code == badoo_phproxyd.Errno_ERRNO_ALREADY_RUNNING {
				d.terminate(row, KILL_ACTION_NO_ACTION)
			} else if code == badoo_phproxyd.Errno_ERRNO_WAIT_FOR_FREE {
				d.logIncorrectFinish(nil, row, RUN_STATUS_RUNNING)
			} else if code == badoo_phproxyd.Errno_ERRNO_NOT_FOUND {
				d.terminate(row, KILL_ACTION_LOG_SCRIPT_FINISH_RUNNING)
			} else {
				log.Warningf("Unexpected error code %d (%s) from phproxyd at %s (process running too long)", code, resp.GetErrorText(), d.hostname)
			}
		default:
			log.Warningf("Received unexpected result from phproxyd at %s: type %T, result: %v", d.hostname, rawResp, rawResp)
		}
	}
}
Example #12
func (p *LazyTrx) Exec(queryTpl string, args ...interface{}) (sql.Result, error) {
	q, err := p.prepareFirstQuery(queryTpl, args...)
	if err != nil {
		return nil, err
	}

	res, err := p.tx.Exec(q)

	if err != nil {
		trace := make([]byte, 8192)
		n := runtime.Stack(trace, false)
		log.Warningf("Failed SQL query:\n'%s',\n\nReason: '%s',\n\nStack trace: %s\n", q, err.Error(), trace[0:n])
	}

	return res, err
}
Example #13
func (d *DispatcherData) acceptNewJobs(jobs []*TimetableEntry) {
	for _, row := range jobs {
		if row.settings == nil {
			buf := make([]byte, 5000)
			n := runtime.Stack(buf, false)
			log.Warningf("Incorrect row in timetable (acceptNewJobs), settings are invalid: %+v\ntrace:%s", row, buf[0:n])
			continue
		}

		// actualization from DB
		if row.added_to_queue_ts.Valid {
			d.addToAdded(row)
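			// The DB already recorded a finish for this row: replay it through processFinished so in-memory counters stay consistent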
			if row.finished_ts.Valid && row.finish_count > 0 {
				row.finish_count--
				d.processFinished(&FinishEvent{timetable_id: row.id,
					success: row.finished_successfully != 0, errorCh: make(chan error, 1)})
			}
		} else {
			d.addToWaiting(row)
		}
	}

	trigger(d.redispatchCh, "redispatch")
}
Example #14
func doCycle() bool {
	var (
		jiRows         map[string]map[string]*JobInfoEntry
		scripts        map[string]*ScriptEntry
		flags          map[string]*FlagEntry
		scriptsRusage  map[string]*ScriptRusageEntry
		classLocTTRows map[string]map[string][]*TimetableEntry
	)

	unifiedStartTs := time.Now().UnixNano()

	startTs := time.Now().UnixNano()
	err := loadFullState(
		&LoadStateFunc{name: "Scripts", fun: func() (err error) { scripts, err = getGroupedScriptsForPlatform(); return }},
		&LoadStateFunc{name: "JobInfo", fun: func() (err error) { jiRows, err = getGroupedJobInfo(); return }},
		&LoadStateFunc{name: "Flags", fun: func() (err error) { flags, err = getFlags(); return }},
		&LoadStateFunc{name: "ScriptsRusage", fun: func() (err error) { scriptsRusage, err = getScriptRusageStats(); return }},
		&LoadStateFunc{name: "ScriptTimetable", fun: func() (err error) { classLocTTRows, err = selectTimetable(); return }})

	if err != nil {
		log.Errorf("Failed to select state in doCycle: %s", err.Error())
		return false
	}

	log.Debugf("Loaded for %.5f sec", float64(time.Now().UnixNano()-startTs)/1e9)

	startTs = time.Now().UnixNano()
	err = loadSettingsFromRows(jiRows, scripts)
	if err != nil {
		log.Errorf("Could not load settings from rows: %s", err.Error())
		return false
	}

	func() {
		allSettingsMutex.Lock()
		defer allSettingsMutex.Unlock()

		for _, row := range scripts {
			row.settings = allSettings[row.settings_id]
		}
	}()

	scriptsMap.Lock()
	scriptsMap.v = scripts
	scriptsMap.Unlock()

	log.Debugf("  Selected %d rows from flags", len(flags))
	log.Debugf("  Selected %d rows from scripts rusage", len(scriptsRusage))
	log.Debugf("Load settings for %.5f sec", float64(time.Now().UnixNano()-startTs)/1e9)

	startTs = time.Now().UnixNano()

	// We should not try to generate jobs for scripts that are not present in Script table
	// But we should not forget settings (e.g. last generation_id) for that script
	for class_name := range jiRows {
		if _, ok := scripts[class_name]; !ok {
			delete(jiRows, class_name)
		}
	}

	log.Debugf("Selected all for %.5f sec", float64(time.Now().UnixNano()-unifiedStartTs)/1e9)

	startTs = time.Now().UnixNano()
	updateLoadEstimates()

	log.Debugf("Load estimates updated for %.5f sec", float64(time.Now().UnixNano()-startTs)/1e9)
	func() {
		rusageInfo.Lock()
		defer rusageInfo.Unlock()
		log.Debugf("Group hosts: %+v", rusageInfo.groupHosts)
	}()

	startTs = time.Now().UnixNano()

	failedLocationsMutex.Lock()
	failedLocations = make(map[string]bool)
	failedLocationsMutex.Unlock()

	success := true

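	// Throttle: spread per-class processing evenly so a full pass over all scripts takes about one second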
	if len(scripts) > 0 {
		throttle.setIntervalCh <- time.Second / time.Duration(len(scripts))
	}

	trigger(throttle.c, "throttle, start of cycle")

	for className, script := range scripts {
		<-throttle.c

		tx := new(db.LazyTrx)
		err := tx.Begin()
		if err != nil {
			log.Errorf("Could not start transaction in job generate: %s", err.Error())
			success = false
			continue
		}

		have := make(map[string]bool)
		locTtRows := classLocTTRows[className]
		if locTtRows != nil {
			for rawLoc, v := range locTtRows {
				loc, err := getLocationIdx(script.settings.location_type, rawLoc)
				if err != nil {
					log.Warningf("Broken settings for class %s: %s", className, err.Error())
					loc = rawLoc
				}
				if len(v) > 0 {
					have[loc] = true
				}
			}
		}

		add_to_timetable, err := generateJobs(tx, className, script.settings, jiRows[className], have, flags[className])

		if err != nil {
			log.Errorf("Could generate jobs for class %s: %s", className, err.Error())
			tx.Rollback()
			success = false
			continue
		}

		err = tx.Commit()
		if err != nil {
			log.Errorf("Could not commit generate jobs for class %s: %s", className, err.Error())
			success = false
			continue
		}

		per_location := make(map[string][]*TimetableEntry)

		for _, row := range add_to_timetable {
			allSettingsMutex.Lock()
			row.settings = allSettings[row.settings_id]
			allSettingsMutex.Unlock()

			if row.settings == nil {
				log.Warningf("Internal inconsistency error: Invalid settings for generated row: %+v", row)
				continue
			}

			key := DEFAULT_LOCATION_IDX
			if row.settings.location_type == LOCATION_TYPE_EACH {
				key = row.location
			}

			if _, ok := per_location[key]; !ok {
				per_location[key] = make([]*TimetableEntry, 0)
			}

			per_location[key] = append(per_location[key], row)
		}

		for location, rows := range per_location {
			notifyAboutNewTTRows(className, location, rows, true)
		}
	}

	if err := notifyForFullTTSelect(classLocTTRows, true); err != nil {
		log.Errorf("Could not notify about timetable: %s", err.Error())
	}

	log.Debugf("Processed %d classes for %.5f sec", len(scripts), float64(time.Now().UnixNano()-startTs)/1e9)
	log.Debugf("Total %.5f sec", float64(time.Now().UnixNano()-unifiedStartTs)/1e9)

	return success
}
Example #15
// haveTTRows must be nil if there are no timetable entries for any location,
// otherwise it must contain only true entries like map["location"] => true.
// Job generation could probably be simplified; it is just the way it is.
func generateJobs(tx *db.LazyTrx, className string, settings *ScriptSettings, jiRows map[string]*JobInfoEntry, haveTTRows map[string]bool, flags *FlagEntry) (add_to_timetable []*TimetableEntry, err error) {
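	// An empty haveTTRows map means the same as nil: no timetable entries for any location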
	if haveTTRows != nil && len(haveTTRows) == 0 {
		haveTTRows = nil
	}

	now := time.Now().Unix()

	add_to_timetable = make([]*TimetableEntry, 0)
	add_job_info := make([]*JobInfoEntry, 0)
	set_finish_jobs := make([]string, 0)
	set_init_jobs := make([]string, 0)
	set_jobs_generated_js := make([]string, 0)
	prepare_next_generation := make([]NextGenParams, 0)

	have_finish_jobs := settings.jobs.Have_finish_jobs
	is_any := (settings.location_type == LOCATION_TYPE_ANY)
	is_temporary := settings.jobs.Temporary
	temporary_can_run := false

	if flags != nil {
		if flags.kill_requested_ts.Valid {
			is_done := (haveTTRows == nil)
			if is_done {
				log.Printf("Class %s is done, all is ok", className)

				if !flags.killed_ts.Valid {
					tx.AddCommitCallback(func() { continueDispatchAfterKill(className) })
					if err = setKilledFlag(tx, className); err != nil {
						return
					}

					if err = prepareNextGeneration(tx, have_finish_jobs, className, settings); err != nil {
						return
					}
				}
			} else {
				log.Printf("Class %s is not done", className)

				startKilling(className)

				// not the best place to put it, but it works
				if err = setMaxFinishedTs(tx, className, flags.kill_request_employee_id.Int64, flags.kill_requested_ts.Int64); err != nil {
					return
				}
			}

			return
		}

		// Stop generating new job generations when we are on pause
		if flags.pause_requested_ts.Valid {
			is_done := generationFinished(className, haveTTRows, jiRows, settings)

			if is_done && !flags.paused_ts.Valid {
				if err = setPausedFlag(tx, className); err != nil {
					return
				}

				flags.paused_ts = sql.NullInt64{Int64: now, Valid: true}
			}

			if !is_any || flags.paused_ts.Valid {
				return
			}
		}

		if is_temporary && flags.run_requested_ts.Valid && is_any {
			// We accepted the run request, which means that we have already generated jobs
			if flags.run_accepted_ts.Valid {
				if generationFinished(className, haveTTRows, jiRows, settings) {
					if err = resetRunRequest(tx, className); err != nil {
						return
					}

					if err = prepareNextGeneration(tx, have_finish_jobs, className, settings); err != nil {
						return
					}

					return
				}
			} else {
				if err = setRunAccepted(tx, className); err != nil {
					return
				}
			}

			temporary_can_run = true
		}
	}

	if (is_temporary && !temporary_can_run) || settings.jobs.Type == JOBS_TYPE_NONE {
		return
	}

	locations := make([]string, 0)

	if !is_any {
		all_locations := getLocations(settings)
		timetable_locations := make(map[string]bool)

		if haveTTRows != nil {
			for location := range haveTTRows {
				timetable_locations[location] = true
			}
		}

		// there can be failed hosts that are still running: we must really compare host names, not just counts
		for _, loc := range all_locations {
			if _, ok := timetable_locations[loc]; !ok {
				locations = append(locations, loc)
			}
		}

		if len(locations) == 0 {
			return
		}
	} else {
		if haveTTRows != nil && len(haveTTRows) > 0 {
			return
		}

		locations = getLocations(settings)
	}

	tt_location_type := LOCATION_TYPE_EACH
	if is_any {
		tt_location_type = LOCATION_TYPE_ANY
	}

	for _, location := range locations {
		job_info_key, gliErr := getLocationIdx(tt_location_type, location)
		if gliErr != nil {
			log.Warningf("Error getting location index for %s for location_type %s and location %s: %s", className, tt_location_type, location, gliErr.Error())
			continue
		}

		var row *JobInfoEntry

		if jiRows == nil || jiRows[job_info_key] == nil {
			row = &JobInfoEntry{generation_id: 0,
				class_name:           className,
				location:             job_info_key,
				next_generate_job_ts: sql.NullInt64{Int64: int64(getNextJobGenerateTs(className, true, 0, settings)), Valid: true},
				settings_id:          settings.id}

			add_job_info = append(add_job_info, row)
		} else {
			row = jiRows[job_info_key]
		}

		tt_row := &TimetableEntry{
			class_name:            className,
			default_retry:         settings.retry_job,
			repeat:                settings.repeat_job,
			method:                METHOD_RUN,
			finished_successfully: 0,
			generation_id:         sql.NullInt64{Int64: int64(row.generation_id), Valid: true},
			settings_id:           row.settings_id,
			location:              location,
			created:               uint64(now),
		}

		tt_row.NextLaunchTs.Valid = true
		tt_row.NextLaunchTs.Int64 = now

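		// Jobs for the current generation already exist: either schedule finish jobs or prepare the next generation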
		if row.jobs_generated_ts.Valid || row.init_jobs_ts.Valid {
			if have_finish_jobs && !row.finish_jobs_ts.Valid {
				set_finish_jobs = append(set_finish_jobs, job_info_key)

				tt_row.JobData = `"finishJobs"`
				tt_row.method = METHOD_FINISH_JOBS
				tt_row.default_retry = settings.retry_job

				add_to_timetable = append(add_to_timetable, tt_row)
			} else {
				prepare_next_generation = append(prepare_next_generation, NextGenParams{Location: job_info_key, JobInfo: row})
			}

			continue
		} else if row.next_generate_job_ts.Int64 > now {
			continue
		}

		if settings.jobs.Type == JOBS_TYPE_CUSTOM {
			set_init_jobs = append(set_init_jobs, job_info_key)

			tt_row.JobData = `"initJobs"`
			tt_row.method = METHOD_INIT_JOBS
			tt_row.default_retry = uint32(settings.retry.Int64)

			add_to_timetable = append(add_to_timetable, tt_row)
			continue
		}

		jobs, mjlErr := makeJobsList(settings.jobs, settings.instance_count, className)
		if mjlErr != nil {
			log.Warningf("Error generating jobs for %+v with instance_count=%d and jobs=%s: %s", className, settings.instance_count, settings.jobs, mjlErr.Error())
			continue
		}

		for _, job := range jobs {
			tt_row_copy := new(TimetableEntry)
			*tt_row_copy = *tt_row
			tt_row_copy.JobData = job
			add_to_timetable = append(add_to_timetable, tt_row_copy)
		}

		set_jobs_generated_js = append(set_jobs_generated_js, job_info_key)
	}

	if err = addJobInfo(tx, add_job_info); err != nil {
		return
	}

	if err = setFinishJobsTs(tx, className, set_finish_jobs); err != nil {
		return
	}

	if err = batchPrepareNextGeneration(tx, have_finish_jobs, className, prepare_next_generation, settings); err != nil {
		return
	}

	if err = setInitJobsTs(tx, className, set_init_jobs); err != nil {
		return
	}

	if err = setJobsGeneratedTs(tx, className, set_jobs_generated_js); err != nil {
		return
	}

	if err = addToTimetable(tx, add_to_timetable); err != nil {
		return
	}

	return
}
Example #16
func (d *DispatcherData) redispatch() {
	returnToWaitingList := make([]*TimetableEntry, 0)
	defer func() {
		for _, row := range returnToWaitingList {
			d.addToWaiting(row)
		}
	}()

	now := uint64(time.Now().Unix())

	newRqList := make([]*RunQueueEntry, 0)
	toDeleteFromWaitingList := make([]*TimetableEntry, 0)

	for l := d.waitingList.Len(); l > 0; l-- {
		row := heap.Pop(&d.waitingList).(*TimetableEntry)
		delete(d.waitingMap, row.id)

		if d.killRequest != nil {
			toDeleteFromWaitingList = append(toDeleteFromWaitingList, row)
			continue
		}

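		// waitingList is a heap ordered by NextLaunchTs, so the first not-yet-due row tells us when to wake up again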
		if uint64(row.NextLaunchTs.Int64) > now {
			d.tickRedispatchCh = time.After(time.Second * time.Duration(uint64(row.NextLaunchTs.Int64)-now))
			returnToWaitingList = append(returnToWaitingList, row)
			break
		}

		if len(d.addedMap) >= row.settings.instance_count {
			returnToWaitingList = append(returnToWaitingList, row)
			break
		}

		if _, ok := d.addedJobData[row.JobData]; ok {
			if !row.reportedDup {
				log.Warningf("Duplicate job %s for class %s and location %s", row.JobData, d.className, row.location)
				row.reportedDup = true
			}

			returnToWaitingList = append(returnToWaitingList, row)
			continue
		}

		if row.method == METHOD_RUN && row.settings.ttl > 0 && now > row.created+uint64(row.settings.ttl) {
			if row.finish_count == 0 {
				log.Warningf("Job expired before being run even once: job %s for class %s and location %s", row.JobData, d.className, row.location)
			}
			toDeleteFromWaitingList = append(toDeleteFromWaitingList, row)
			continue
		}

		// do not try to dispatch next ones if selectHostname failed, and do not forget to return the row as well
		hostname, err := selectHostname(row.location, row.settings.location_type, d.rusage.cpu_usage, d.rusage.max_memory)
		if err != nil {
			logFailedLocation(row.settings, row.location, err.Error())
			d.tickRedispatchCh = time.After(time.Second)
			returnToWaitingList = append(returnToWaitingList, row)
			break
		} else {
			settings := row.settings
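			// Recently created developer-owned scripts are pinned to the debug host for DEVELOPER_CUSTOM_PATH_TIMEOUT seconds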
			if settings.location_type == LOCATION_TYPE_ANY && (settings.developer.String != "") && (settings.developer.String != "wwwrun") && ((now - uint64(settings.created)) <= DEVELOPER_CUSTOM_PATH_TIMEOUT) {
				hostname = DEVELOPER_DEBUG_HOSTNAME
			}
			log.Debugln("Selected ", hostname, " for ", row.location, " (loc_type=", settings.location_type, ")")
		}

		nullNow := sql.NullInt64{Valid: true, Int64: int64(now)}

		queueRow := &RunQueueEntry{
			ClassName:      d.className,
			timetable_id:   sql.NullInt64{Valid: true, Int64: int64(row.id)},
			generation_id:  row.generation_id,
			hostname:       hostname,
			hostname_idx:   getHostnameIdx(hostname),
			JobData:        row.JobData,
			method:         row.method,
			created:        nullNow,
			RunStatus:      RUN_STATUS_WAITING,
			waiting_ts:     nullNow,
			should_init_ts: nullNow,
			token:          row.token,
			retry_attempt:  row.retry_count,
			settings_id:    row.settings_id,
			settings:       row.settings,
		}

		newRqList = append(newRqList, queueRow)

		row.added_to_queue_ts.Valid = true
		row.added_to_queue_ts.Int64 = int64(now)

		d.addToAdded(row)
	}

	err := db.DoInLazyTransaction(func(tx *db.LazyTrx) error {
		return addToQueueAndDeleteExpired(tx, newRqList, toDeleteFromWaitingList)
	})

	if err == nil {
		for _, row := range toDeleteFromWaitingList {
			d.deletedIds[row.id] = DELETE_IDS_KEEP_GENERATIONS
		}

		// all rows can expire by TTL in this loop, so check whether that is the case and notify the job generator about it
		if len(toDeleteFromWaitingList) > 0 {
			d.checkZero("redispatch")
		}

		if len(newRqList) > 0 {
			perHost := make(map[string][]*RunQueueEntry)
			for _, row := range newRqList {
				perHost[row.hostname] = append(perHost[row.hostname], row)
			}

			for hostname, rows := range perHost {
				notifyAboutNewRQRows(hostname, rows, false)
			}
		}

		return
	}

	d.tickRedispatchCh = time.After(time.Second)

	// restore internal structures back in case of error
	log.Warnf("Could not add to run queue for class %s and location %s to database: %s", d.className, d.location, err.Error())

	for _, rqRow := range newRqList {
		row, ok := d.addedMap[uint64(rqRow.timetable_id.Int64)]

		if ok {
			row.added_to_queue_ts.Valid = false
			row.added_to_queue_ts.Int64 = 0
			row.id = uint64(rqRow.timetable_id.Int64)
			d.removeFromAdded(row)
			d.addToWaiting(row)
		} else {
			log.Warnf("Internal consistency error: could not find row with timetable id %d", rqRow.timetable_id)
		}
	}
}
Example #17
// process finished:
// 1. send an error to ev.errorCh, nil if all is ok
// 2. restore state upon failure
func (d *DispatcherData) processFinished(ev *FinishEvent) {
	var err error
	defer func() { ev.errorCh <- err }()

	row, ok := d.addedMap[ev.timetable_id]
	if !ok {
		if rowWaiting, ok := d.waitingMap[ev.timetable_id]; ok {
			log.Warningf("Got 'finished' event about waiting timetable_id: %d, class=%s, location=%s, row=%+v", ev.timetable_id, d.className, d.location, rowWaiting)
			err = fmt.Errorf("timetable id is waiting: %d, class=%s, location=%s", ev.timetable_id, d.className, d.location)
		} else {
			log.Warningf("Got 'finished' event about unknown timetable_id: %d, class=%s, location=%s", ev.timetable_id, d.className, d.location)
			err = fmt.Errorf("Unknown timetable id: %d, class=%s, location=%s", ev.timetable_id, d.className, d.location)
		}
		return
	}

	now := uint64(time.Now().Unix())

	// restore everything in case of error
	rowCopy := *row
	defer func() {
		if err != nil {
			log.Warnf("Restoring old tt row (error: %s) from %+v => %+v", err.Error(), row, rowCopy)
			*row = rowCopy
		} else {
			// TODO: update rusage estimation
		}
	}()

	if !ev.isInitial {
		if ev.success {
			row.finished_successfully = 1
		} else {
			row.finished_successfully = 0
		}

		row.finish_count++
		if !ev.success {
			row.retry_count++
		} else {
			row.retry_count = 0
		}
	}

	row.NextLaunchTs.Valid = false
	row.NextLaunchTs.Int64 = 0

	var ttl uint32
	if row.method == METHOD_RUN {
		ttl = row.settings.ttl
	}

	// do not delete entries that have ttl > 0 and have hit max retries: the "repeat" field can still reschedule them
	shouldDelete := d.killRequest != nil ||
		(ttl == 0 && (ev.success || row.retry_count >= row.settings.max_retries)) ||
		(ttl > 0 && now > row.created+uint64(ttl))

	cb := func(tx *db.LazyTrx) error {
		var err error

		if ev.run_id != 0 {
			if ev.deleteRq {
				err = deleteFromRunQueue(tx, []uint64{ev.run_id}, ev.prevStatus)
			} else {
				err = errors.New("unexpected deleteRq value")
			}

			if err != nil {
				return err
			}
		}

		if shouldDelete {
			return deleteAddedFromTimetable(tx, []uint64{ev.timetable_id})
		}

		return logTTFinish(tx, row, ev.havePrevFinishCount, ev.prevFinishCount)
	}

	if shouldDelete {
		if err = db.DoInLazyTransaction(cb); err == nil {
			if row.id != ev.timetable_id {
				log.Warnf("Inconsistency of addedMap[%d] = row = %+v", ev.timetable_id, row)
				row.id = ev.timetable_id
			}
			d.removeFromAdded(row)
			d.deletedIds[ev.timetable_id] = DELETE_IDS_KEEP_GENERATIONS

			d.checkZero("processFinished")

			trigger(d.redispatchCh, "redispatch")
		} else {
			log.Warnf("could not process finished: %s", err.Error())
		}

		return
	}

	next_launch_ts := int64(now)

	if ev.success && row.added_to_queue_ts.Valid {
		next_launch_ts = row.added_to_queue_ts.Int64 + row.repeat.Int64
	} else if !ev.success {
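		// Back off on failure: the first three retries use default_retry as-is, then the delay grows as 2x, 4x, 8x of it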
		if row.retry_count < 3 {
			next_launch_ts += int64(row.default_retry)
		} else {
			e := row.retry_count - 2
			if e >= 3 {
				e = 3
			}

			next_launch_ts += (1 << e) * int64(row.default_retry)
		}
	}

	row.NextLaunchTs.Valid = true
	row.NextLaunchTs.Int64 = next_launch_ts

	row.finished_ts.Valid = false
	row.finished_ts.Int64 = 0

	row.added_to_queue_ts.Valid = false
	row.added_to_queue_ts.Int64 = 0

	if err = db.DoInLazyTransaction(cb); err == nil {
		d.removeFromAdded(row)
		d.addToWaiting(row)

		trigger(d.redispatchCh, "redispatch")
	}
}
Example #18
func setLoadEstimates(rqRows map[string]map[string]map[string]*RunQueueEntry, scriptsRusage map[string]*ScriptRusageEntry) {
	rusageInfo.Lock()
	defer rusageInfo.Unlock()

	rusageInfo.timetableRusage = make(map[uint64]*ScriptRusageEntry)
	rusageInfo.loadEstimate = make(map[string]*ScriptRusageEntry)

	for className, locRows := range rqRows {
		for _, jobRows := range locRows {
			for _, row := range jobRows {
				if _, ok := scriptsRusage[className]; !ok {
					continue
				}

				rusageRow := scriptsRusage[className]
				host := row.hostname

				if _, ok := rusageInfo.loadEstimate[host]; !ok {
					rusageInfo.loadEstimate[host] = &ScriptRusageEntry{
						cpu_usage:  rusageRow.cpu_usage,
						max_memory: rusageRow.max_memory,
					}
				} else {
					rusageInfo.loadEstimate[host].cpu_usage += rusageRow.cpu_usage
					rusageInfo.loadEstimate[host].max_memory += rusageRow.max_memory
				}

				rusageRowCopy := new(ScriptRusageEntry)
				*rusageRowCopy = *rusageRow
				rusageRowCopy.host = host

				rusageInfo.timetableRusage[uint64(row.timetable_id.Int64)] = rusageRowCopy
			}
		}
	}

	// If our estimate is greater than the current server load, take our estimate.
	// This should keep the new script framework from overloading the 'fast' servers with jobs.
	for host, info := range rusageInfo.loadEstimate {
		if _, ok := rusageInfo.hostsInfo[host]; !ok {
			continue
		}

		hi := rusageInfo.hostsInfo[host]

		hi.real_cpu_idle_cores = hi.cpu_idle_cores
		hi.real_mem_free = uint64(hi.mem_free.Int64)
		hi.real_mem_cached = uint64(hi.mem_cached.Int64)
		hi.real_swap_used = uint64(hi.swap_used.Int64)

		currentInfo := rusageInfo.hostsInfo[host]
		// cpu_parasite is relative to number of cores
		idleCores := float64(currentInfo.cpu_cores)*(1.0-currentInfo.cpu_parasite.Float64) - float64(info.cpu_usage)

		parasiteMemory := getParasiteMemory(currentInfo)

		usedMemory := parasiteMemory + int64(info.max_memory)
		if usedMemory == 0 {
			log.Warningf("Used memory for %s is 0", host)
			continue
		}

		// max_memory can use swap, but parasite memory is resident
		swapRatio := float64(parasiteMemory) / float64(usedMemory)

		memFree := currentInfo.mem_total.Int64 - int64(usedMemory) - int64(float64(currentInfo.swap_used.Int64)*swapRatio)
		currentFreeMem := getFreeMem(currentInfo)

		if idleCores < currentInfo.cpu_idle_cores {
			if idleCores > 0 {
				hi.cpu_idle_cores = idleCores
			} else {
				hi.cpu_idle_cores = 0
			}
		}

		if memFree < currentFreeMem {
			// empty cached and swap, because we use estimated values for memory
			hi.swap_used.Int64 = 0
			hi.mem_cached.Int64 = 0
			if memFree > 0 {
				hi.mem_free.Int64 = memFree
			} else {
				hi.mem_free.Int64 = 0
			}
		}
	}
}