// persistFinishAndNotify deletes an orphaned run queue row (one without a
// timetable id) or, if the row belongs to a timetable entry, notifies the
// timetable dispatcher that the entry has finished.
func (d *LauncherData) persistFinishAndNotify(row *RunQueueEntry, success bool, prevStatus string) error {
    location, err := getLocationIdx(row.settings.location_type, d.hostname)
    if err != nil {
        log.Warningf("Could not get location idx for row %+v, settings: %+v, reason: %s", row, row.settings, err.Error())
        return err
    }

    if row.timetable_id.Int64 == 0 {
        err := db.DoInLazyTransaction(func(tx *db.LazyTrx) error {
            return deleteFromRunQueue(tx, []uint64{row.Id}, prevStatus)
        })

        if err != nil {
            log.Warningf("Could not delete incorrectly finished run queue entry in %+v: %s", row, err.Error())
            return err
        }
    } else {
        if err := notifyTTFinished(row.ClassName, location, uint64(row.timetable_id.Int64), row.Id, success, true, prevStatus); err != nil {
            log.Warningf("Could not notify about timetable finish: %s", err.Error())
            return err
        }
    }

    return nil
}
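
// processFinished sends a free request to phproxyd for every finished run,
// deletes the corresponding run queue rows in a single transaction and then
// removes the entries from the in-memory maps.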
func (d *LauncherData) processFinished() {
    var finishedIds []uint64
    var err error

    for run_id := range d.finishedMap {
        d.call(&badoo_phproxyd.RequestFree{Hash: proto.Uint64(run_id)})
        finishedIds = append(finishedIds, run_id)
    }

    if len(finishedIds) == 0 {
        return
    }

    sort.Sort(common.UInt64Slice(finishedIds))

    err = db.DoInLazyTransaction(func(tx *db.LazyTrx) error {
        return deleteFromRunQueue(tx, finishedIds, RUN_STATUS_FINISHED)
    })

    if err != nil {
        log.Errorf("Could not delete rows from run queue for hostname=%s: %s", d.hostname, err.Error())
        return
    }

    for _, v := range d.finishedMap {
        d.delFromMaps(v.Id)
    }
}
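
// processUpdateStatusRequest checks that the referenced run queue row exists
// and that its current status matches req.PrevStatus, persists the new status
// in a transaction and then updates the in-memory state. The result (nil on
// success) is always sent back on req.errCh.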
func (d *LauncherData) processUpdateStatusRequest(req *LauncherUpdateStatusRequest) {
    var err error
    defer func() { req.errCh <- err }()

    el := d.allMap[req.RunId]
    if el == nil {
        err = fmt.Errorf("No such rq row id=%d", req.RunId)
        return
    }

    if el.RunStatus != req.PrevStatus {
        err = fmt.Errorf("Previous status mismatch for rq row id=%d: req.prev=%s, actual=%s", req.RunId, req.PrevStatus, el.RunStatus)
        return
    }

    err = db.DoInLazyTransaction(func(tx *db.LazyTrx) error {
        return updateRunStatus(tx, req.RunId, req.Status, req.PrevStatus)
    })

    if err != nil {
        log.Errorf("Could not update run status of run_id=%d to %s: %s", req.RunId, req.Status, err.Error())
        return
    }

    d.updateStatus(el, req.Status)
}
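
// redispatch pops entries from the waiting heap and tries to schedule them:
// rows are deleted when a kill request is pending or their TTL has expired,
// put back when they are not due yet, exceed the instance count, duplicate an
// already added job or no hostname can be selected, and otherwise turned into
// run queue rows. All queue inserts and deletions are persisted in one
// transaction; if it fails, the in-memory structures are restored.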
func (d *DispatcherData) redispatch() {
    returnToWaitingList := make([]*TimetableEntry, 0)
    defer func() {
        for _, row := range returnToWaitingList {
            d.addToWaiting(row)
        }
    }()

    now := uint64(time.Now().Unix())

    newRqList := make([]*RunQueueEntry, 0)
    toDeleteFromWaitingList := make([]*TimetableEntry, 0)

    for l := d.waitingList.Len(); l > 0; l-- {
        row := heap.Pop(&d.waitingList).(*TimetableEntry)
        delete(d.waitingMap, row.id)

        if d.killRequest != nil {
            toDeleteFromWaitingList = append(toDeleteFromWaitingList, row)
            continue
        }

        if uint64(row.NextLaunchTs.Int64) > now {
            d.tickRedispatchCh = time.After(time.Second * time.Duration(uint64(row.NextLaunchTs.Int64)-now))
            returnToWaitingList = append(returnToWaitingList, row)
            break
        }

        if len(d.addedMap) >= row.settings.instance_count {
            returnToWaitingList = append(returnToWaitingList, row)
            break
        }

        if _, ok := d.addedJobData[row.JobData]; ok {
            if !row.reportedDup {
                log.Warningf("Duplicate job %s for class %s and location %s", row.JobData, d.className, row.location)
                row.reportedDup = true
            }
            returnToWaitingList = append(returnToWaitingList, row)
            continue
        }

        if row.method == METHOD_RUN && row.settings.ttl > 0 && now > row.created+uint64(row.settings.ttl) {
            if row.finish_count == 0 {
                log.Warningf("Job expired before being run even once: job %s for class %s and location %s", row.JobData, d.className, row.location)
            }
            toDeleteFromWaitingList = append(toDeleteFromWaitingList, row)
            continue
        }

        // do not try to dispatch next ones if selectHostname failed, and do not forget to return the row as well
        hostname, err := selectHostname(row.location, row.settings.location_type, d.rusage.cpu_usage, d.rusage.max_memory)
        if err != nil {
            logFailedLocation(row.settings, row.location, err.Error())
            d.tickRedispatchCh = time.After(time.Second)
            returnToWaitingList = append(returnToWaitingList, row)
            break
        } else {
            settings := row.settings
            if settings.location_type == LOCATION_TYPE_ANY && (settings.developer.String != "") && (settings.developer.String != "wwwrun") && ((now - uint64(settings.created)) <= DEVELOPER_CUSTOM_PATH_TIMEOUT) {
                hostname = DEVELOPER_DEBUG_HOSTNAME
            }
            log.Debugln("Selected ", hostname, " for ", row.location, " (loc_type=", settings.location_type, ")")
        }

        nullNow := sql.NullInt64{Valid: true, Int64: int64(now)}

        queueRow := &RunQueueEntry{
            ClassName:      d.className,
            timetable_id:   sql.NullInt64{Valid: true, Int64: int64(row.id)},
            generation_id:  row.generation_id,
            hostname:       hostname,
            hostname_idx:   getHostnameIdx(hostname),
            JobData:        row.JobData,
            method:         row.method,
            created:        nullNow,
            RunStatus:      RUN_STATUS_WAITING,
            waiting_ts:     nullNow,
            should_init_ts: nullNow,
            token:          row.token,
            retry_attempt:  row.retry_count,
            settings_id:    row.settings_id,
            settings:       row.settings,
        }

        newRqList = append(newRqList, queueRow)

        row.added_to_queue_ts.Valid = true
        row.added_to_queue_ts.Int64 = int64(now)

        d.addToAdded(row)
    }

    err := db.DoInLazyTransaction(func(tx *db.LazyTrx) error {
        return addToQueueAndDeleteExpired(tx, newRqList, toDeleteFromWaitingList)
    })

    if err == nil {
        for _, row := range toDeleteFromWaitingList {
            d.deletedIds[row.id] = DELETE_IDS_KEEP_GENERATIONS
        }

        // all rows can expire by TTL in this loop, so check if that is the case and notify the job generator about it
        if len(toDeleteFromWaitingList) > 0 {
            d.checkZero("redispatch")
        }

        if len(newRqList) > 0 {
            perHost := make(map[string][]*RunQueueEntry)
            for _, row := range newRqList {
                perHost[row.hostname] = append(perHost[row.hostname], row)
            }

            for hostname, rows := range perHost {
                notifyAboutNewRQRows(hostname, rows, false)
            }
        }

        return
    }

    // restore internal structures back in case of error
    d.tickRedispatchCh = time.After(time.Second)
    log.Warnf("Could not add run queue rows for class %s and location %s to the database: %s", d.className, d.location, err.Error())

    for _, rqRow := range newRqList {
        row, ok := d.addedMap[uint64(rqRow.timetable_id.Int64)]
        if ok {
            row.added_to_queue_ts.Valid = false
            row.added_to_queue_ts.Int64 = 0
            row.id = uint64(rqRow.timetable_id.Int64)

            d.removeFromAdded(row)
            d.addToWaiting(row)
        } else {
            log.Warnf("Internal consistency error: could not find row with timetable id %d", rqRow.timetable_id.Int64)
        }
    }
}
// process finished:
// 1. send an error to ev.errorCh, nil if all is ok
// 2. restore state upon failure
func (d *DispatcherData) processFinished(ev *FinishEvent) {
    var err error
    defer func() { ev.errorCh <- err }()

    row, ok := d.addedMap[ev.timetable_id]
    if !ok {
        if rowWaiting, ok := d.waitingMap[ev.timetable_id]; ok {
            log.Warningf("Got 'finished' event about waiting timetable_id: %d, class=%s, location=%s, row=%+v", ev.timetable_id, d.className, d.location, rowWaiting)
            err = fmt.Errorf("timetable id is waiting: %d, class=%s, location=%s", ev.timetable_id, d.className, d.location)
        } else {
            log.Warningf("Got 'finished' event about unknown timetable_id: %d, class=%s, location=%s", ev.timetable_id, d.className, d.location)
            err = fmt.Errorf("Unknown timetable id: %d, class=%s, location=%s", ev.timetable_id, d.className, d.location)
        }
        return
    }

    now := uint64(time.Now().Unix())

    // restore everything in case of error
    rowCopy := *row
    defer func() {
        if err != nil {
            log.Warnf("Restoring old tt row (error: %s) from %+v => %+v", err.Error(), row, rowCopy)
            *row = rowCopy
        } else {
            // TODO: update rusage estimation
        }
    }()

    if !ev.isInitial {
        if ev.success {
            row.finished_successfully = 1
        } else {
            row.finished_successfully = 0
        }

        row.finish_count++

        if !ev.success {
            row.retry_count++
        } else {
            row.retry_count = 0
        }
    }

    row.NextLaunchTs.Valid = false
    row.NextLaunchTs.Int64 = 0

    var ttl uint32
    if row.method == METHOD_RUN {
        ttl = row.settings.ttl
    }

    // we should not delete entries that have ttl > 0 and have hit max retries because there is "repeat" field still
    shouldDelete := d.killRequest != nil ||
        (ttl == 0 && (ev.success || row.retry_count >= row.settings.max_retries)) ||
        (ttl > 0 && now > row.created+uint64(ttl))

    cb := func(tx *db.LazyTrx) error {
        var err error

        if ev.run_id != 0 {
            if ev.deleteRq {
                err = deleteFromRunQueue(tx, []uint64{ev.run_id}, ev.prevStatus)
            } else {
                err = errors.New("unexpected deleteRq value")
            }

            if err != nil {
                return err
            }
        }

        if shouldDelete {
            return deleteAddedFromTimetable(tx, []uint64{ev.timetable_id})
        }

        return logTTFinish(tx, row, ev.havePrevFinishCount, ev.prevFinishCount)
    }

    if shouldDelete {
        if err = db.DoInLazyTransaction(cb); err == nil {
            if row.id != ev.timetable_id {
                log.Warnf("Inconsistency of addedMap[%d] = row = %+v", ev.timetable_id, row)
                row.id = ev.timetable_id
            }

            d.removeFromAdded(row)
            d.deletedIds[ev.timetable_id] = DELETE_IDS_KEEP_GENERATIONS
            d.checkZero("processFinished")
            trigger(d.redispatchCh, "redispatch")
        } else {
            log.Warnf("could not process finished: %s", err.Error())
        }

        return
    }

    next_launch_ts := int64(now)

    if ev.success && row.added_to_queue_ts.Valid {
        next_launch_ts = row.added_to_queue_ts.Int64 + row.repeat.Int64
    } else if !ev.success {
        if row.retry_count < 3 {
            next_launch_ts += int64(row.default_retry)
        } else {
            e := row.retry_count - 2
            if e >= 3 {
                e = 3
            }
            next_launch_ts += (1 << e) * int64(row.default_retry)
        }
    }

    row.NextLaunchTs.Valid = true
    row.NextLaunchTs.Int64 = next_launch_ts

    row.finished_ts.Valid = false
    row.finished_ts.Int64 = 0

    row.added_to_queue_ts.Valid = false
    row.added_to_queue_ts.Int64 = 0

    if err = db.DoInLazyTransaction(cb); err == nil {
        d.removeFromAdded(row)
        d.addToWaiting(row)
        trigger(d.redispatchCh, "redispatch")
    }
}
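
// A worked example of the retry backoff computed in processFinished above
// (illustrative numbers only): with retry_count incremented once per
// non-initial failure and default_retry = 60 seconds, the first two retries
// are scheduled 60s after the failure, the third 120s, the fourth 240s, and
// every later one 480s, because e is clamped to 3 (a factor of at most
// 1<<3 = 8).
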
func (d *LauncherData) processWaiting() {
    // invalidEntries := make([]uint64, 0)
    var rawResp proto.Message

    for run_id, row := range d.waitingMap {
        err := db.DoInLazyTransaction(func(tx *db.LazyTrx) error {
            return setRunStatusToInit(tx, run_id, row.settings.max_time)
        })

        if err != nil {
            log.Errorf("Could not update run status of run_id=%d to %s: %s", run_id, RUN_STATUS_INIT, err.Error())
            return
        }

        // TODO: add host unreachable check

        d.updateStatus(row, RUN_STATUS_INIT)
        row.max_finished_ts.Int64 = row.created.Int64 + int64(row.settings.max_time)
        row.max_finished_ts.Valid = true

        script := getScriptPath(row.settings)

        params := []string{
            fmt.Sprintf("--id=%d", row.Id),
            row.ClassName,
            fmt.Sprintf("--instance-count=%d", row.settings.instance_count),
            fmt.Sprintf("--settings-id=%d", row.settings_id),
            fmt.Sprintf("--method=%s", row.method),
            fmt.Sprintf("--token=%s", row.token),
            fmt.Sprintf("--retry-attempt=%d", row.retry_attempt),
            fmt.Sprintf("--max-retries=%d", row.settings.max_retries),
            fmt.Sprintf("--max-ts=%d", row.created.Int64+int64(row.settings.max_time)),
            fmt.Sprintf("--force-sf-db=%s", db.GetDbName()),
        }

        if row.settings.named_params.Valid && row.settings.named_params.String != "" {
            params = append(params, fmt.Sprintf("--named-params=%s", row.settings.named_params.String))
        }

        if row.JobData != "" {
            params = append(params, fmt.Sprintf("--job-data=%s", row.JobData))
        }

        if testId := os.Getenv("PHPUNIT_SELENIUM_TEST_ID"); testId != "" {
            params = append(params, fmt.Sprintf("--PHPUNIT_SELENIUM_TEST_ID=%s", testId))
        }

        if row.settings.debug_enabled == 1 && row.settings.created > time.Now().Unix()-DEBUG_TIMEOUT {
            params = append(params, "--debug-mode")
        }

        if row.settings.profiling_enabled == 1 && row.settings.created > time.Now().Unix()-PROFILING_TIMEOUT {
            params = append(params, "--enable-profiling")
        }

        if row.timetable_id.Valid && row.timetable_id.Int64 != 0 {
            params = append(params, fmt.Sprintf("--timetable-id=%d", row.timetable_id.Int64))
        }

        ev := &badoo_phproxyd.RequestRun{
            Script:       proto.String(script),
            Hash:         proto.Uint64(row.Id),
            Tag:          proto.String(PHPROXY_TAG),
            Force:        proto.Int32(1),
            Params:       params,
            Store:        badoo_phproxyd.StoreT_FILES.Enum(),
            FreeAfterRun: proto.Bool(false),
        }

        _, rawResp, err = d.call(ev)
        if err != nil {
            continue
        }

        resp, ok := rawResp.(*badoo_phproxyd.ResponseGeneric)
        if !ok {
            log.Errorf("Unexpected response from host %s when doing run, type: %T, response: %+v", d.hostname, rawResp, rawResp)
            continue
        }

        if resp.GetErrorCode() != 0 {
            log.Errorf("Unexpected response from host %s when doing run, got code %d and text %s", d.hostname, resp.GetErrorCode(), resp.GetErrorText())
            continue
        }
    }
}
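
// APIAcceptTTJobs converts incoming API job requests into timetable entries,
// validating script settings and locations along the way. It rejects batches
// that would push a dispatcher over MAX_API_JOBS, inserts the entries in a
// single transaction, notifies the affected dispatchers about the new rows
// and returns the inserted ids.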
func APIAcceptTTJobs(jobs []*thunder.RequestAddJobsJobT) ([]uint64, error) {
    now := uint64(time.Now().Unix())

    classLocType := make(map[string]string) // class_name => location_type
    ttRows := make([]*TimetableEntry, 0, len(jobs))
    perClassLoc := make(map[string]map[string][]*TimetableEntry)

    for _, row := range jobs {
        settings, err := getScriptSettings(row.GetClassName())
        if err != nil {
            return nil, err
        }

        classLocType[row.GetClassName()] = settings.location_type

        jrow := new(TimetableEntry)
        jrow.class_name = row.GetClassName()

        if row.Repeat == nil {
            jrow.repeat = settings.repeat_job
        } else {
            if row.Repeat.Value == nil {
                jrow.repeat.Valid = false
            } else {
                jrow.repeat.Valid = true
                jrow.repeat.Int64 = int64(row.Repeat.GetValue())
            }
        }

        jrow.default_retry = settings.retry_job
        jrow.created = now

        if row.SettingsId == nil {
            jrow.settings_id = settings.id
            jrow.settings = settings
        } else {
            jrow.settings_id = row.GetSettingsId()

            allSettingsMutex.Lock()
            jrow.settings = allSettings[jrow.settings_id]
            allSettingsMutex.Unlock()

            if jrow.settings == nil {
                ids := make(map[uint64]bool)
                ids[jrow.settings_id] = true

                err := loadNewIds(ids)
                if err != nil {
                    return nil, err
                }

                allSettingsMutex.Lock()
                jrow.settings = allSettings[jrow.settings_id]
                allSettingsMutex.Unlock()

                if jrow.settings == nil {
                    return nil, fmt.Errorf("Incorrect value of settings_id: %v", jrow.settings_id)
                }
            }

            if jrow.settings.location_type != settings.location_type {
                return nil, fmt.Errorf("You are not allowed to specify settings_id that has a different location_type, row: %+v", row)
            }
        }

        jrow.NextLaunchTs.Valid = true
        if row.NextLaunchTs == nil {
            jrow.NextLaunchTs.Int64 = int64(now)
        } else {
            jrow.NextLaunchTs.Int64 = row.GetNextLaunchTs()
        }

        if row.Location == nil {
            jrow.location = settings.location
        } else {
            jrow.location = row.GetLocation()
            if settings.location_type == LOCATION_TYPE_ANY && jrow.location != settings.location {
                return nil, fmt.Errorf("For location_type=any scripts the location field must be equal to the settings location, row: %+v", row)
            }
        }

        jrow.JobData = row.GetJobData()

        if row.Method == nil {
            jrow.method = METHOD_RUN
        } else {
            jrow.method = row.GetMethod()
        }

        if row.GenerationId == nil {
            jrow.generation_id.Valid = false
        } else {
            jrow.generation_id.Valid = true
            jrow.generation_id.Int64 = row.GetGenerationId()
        }

        ttRows = append(ttRows, jrow)

        el, ok := perClassLoc[jrow.class_name]
        if !ok {
            el = make(map[string][]*TimetableEntry)
            perClassLoc[jrow.class_name] = el
        }

        el[jrow.location] = append(el[jrow.location], jrow)
    }

    for className, locRows := range perClassLoc {
        for location, rows := range locRows {
            key := DEFAULT_LOCATION_IDX
            if classLocType[className] == LOCATION_TYPE_EACH {
                key = location
            }

            currentCnt := 0
            if ch := getDispatcherJobsCountCh(className, key); ch != nil {
                respCh := make(chan int, 1)
                ch <- &JobsCountRequest{RespCh: respCh}
                currentCnt = <-respCh
            }

            if currentCnt+len(rows) > MAX_API_JOBS {
                return nil, fmt.Errorf("Too many jobs: %d (current) + %d (adding) > %d (max)", currentCnt, len(rows), MAX_API_JOBS)
            }
        }
    }

    err := db.DoInLazyTransaction(func(tx *db.LazyTrx) error {
        return addToTimetable(tx, ttRows)
    })

    if err != nil {
        return nil, err
    }

    for className, locRows := range perClassLoc {
        for location, rows := range locRows {
            key := DEFAULT_LOCATION_IDX
            if classLocType[className] == LOCATION_TYPE_EACH {
                key = location
            }

            notifyAboutNewTTRows(className, key, rows, false)
        }
    }

    ids := make([]uint64, 0, len(ttRows))
    for _, row := range ttRows {
        ids = append(ids, row.id)
    }

    return ids, nil
}