示例#1
0
文件: launcher.go 项目: badoo/thunder
// persistFinishAndNotify records the completion of a run queue entry.
//
// The location index is resolved first, so a failed lookup aborts before
// anything else happens. Rows bound to a timetable entry (timetable_id != 0)
// are reported through notifyTTFinished; rows without one are removed from the
// run queue inside a lazy transaction. Every failure is logged as a warning
// and returned to the caller; nil is returned on success.
func (d *LauncherData) persistFinishAndNotify(row *RunQueueEntry, success bool, prevStatus string) error {
	location, err := getLocationIdx(row.settings.location_type, d.hostname)
	if err != nil {
		log.Warningf("Could not get location idx for row %+v, settings: %+v, reason: %s", row, row.settings, err.Error())
		return err
	}

	// Timetable-backed rows: hand the result over to the timetable side.
	if ttID := row.timetable_id.Int64; ttID != 0 {
		notifyErr := notifyTTFinished(row.ClassName, location, uint64(ttID), row.Id, success, true, prevStatus)
		if notifyErr != nil {
			log.Warningf("Could not notify about timetable finish: %s", notifyErr.Error())
			return notifyErr
		}

		return nil
	}

	// No timetable entry behind this row: just drop it from the run queue.
	deleteErr := db.DoInLazyTransaction(func(tx *db.LazyTrx) error {
		return deleteFromRunQueue(tx, []uint64{row.Id}, prevStatus)
	})
	if deleteErr != nil {
		log.Warningf("Could not delete incorrectly finished run queue entry in %+v: %s", row, deleteErr.Error())
		return deleteErr
	}

	return nil
}
示例#2
0
文件: launcher.go 项目: badoo/thunder
// processFinished cleans up every entry currently held in d.finishedMap.
//
// A RequestFree call is issued for each run id (its result is not inspected),
// then all collected ids are deleted from the run queue in one lazy
// transaction. Only when that transaction succeeds are the entries dropped
// from the in-memory maps; on failure the error is logged and the maps are
// left untouched.
func (d *LauncherData) processFinished() {
	ids := make([]uint64, 0, len(d.finishedMap))
	for runID := range d.finishedMap {
		d.call(&badoo_phproxyd.RequestFree{Hash: proto.Uint64(runID)})
		ids = append(ids, runID)
	}

	if len(ids) == 0 {
		return
	}

	// Map iteration order is random; sort so the ids are passed in ascending order.
	sort.Sort(common.UInt64Slice(ids))

	if err := db.DoInLazyTransaction(func(tx *db.LazyTrx) error {
		return deleteFromRunQueue(tx, ids, RUN_STATUS_FINISHED)
	}); err != nil {
		log.Errorf("Could not delete rows from run queue for hostname=%s: %s", d.hostname, err.Error())
		return
	}

	for _, finished := range d.finishedMap {
		d.delFromMaps(finished.Id)
	}
}
示例#3
0
文件: launcher.go 项目: badoo/thunder
// processUpdateStatusRequest moves a run queue row from req.PrevStatus to
// req.Status.
//
// The row must be present in d.allMap and its current status must equal
// req.PrevStatus; otherwise nothing is changed. The new status is written to
// the database first and applied to the in-memory row only if that succeeds.
// The outcome (nil on success) is always sent to req.errCh when the function
// returns.
func (d *LauncherData) processUpdateStatusRequest(req *LauncherUpdateStatusRequest) {
	var err error
	defer func() { req.errCh <- err }()

	entry := d.allMap[req.RunId]

	switch {
	case entry == nil:
		err = fmt.Errorf("No such rq row id=%d", req.RunId)
		return
	case entry.RunStatus != req.PrevStatus:
		err = fmt.Errorf("Previous status mismatch for rq row id=%d: req.prev=%s, actual=%s", req.RunId, req.PrevStatus, entry.RunStatus)
		return
	}

	// Assign to the outer err (no :=) so the deferred send reports the result.
	if err = db.DoInLazyTransaction(func(tx *db.LazyTrx) error {
		return updateRunStatus(tx, req.RunId, req.Status, req.PrevStatus)
	}); err != nil {
		log.Errorf("Could not update run status of run_id=%d to %s: %s", req.RunId, req.Status, err.Error())
		return
	}

	d.updateStatus(entry, req.Status)
}
示例#4
0
// redispatch drains the waiting heap and turns every entry that can be
// launched right now into a run queue row.
//
// Entries are popped one by one and then either:
//   - scheduled for deletion (a kill request is pending, or the TTL expired),
//   - put back to the waiting list (not due yet, instance limit reached,
//     duplicate job data, or no host could be selected), or
//   - converted into a RunQueueEntry and moved to the "added" set.
//
// New run queue rows and deletions are persisted in a single lazy transaction.
// On success the hosts are notified about their new rows; on failure the
// in-memory state is rolled back and another redispatch is scheduled in one
// second.
func (d *DispatcherData) redispatch() {
	// Rows that were popped from the heap but not dispatched are pushed back
	// when the function returns, whichever path it takes.
	returnToWaitingList := make([]*TimetableEntry, 0)
	defer func() {
		for _, row := range returnToWaitingList {
			d.addToWaiting(row)
		}
	}()

	now := uint64(time.Now().Unix())

	newRqList := make([]*RunQueueEntry, 0)
	toDeleteFromWaitingList := make([]*TimetableEntry, 0)

	for l := d.waitingList.Len(); l > 0; l-- {
		row := heap.Pop(&d.waitingList).(*TimetableEntry)
		delete(d.waitingMap, row.id)

		// A pending kill request means nothing new is launched: drop every waiting row.
		if d.killRequest != nil {
			toDeleteFromWaitingList = append(toDeleteFromWaitingList, row)
			continue
		}

		// Not due yet: arm the timer for this row and stop processing.
		// NOTE(review): stopping here presumably relies on the heap being
		// ordered by next launch time — confirm against the heap implementation.
		if uint64(row.NextLaunchTs.Int64) > now {
			d.tickRedispatchCh = time.After(time.Second * time.Duration(uint64(row.NextLaunchTs.Int64)-now))
			returnToWaitingList = append(returnToWaitingList, row)
			break
		}

		// Instance limit reached: nothing more can be added in this pass.
		if len(d.addedMap) >= row.settings.instance_count {
			returnToWaitingList = append(returnToWaitingList, row)
			break
		}

		// A row with the same job data is already added: keep this one waiting
		// and warn about the duplicate only once per row.
		if _, ok := d.addedJobData[row.JobData]; ok {
			if !row.reportedDup {
				log.Warningf("Duplicate job %s for class %s and location %s", row.JobData, d.className, row.location)
				row.reportedDup = true
			}

			returnToWaitingList = append(returnToWaitingList, row)
			continue
		}

		// TTL applies to METHOD_RUN rows only; expired rows are deleted.
		if row.method == METHOD_RUN && row.settings.ttl > 0 && now > row.created+uint64(row.settings.ttl) {
			if row.finish_count == 0 {
				log.Warningf("Job expired before being run even once: job %s for class %s and location %s", row.JobData, d.className, row.location)
			}
			toDeleteFromWaitingList = append(toDeleteFromWaitingList, row)
			continue
		}

		// do not try to dispatch next ones if selectHostname failed, and do not forget to return the row as well
		hostname, err := selectHostname(row.location, row.settings.location_type, d.rusage.cpu_usage, d.rusage.max_memory)
		if err != nil {
			logFailedLocation(row.settings, row.location, err.Error())
			d.tickRedispatchCh = time.After(time.Second)
			returnToWaitingList = append(returnToWaitingList, row)
			break
		} else {
			settings := row.settings
			// Settings created recently by a developer (anyone but "wwwrun") for
			// location_type=any are routed to the developer debug host instead.
			if settings.location_type == LOCATION_TYPE_ANY && (settings.developer.String != "") && (settings.developer.String != "wwwrun") && ((now - uint64(settings.created)) <= DEVELOPER_CUSTOM_PATH_TIMEOUT) {
				hostname = DEVELOPER_DEBUG_HOSTNAME
			}
			log.Debugln("Selected ", hostname, " for ", row.location, " (loc_type=", settings.location_type, ")")
		}

		nullNow := sql.NullInt64{Valid: true, Int64: int64(now)}

		queueRow := &RunQueueEntry{
			ClassName:      d.className,
			timetable_id:   sql.NullInt64{Valid: true, Int64: int64(row.id)},
			generation_id:  row.generation_id,
			hostname:       hostname,
			hostname_idx:   getHostnameIdx(hostname),
			JobData:        row.JobData,
			method:         row.method,
			created:        nullNow,
			RunStatus:      RUN_STATUS_WAITING,
			waiting_ts:     nullNow,
			should_init_ts: nullNow,
			token:          row.token,
			retry_attempt:  row.retry_count,
			settings_id:    row.settings_id,
			settings:       row.settings,
		}

		newRqList = append(newRqList, queueRow)

		row.added_to_queue_ts.Valid = true
		row.added_to_queue_ts.Int64 = int64(now)

		d.addToAdded(row)
	}

	err := db.DoInLazyTransaction(func(tx *db.LazyTrx) error {
		return addToQueueAndDeleteExpired(tx, newRqList, toDeleteFromWaitingList)
	})

	if err == nil {
		for _, row := range toDeleteFromWaitingList {
			d.deletedIds[row.id] = DELETE_IDS_KEEP_GENERATIONS
		}

		// all rows can expire by TTL in this loop, so check if it is the case and notify job generator about it
		if len(toDeleteFromWaitingList) > 0 {
			d.checkZero("redispatch")
		}

		if len(newRqList) > 0 {
			// Group the new rows so that each host gets a single notification.
			perHost := make(map[string][]*RunQueueEntry)
			for _, row := range newRqList {
				perHost[row.hostname] = append(perHost[row.hostname], row)
			}

			for hostname, rows := range perHost {
				notifyAboutNewRQRows(hostname, rows, false)
			}
		}

		return
	}

	d.tickRedispatchCh = time.After(time.Second)

	// restore internal structures back in case of error
	log.Warnf("Could not add to run queue for class %s and location %s to database: %s", d.className, d.location, err.Error())

	for _, rqRow := range newRqList {
		row, ok := d.addedMap[uint64(rqRow.timetable_id.Int64)]

		if ok {
			row.added_to_queue_ts.Valid = false
			row.added_to_queue_ts.Int64 = 0
			row.id = uint64(rqRow.timetable_id.Int64)
			d.removeFromAdded(row)
			d.addToWaiting(row)
		} else {
			// timetable_id is a sql.NullInt64; pass the integer itself, since
			// %d cannot format the struct's bool field.
			log.Warnf("Internal consistency error: could not find row with timetable id %d", rqRow.timetable_id.Int64)
		}
	}
}
示例#5
0
// processFinished handles a "finished" event for a timetable entry that is
// currently in the added set:
//  1. the outcome is always sent to ev.errorCh (nil if everything went fine);
//  2. if anything fails, the in-memory row is restored to the state it had
//     before the call.
//
// Depending on kill requests, TTL and retry limits the entry is either
// deleted from the timetable or rescheduled and moved back to the waiting
// list. A redispatch is triggered after every successful update.
func (d *DispatcherData) processFinished(ev *FinishEvent) {
	var err error
	defer func() { ev.errorCh <- err }()

	row, found := d.addedMap[ev.timetable_id]
	if !found {
		if waitingRow, isWaiting := d.waitingMap[ev.timetable_id]; isWaiting {
			log.Warningf("Got 'finished' event about waiting timetable_id: %d, class=%s, location=%s, row=%+v", ev.timetable_id, d.className, d.location, waitingRow)
			err = fmt.Errorf("timetable id is waiting: %d, class=%s, location=%s", ev.timetable_id, d.className, d.location)
			return
		}

		log.Warningf("Got 'finished' event about unknown timetable_id: %d, class=%s, location=%s", ev.timetable_id, d.className, d.location)
		err = fmt.Errorf("Unknown timetable id: %d, class=%s, location=%s", ev.timetable_id, d.className, d.location)
		return
	}

	now := uint64(time.Now().Unix())

	// Keep a snapshot of the row; this deferred function runs before the one
	// that reports err, so the rollback is done by the time the caller is told.
	snapshot := *row
	defer func() {
		if err == nil {
			// TODO: update rusage estimation
			return
		}

		log.Warnf("Restoring old tt row (error: %s) from %+v => %+v", err.Error(), row, snapshot)
		*row = snapshot
	}()

	// Initial events do not count as a real finish and leave the counters alone.
	if !ev.isInitial {
		row.finish_count++

		if ev.success {
			row.finished_successfully = 1
			row.retry_count = 0
		} else {
			row.finished_successfully = 0
			row.retry_count++
		}
	}

	row.NextLaunchTs.Valid = false
	row.NextLaunchTs.Int64 = 0

	// TTL is only honoured for METHOD_RUN; for any other method it stays zero.
	var ttl uint32
	if row.method == METHOD_RUN {
		ttl = row.settings.ttl
	}

	// we should not delete entries that have ttl > 0 and have hit max retries because there is "repeat" field still
	var shouldDelete bool
	switch {
	case d.killRequest != nil:
		shouldDelete = true
	case ttl == 0:
		shouldDelete = ev.success || row.retry_count >= row.settings.max_retries
	default:
		shouldDelete = now > row.created+uint64(ttl)
	}

	// persist is the transaction body shared by both outcomes: it removes the
	// run queue row (when the event carries one) and then either deletes the
	// timetable entry or logs its finish.
	persist := func(tx *db.LazyTrx) error {
		if ev.run_id != 0 {
			if !ev.deleteRq {
				return errors.New("unexpected deleteRq value")
			}

			if delErr := deleteFromRunQueue(tx, []uint64{ev.run_id}, ev.prevStatus); delErr != nil {
				return delErr
			}
		}

		if shouldDelete {
			return deleteAddedFromTimetable(tx, []uint64{ev.timetable_id})
		}

		return logTTFinish(tx, row, ev.havePrevFinishCount, ev.prevFinishCount)
	}

	if shouldDelete {
		if err = db.DoInLazyTransaction(persist); err != nil {
			log.Warnf("could not process finished: %s", err.Error())
			return
		}

		if row.id != ev.timetable_id {
			log.Warnf("Inconsistency of addedMap[%d] = row = %+v", ev.timetable_id, row)
			row.id = ev.timetable_id
		}

		d.removeFromAdded(row)
		d.deletedIds[ev.timetable_id] = DELETE_IDS_KEEP_GENERATIONS

		d.checkZero("processFinished")

		trigger(d.redispatchCh, "redispatch")
		return
	}

	// Work out when the entry should run again; the default is "right now".
	nextLaunch := int64(now)

	switch {
	case ev.success && row.added_to_queue_ts.Valid:
		// Successful runs repeat relative to the moment they were queued.
		nextLaunch = row.added_to_queue_ts.Int64 + row.repeat.Int64
	case !ev.success:
		// The first failures wait default_retry; from the third retry on the
		// delay is multiplied by 2, 4 and finally 8.
		delay := int64(row.default_retry)
		if row.retry_count >= 3 {
			exp := row.retry_count - 2
			if exp > 3 {
				exp = 3
			}

			delay = (1 << exp) * delay
		}

		nextLaunch += delay
	}

	row.NextLaunchTs.Valid = true
	row.NextLaunchTs.Int64 = nextLaunch

	row.finished_ts.Valid = false
	row.finished_ts.Int64 = 0

	row.added_to_queue_ts.Valid = false
	row.added_to_queue_ts.Int64 = 0

	if err = db.DoInLazyTransaction(persist); err != nil {
		return
	}

	d.removeFromAdded(row)
	d.addToWaiting(row)

	trigger(d.redispatchCh, "redispatch")
}
示例#6
0
文件: launcher.go 项目: badoo/thunder
// processWaiting tries to start every row currently held in d.waitingMap.
//
// For each row the status is first switched to init in the database; if that
// fails, the error is logged and the whole pass stops. Otherwise the row is
// updated in memory, the script command line is assembled and a RequestRun is
// sent through d.call. A failed call or an unexpected response only affects
// that row: the loop moves on to the next one.
func (d *LauncherData) processWaiting() {
	for runID, row := range d.waitingMap {
		err := db.DoInLazyTransaction(func(tx *db.LazyTrx) error {
			return setRunStatusToInit(tx, runID, row.settings.max_time)
		})
		if err != nil {
			log.Errorf("Could not update run status of run_id=%d to %s: %s", runID, RUN_STATUS_INIT, err.Error())
			return
		}

		// TODO: add host unreachable check

		d.updateStatus(row, RUN_STATUS_INIT)

		// Deadline for the run; also handed to the script via --max-ts.
		maxTs := row.created.Int64 + int64(row.settings.max_time)
		row.max_finished_ts.Int64 = maxTs
		row.max_finished_ts.Valid = true

		scriptPath := getScriptPath(row.settings)

		// Mandatory arguments first, optional ones are appended below.
		args := []string{
			fmt.Sprintf("--id=%d", row.Id),
			row.ClassName,
			fmt.Sprintf("--instance-count=%d", row.settings.instance_count),
			fmt.Sprintf("--settings-id=%d", row.settings_id),
			fmt.Sprintf("--method=%s", row.method),
			fmt.Sprintf("--token=%s", row.token),
			fmt.Sprintf("--retry-attempt=%d", row.retry_attempt),
			fmt.Sprintf("--max-retries=%d", row.settings.max_retries),
			fmt.Sprintf("--max-ts=%d", maxTs),
			fmt.Sprintf("--force-sf-db=%s", db.GetDbName()),
		}

		if named := row.settings.named_params; named.Valid && named.String != "" {
			args = append(args, fmt.Sprintf("--named-params=%s", named.String))
		}

		if row.JobData != "" {
			args = append(args, fmt.Sprintf("--job-data=%s", row.JobData))
		}

		if testID := os.Getenv("PHPUNIT_SELENIUM_TEST_ID"); testID != "" {
			args = append(args, fmt.Sprintf("--PHPUNIT_SELENIUM_TEST_ID=%s", testID))
		}

		// Debug and profiling flags are only passed while the settings are
		// younger than the respective timeout.
		if row.settings.debug_enabled == 1 && row.settings.created > time.Now().Unix()-DEBUG_TIMEOUT {
			args = append(args, "--debug-mode")
		}

		if row.settings.profiling_enabled == 1 && row.settings.created > time.Now().Unix()-PROFILING_TIMEOUT {
			args = append(args, "--enable-profiling")
		}

		if row.timetable_id.Valid && row.timetable_id.Int64 != 0 {
			args = append(args, fmt.Sprintf("--timetable-id=%d", row.timetable_id.Int64))
		}

		runReq := &badoo_phproxyd.RequestRun{
			Script:       proto.String(scriptPath),
			Hash:         proto.Uint64(row.Id),
			Tag:          proto.String(PHPROXY_TAG),
			Force:        proto.Int32(1),
			Params:       args,
			Store:        badoo_phproxyd.StoreT_FILES.Enum(),
			FreeAfterRun: proto.Bool(false),
		}

		var reply proto.Message
		_, reply, err = d.call(runReq)
		if err != nil {
			continue
		}

		generic, isGeneric := reply.(*badoo_phproxyd.ResponseGeneric)

		switch {
		case !isGeneric:
			log.Errorf("Unexpected response from host %s when doing run, type: %T, response: %+v", d.hostname, reply, reply)
		case generic.GetErrorCode() != 0:
			log.Errorf("Unexpected response from host %s when doing run, got code %d and text %s", d.hostname, generic.GetErrorCode(), generic.GetErrorText())
		}
	}
}
示例#7
0
文件: api.go 项目: badoo/thunder
// APIAcceptTTJobs turns a batch of API job requests into timetable entries.
//
// Each request is completed with defaults taken from the current script
// settings of its class (repeat, retry, settings, next launch time, location,
// method). The batch is rejected as a whole if any request is invalid or if
// any (class, location) group would exceed MAX_API_JOBS together with the jobs
// its dispatcher already reports. Accepted entries are written in one lazy
// transaction and then announced via notifyAboutNewTTRows.
//
// It returns the id field of every created entry, in request order, or the
// first error encountered.
func APIAcceptTTJobs(jobs []*thunder.RequestAddJobsJobT) ([]uint64, error) {
	now := uint64(time.Now().Unix())

	locTypeByClass := make(map[string]string) // class_name => location_type
	entries := make([]*TimetableEntry, 0, len(jobs))

	// class_name => location => entries
	grouped := make(map[string]map[string][]*TimetableEntry)

	for _, job := range jobs {
		className := job.GetClassName()

		settings, err := getScriptSettings(className)
		if err != nil {
			return nil, err
		}

		locTypeByClass[className] = settings.location_type

		entry := &TimetableEntry{
			class_name:    className,
			default_retry: settings.retry_job,
			created:       now,
			JobData:       job.GetJobData(),
		}

		// repeat: absent => script default, present without a value => NULL.
		switch {
		case job.Repeat == nil:
			entry.repeat = settings.repeat_job
		case job.Repeat.Value == nil:
			entry.repeat.Valid = false
		default:
			entry.repeat.Valid = true
			entry.repeat.Int64 = int64(job.Repeat.GetValue())
		}

		if job.SettingsId == nil {
			entry.settings_id = settings.id
			entry.settings = settings
		} else {
			entry.settings_id = job.GetSettingsId()

			allSettingsMutex.Lock()
			entry.settings = allSettings[entry.settings_id]
			allSettingsMutex.Unlock()

			// Not cached yet: load the requested settings and look them up again.
			if entry.settings == nil {
				if loadErr := loadNewIds(map[uint64]bool{entry.settings_id: true}); loadErr != nil {
					return nil, loadErr
				}

				allSettingsMutex.Lock()
				entry.settings = allSettings[entry.settings_id]
				allSettingsMutex.Unlock()

				if entry.settings == nil {
					return nil, errors.New(fmt.Sprintf("Incorrect value of settings_id: %v", entry.settings_id))
				}
			}

			if entry.settings.location_type != settings.location_type {
				return nil, errors.New(fmt.Sprintf("You are not allowed to specify settings_id that has different location_type, row: %+v", job))
			}
		}

		// Launch immediately unless the request names its own time.
		entry.NextLaunchTs.Valid = true
		entry.NextLaunchTs.Int64 = int64(now)
		if job.NextLaunchTs != nil {
			entry.NextLaunchTs.Int64 = job.GetNextLaunchTs()
		}

		if job.Location == nil {
			entry.location = settings.location
		} else {
			entry.location = job.GetLocation()

			// NOTE(review): this compares the location of the settings chosen
			// for the entry with the current settings, not the location passed
			// in the request — confirm that this is the intended check.
			if settings.location_type == LOCATION_TYPE_ANY && entry.settings.location != settings.location {
				return nil, errors.New(fmt.Sprintf("For location_type=any scripts location field must be equal to current settings: %+v", job))
			}
		}

		entry.method = METHOD_RUN
		if job.Method != nil {
			entry.method = job.GetMethod()
		}

		entry.generation_id.Valid = job.GenerationId != nil
		if entry.generation_id.Valid {
			entry.generation_id.Int64 = job.GetGenerationId()
		}

		entries = append(entries, entry)

		byLocation := grouped[entry.class_name]
		if byLocation == nil {
			byLocation = make(map[string][]*TimetableEntry)
			grouped[entry.class_name] = byLocation
		}

		byLocation[entry.location] = append(byLocation[entry.location], entry)
	}

	// dispatcherKey picks the location key used to address a dispatcher: the
	// real location for LOCATION_TYPE_EACH classes, the default index otherwise.
	dispatcherKey := func(className, location string) string {
		if locTypeByClass[className] == LOCATION_TYPE_EACH {
			return location
		}

		return DEFAULT_LOCATION_IDX
	}

	// Enforce the per-dispatcher job limit before anything is written.
	for className, byLocation := range grouped {
		for location, rows := range byLocation {
			current := 0

			if countCh := getDispatcherJobsCountCh(className, dispatcherKey(className, location)); countCh != nil {
				reply := make(chan int, 1)
				countCh <- &JobsCountRequest{RespCh: reply}
				current = <-reply
			}

			if current+len(rows) > MAX_API_JOBS {
				return nil, fmt.Errorf("Too many jobs: %d (current) + %d (adding) > %d (max)", current, len(rows), MAX_API_JOBS)
			}
		}
	}

	if err := db.DoInLazyTransaction(func(tx *db.LazyTrx) error {
		return addToTimetable(tx, entries)
	}); err != nil {
		return nil, err
	}

	for className, byLocation := range grouped {
		for location, rows := range byLocation {
			notifyAboutNewTTRows(className, dispatcherKey(className, location), rows, false)
		}
	}

	ids := make([]uint64, len(entries))
	for i, entry := range entries {
		ids[i] = entry.id
	}

	return ids, nil
}