Example #1
func (d *DispatcherData) checkDeleted() {
	checkIds := make([]uint64, 0, len(d.waitingMap)+len(d.addedMap))

	for _, row := range d.waitingMap {
		checkIds = append(checkIds, row.id)
	}

	for _, row := range d.addedMap {
		checkIds = append(checkIds, row.id)
	}

	existing, err := getExistingTTIds(checkIds)
	if err != nil {
		log.Warnf("Could not check existing tt ids for class=%s and location=%s: %s", d.className, d.location, err.Error())
		return
	}

	for _, row := range d.waitingMap {
		if !existing[row.id] {
			log.Warnf("Timetable row disappeared from waiting state in DB: %+v", row)
			d.removeFromWaiting(row)
		}
	}

	for _, row := range d.addedMap {
		if !existing[row.id] {
			log.Warnf("Timetable row disappeared from added state in DB: %+v", row)
			d.removeFromAdded(row)
		}
	}
}
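getExistingTTIds is not shown in these examples; judging by the call site it takes a list of timetable ids and returns the set of ids that still exist in the database. A minimal sketch under that assumption (the dbConn handle, table and column names are guesses, not the real schema):

func getExistingTTIds(ids []uint64) (map[uint64]bool, error) {
	existing := make(map[uint64]bool, len(ids))
	if len(ids) == 0 {
		return existing, nil
	}

	// build "?,?,...,?" placeholders for the IN clause
	placeholders := make([]string, len(ids))
	args := make([]interface{}, len(ids))
	for i, id := range ids {
		placeholders[i] = "?"
		args[i] = id
	}

	// dbConn is a hypothetical package-level *sql.DB
	rows, err := dbConn.Query("SELECT id FROM timetable WHERE id IN ("+strings.Join(placeholders, ",")+")", args...)
	if err != nil {
		return nil, err
	}
	defer rows.Close()

	for rows.Next() {
		var id uint64
		if err := rows.Scan(&id); err != nil {
			return nil, err
		}
		existing[id] = true
	}
	return existing, rows.Err()
}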
Example #2
func (d *LauncherData) addToMaps(row *RunQueueEntry) {
	if row.settings == nil {
		buf := make([]byte, 5000)
		n := runtime.Stack(buf, false)
		log.Warningf("Incorrect row in run queue (addToMaps), settings are invalid: %+v\ntrace:%s", row, buf[0:n])
		return
	}
	if m := d.getMapByStatus(row.RunStatus); m != nil {
		if LAUNCHER_DB_DEBUG {
			log.Printf("RQ row from db: id=%d, class=%s, job_data=%s, hostname=%s", row.Id, row.ClassName, row.JobData, row.hostname)
		}
		if d.allMap[row.Id] != nil {
			log.Warnf("Trying to add already added into run_queue (all map): %+v", row)
			return
		}
		if m[row.Id] != nil {
			log.Warnf("Trying to add already added into run_queue (own map): %+v", row)
			return
		}
		m[row.Id] = row
		d.allMap[row.Id] = row
	} else {
		log.Warnf("Broken run status: %+v", row)
	}
}
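getMapByStatus is used both here and in delFromMaps (Example #13); from those call sites it returns the per-status map for an entry, or nil for a broken status. An inferred sketch — the status type is written as string, and the constants beyond RUN_STATUS_WAITING/RUN_STATUS_INIT as well as the exact map fields are assumptions:

func (d *LauncherData) getMapByStatus(status string) map[uint64]*RunQueueEntry {
	switch status {
	case RUN_STATUS_WAITING:
		return d.waitingMap
	case RUN_STATUS_INIT:
		return d.initMap // drained by processWaitingInInit (Example #3)
	case RUN_STATUS_RUNNING:
		return d.runningMap
	case RUN_STATUS_FINISHED:
		return d.finishedMap
	default:
		return nil // caller logs "Broken run status"
	}
}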
Example #3
func (d *LauncherData) processWaitingInInit() {
	for run_id, cachedRow := range d.initMap {
		_, rawResp, err := d.call(&badoo_phproxyd.RequestCheck{Hash: proto.Uint64(run_id)})

		if shouldHaveFinished(cachedRow) {
			d.logIncorrectFinish(nil, cachedRow, RUN_STATUS_INIT)
			continue
		}

		if err != nil {
			continue
		}

		row, err := getRunningInfo(run_id)
		if err != nil {
			log.Warnf("Could not get running info from DB in method processWaitingInInit for hostname=%s, class=%s, run id=%d, err: %s", d.hostname, cachedRow.ClassName, run_id, err.Error())
			continue
		}

		if cachedRow.settings_id != row.settings_id {
			log.Warnf("Broken row in cache or db for id=%d, settings_id is different (cache=%d, db=%d)", run_id, cachedRow.settings_id, row.settings_id)
			continue
		}

		row.settings = cachedRow.settings

		switch resp := rawResp.(type) {
		case *badoo_phproxyd.ResponseCheck:
			d.logIncorrectFinish(resp, row, row.RunStatus)
		case *badoo_phproxyd.ResponseGeneric:
			result := -badoo_phproxyd.Errno(resp.GetErrorCode())

			if result == badoo_phproxyd.Errno_ERRNO_NOT_FOUND || result == badoo_phproxyd.Errno_ERRNO_ALREADY_RUNNING {
				// we should log intelligently when the server is so slow that it does not launch the script within 0.5 sec
				if time.Now().Unix()-INIT_TIMEOUT_SEC > row.should_init_ts.Int64 {
					action := KILL_ACTION_NO_ACTION
					if result == badoo_phproxyd.Errno_ERRNO_NOT_FOUND {
						action = KILL_ACTION_LOG_SCRIPT_FINISH_INIT
					}

					d.terminate(row, action)
				} else if result == badoo_phproxyd.Errno_ERRNO_NOT_FOUND {
					d.terminate(row, KILL_ACTION_SET_WAITING)
				}
			} else if result == badoo_phproxyd.Errno_ERRNO_FAILED_FINISHED {
				log.Warningf("Script %s finished with failure at %s", row.ClassName, d.hostname)
				d.logIncorrectFinish(nil, row, RUN_STATUS_INIT)
			} else if result == badoo_phproxyd.Errno_ERRNO_WAIT_FOR_FREE {
				log.Warningf("Waiting in init: Lost results for %s at %s", row.ClassName, d.hostname)
				d.logIncorrectFinish(nil, row, RUN_STATUS_INIT)
			} else {
				log.Warningf("Unexpected return code %d (%s) for check request for class=%s, hostname=%s", result, result, row.ClassName, d.hostname)
			}
		default:
			log.Warningf("Received unexpected result from phproxyd at %s: type %T, result: %v", d.hostname, rawResp, rawResp)
		}
	}
}
Example #4
func (d *DispatcherData) processAllJobs(jobs []*TimetableEntry) {
	newJobs := make([]*TimetableEntry, 0)
	toProcessFinished := make([]*FinishEvent, 0)

	for _, row := range jobs {
		if el, ok := d.waitingMap[row.id]; ok {

			if row.finish_count >= el.finish_count && row.added_to_queue_ts.Valid && !row.finished_ts.Valid {
				d.removeFromWaiting(el)
				row.reportedDup = el.reportedDup
				d.addToAdded(row)

				log.Warnf("external waiting to added promotion OLD:%+v---NEW:%+v", el, row)
			} else if row.finish_count > el.finish_count && !row.added_to_queue_ts.Valid {
				d.removeFromWaiting(el)
				d.addToWaiting(row)
				log.Warnf("external finish_count promotion OLD:%+v---NEW:%+v", el, row)
			}
			continue
		}

		if el, ok := d.addedMap[row.id]; ok {
			// An external job notification can update the timetable by incrementing its finish count.
			// We do not really care about any modifications made by the script except for the finished_successfully field.
			if row.finish_count > el.finish_count {
				log.Debugf("External finish for tt row %+v", row)
				toProcessFinished = append(toProcessFinished, &FinishEvent{
					timetable_id: row.id, success: row.finished_successfully != 0,
					havePrevFinishCount: true, prevFinishCount: row.finish_count, errorCh: make(chan error, 1)})
			}
		} else if d.deletedIds[row.id] == 0 {
			log.Debugf("External timetable row %+v", row)
			newJobs = append(newJobs, row)
		}
	}

	if len(newJobs) > 0 {
		d.acceptNewJobs(newJobs)
	}

	for _, el := range toProcessFinished {
		log.Debugf("processFinished: %+v", el)
		d.processFinished(el)
	}

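	// age out deletion tombstones: each pass decrements the per-id counter
	// that delFromMaps/redispatch set to DELETE_IDS_KEEP_GENERATIONS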
	for ttId, refCount := range d.deletedIds {
		if refCount--; refCount <= 0 {
			delete(d.deletedIds, ttId)
		} else {
			d.deletedIds[ttId] = refCount
		}
	}
}
Example #5
func (d *DispatcherData) addToAdded(row *TimetableEntry) {
	if el, ok := d.addedMap[row.id]; ok {
		log.Warnf("attempted to add already added OLD:%+v NEW:%+v", el, row)
		d.printTrace()
		return
	}
	if el, ok := d.waitingMap[row.id]; ok {
		log.Warnf("attempted to add already waiting OLD:%+v NEW:%+v", el, row)
		d.printTrace()
		return
	}
	d.addedMap[row.id] = row
	d.addedJobData[row.JobData] = row.id
}
Example #6
func (d *DispatcherData) addToWaiting(row *TimetableEntry) {
	if el, ok := d.addedMap[row.id]; ok {
		log.Warnf("attempted to wait already added OLD:%+v NEW:%+v", el, row)
		d.printTrace()
		return
	}
	if el, ok := d.waitingMap[row.id]; ok {
		log.Warnf("attempted to wait already waiting OLD:%+v NEW:%+v", el, row)
		d.printTrace()
		return
	}
	heap.Push(&d.waitingList, row)
	d.waitingMap[row.id] = row
}
Example #7
func (d *DispatcherData) removeFromWaiting(row *TimetableEntry) {
	if _, ok := d.waitingMap[row.id]; !ok {
		log.Warnf("attempted to unwait nonexistent ROW:%+v", row)
		d.printTrace()
		return
	}
	if el, ok := d.addedMap[row.id]; ok {
		log.Warnf("attempted to unwait added OLD:%+v NEW:%+v", el, row)
		d.printTrace()
		return
	}
	heap.Remove(&d.waitingList, row.index)
	delete(d.waitingMap, row.id)
}
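addToWaiting and removeFromWaiting lean on container/heap: each entry carries an index field that the heap keeps up to date, which is what makes heap.Remove(&d.waitingList, row.index) possible, and redispatch (Example #19) pops entries in NextLaunchTs order. A plausible minimal implementation of that priority queue (the type name and ordering key are inferred, not confirmed):

type TimetableEntryList []*TimetableEntry

func (l TimetableEntryList) Len() int { return len(l) }

func (l TimetableEntryList) Less(i, j int) bool {
	return l[i].NextLaunchTs.Int64 < l[j].NextLaunchTs.Int64
}

func (l TimetableEntryList) Swap(i, j int) {
	l[i], l[j] = l[j], l[i]
	l[i].index = i // keep indices in sync so heap.Remove can locate rows
	l[j].index = j
}

func (l *TimetableEntryList) Push(x interface{}) {
	row := x.(*TimetableEntry)
	row.index = len(*l)
	*l = append(*l, row)
}

func (l *TimetableEntryList) Pop() interface{} {
	old := *l
	n := len(old)
	row := old[n-1]
	*l = old[:n-1]
	return row
}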
Example #8
func (d *DispatcherData) removeFromAdded(row *TimetableEntry) {
	if _, ok := d.addedMap[row.id]; !ok {
		log.Warnf("attempted to unadd nonexistent ROW:%+v", row)
		d.printTrace()
		return
	}
	if el, ok := d.waitingMap[row.id]; ok {
		log.Warnf("attempted to unadd waiting OLD:%+v NEW:%+v", el, row)
		d.printTrace()
		return
	}

	delete(d.addedMap, row.id)
	delete(d.addedJobData, row.JobData)
}
Example #9
func (d *LauncherData) acceptWaiting(jobs []*RunQueueEntry) {
	idsToSelect := make([]uint64, 0)
	settingsMap := make(map[uint64]*ScriptSettings)

	for _, row := range jobs {
		if row.settings == nil {
			log.Warningf("Incorrect row in run queue (waitingCh), settings are invalid: %+v", row)
			continue
		}

		if d.allMap[row.Id] != nil {
			continue
		}
		settingsMap[row.Id] = row.settings
		idsToSelect = append(idsToSelect, row.Id)
	}

	rqs, err := getRunningInfos(idsToSelect)
	if err != nil {
		log.Warnf("acceptWaiting could not select run_queue entries: %s", err.Error())
		return
	}
	for _, row := range rqs {
		row.settings = settingsMap[row.Id]
		d.addToMaps(row)
	}

	d.processWaiting()
}
Example #10
func (d *DispatcherData) processNewJobsEv(ev *NewJobs) {
	ids := make([]uint64, 0, len(ev.rows))
	settingsMap := make(map[uint64]*ScriptSettings)

	for _, row := range ev.rows {
		if row.settings == nil {
			log.Warningf("Incorrect row in timetable (processNewJobsEv), settings are invalid: %+v", row)
			continue
		}

		if d.waitingMap[row.id] != nil || d.addedMap[row.id] != nil {
			continue
		}

		settingsMap[row.id] = row.settings
		ids = append(ids, row.id)
	}

	rows, err := selectTimetableByIds(ids)
	if err != nil {
		log.Warnf("could not select tt_entries for ids:%+v err:%s", ids, err.Error())
		return
	}

	for _, row := range rows {
		row.settings = settingsMap[row.id]
	}

	d.acceptNewJobs(rows)
}
Example #11
func WriteLogsThread(filename string) {
	log.Infof("Started write logs thread to file=%s", filename)

	reopenTick := time.Tick(time.Second * 10)

	fp, err := os.OpenFile(filename, os.O_CREATE|os.O_WRONLY|os.O_APPEND, 0666)
	var wr *bufio.Writer

	if err != nil {
		log.Errorf("Could not open %s: %s", filename, err.Error())
	} else {
		wr = bufio.NewWriterSize(fp, 65536)
	}

	for {
		select {
		case <-reopenTick:
			if fp != nil {
				fp.Close()
			}

			fp, err = os.OpenFile(filename, os.O_CREATE|os.O_WRONLY|os.O_APPEND, 0666)
			if err != nil {
				log.Warnf("Could not reopen %s: %s", filename, err.Error())
				wr = nil
				fp = nil
			} else {
				wr = bufio.NewWriterSize(fp, 65536)
			}
		case ev := <-rqhLog:
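			// drain everything already queued so the whole batch below is
			// encoded with a single writer flush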
			l := len(rqhLog)
			evs := make([]*FinishResult, 0, l+1)
			evs = append(evs, ev)

			for i := 0; i < l; i++ {
				evs = append(evs, <-rqhLog)
			}

			if wr != nil {
				encoder := json.NewEncoder(wr)

				for _, e := range evs {
					if err = encoder.Encode(e); err != nil {
						log.Errorf("Could not write to %s: %s", filename, err.Error())
					}
				}

				if err = wr.Flush(); err != nil {
					log.Errorf("Could not flush contents to %s: %s", filename, err.Error())
				}
			} else {
				log.Errorf("Failed to write %d events to rqh log because file %s could not be opened", len(evs), filename)
			}
		}

	}
}
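For context: the len(rqhLog) batching above only works because rqhLog is a buffered channel drained by this single goroutine. A wiring sketch (the buffer size and log path are illustrative, not taken from the real daemon):

var rqhLog = make(chan *FinishResult, 1000)

func init() {
	go WriteLogsThread("/local/logs/rqh.log") // hypothetical path
}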
Example #12
func FinalizeRestartWithTimeout() {

	currentRestart.Child.Kill()
	child_state, err := currentRestart.Child.Wait()
	log.Warnf("child %d failed to start and was killed, state: %v, err: %v", currentRestart.Child.Pid, child_state, err)

	currentRestart.MovePidfileBack()
	currentRestart = nil
}
Example #13
func (d *LauncherData) delFromMaps(id uint64) {
	row := d.allMap[id]
	if row == nil {
		log.Warnf("Trying to delete nonexistent run_queue id=%d from maps", id)
		return
	}
	if m := d.getMapByStatus(row.RunStatus); m != nil {
		if m[id] == nil {
			log.Warnf("Run queue entry missing in its map when deleting: %+v", row)
			return
		}
		delete(m, id)
	} else {
		log.Warnf("Broken run status when deleting: %+v", row)
		return
	}
	delete(d.allMap, id)
	d.deletedIds[id] = DELETE_IDS_KEEP_GENERATIONS
}
Example #14
func getHostname() string {
	hostname, err := os.Hostname()
	if err != nil {
		log.Warnf("can't get local hostname: %v", err)
		return "-"
	}

	cname, err := net.LookupCNAME(hostname)
	if err != nil {
		log.Warnf("can't get FQDN for %s (using as is): %v", hostname, err)
		return hostname
	}

	if cname[len(cname)-1] == '.' {
		cname = cname[:len(cname)-1]
	}

	return cname
}
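The manual trailing-dot check can equivalently be written with strings.TrimSuffix, which is a no-op when there is no trailing dot and stays safe even if cname is empty:

cname = strings.TrimSuffix(cname, ".")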
Example #15
func FinalizeRestartWithError(proc_status RestartProcStatus) {

	if proc_status.Err != nil {
		log.Errorf("couldn't collect state for child %d, %v", currentRestart.Child.Pid, proc_status.Err)
	}
	log.Warnf("child %d failed to start, collected %v", currentRestart.Child.Pid, proc_status.State)

	// not waiting for child, so have to release
	currentRestart.Child.Release()

	currentRestart.MovePidfileBack()
	currentRestart = nil
}
Example #16
func (server *Server) sendToPinba(req *pinba.Request, script_name string, duration time.Duration) {
	if server.pinbaSender == nil {
		return
	}

	req.ScriptName = script_name
	req.RequestCount = 1
	req.RequestTime = duration

	err := server.pinbaSender.Send(req)
	if err != nil { // this is pretty unlikely, therefore don't care about performance inside this 'if'

		shouldLog := func() bool {
			opErr, ok := err.(*net.OpError)
			if !ok {
				return true
			}

			syscallErr, ok := opErr.Err.(*os.SyscallError)
			if !ok {
				return true
			}

			errno, ok := syscallErr.Err.(syscall.Errno)
			if !ok {
				return true
			}

			if errno == syscall.ECONNREFUSED {
				// NOTE(antoxa): see CT-2394
				// just ignore connection refused errors, those happen when pinba is unavailable, but configured (since we use UDP)
				// we kinda lose on error reporting that nothing is logged to pinba, but in practice the message is just annoying
				return false
			}

			return true
		}()

		if shouldLog {
			log.Warnf("sendToPinba() failed: %v", err)
		}
	}
}
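On Go 1.13 and newer, the same unwrapping can most likely be collapsed into a single errors.Is call, since net.OpError and os.SyscallError both implement Unwrap and the chain bottoms out in a comparable syscall.Errno:

shouldLog := !errors.Is(err, syscall.ECONNREFUSED) // log everything except "connection refused"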
Example #17
func startKilling(className string) {
	killerThreads.Lock()
	defer killerThreads.Unlock()

	dispatchThreads.Lock()
	defer dispatchThreads.Unlock()

	locs, ok := dispatchThreads.v[className]
	if !ok {
		return
	}

	killMap := killerThreads.v[className]
	if killMap == nil {
		killMap = make(map[string]bool)
		killerThreads.v[className] = killMap
	}

	for loc, dt := range locs {
		if killMap[loc] {
			continue
		}

		req := &KillRequest{
			ResCh: make(chan error, 1),
		}

		log.Printf("Sending kill request to class=%s, location=%s", className, loc)
		select {
		case dt.killRequestCh <- req:
			killMap[loc] = true
		default:
			log.Warnf("Could not send kill request to class=%s, location=%s, kill channel was busy", className, loc)
		}
	}
}
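The receiving half of killRequestCh is not among these examples; presumably the dispatcher goroutine picks the request up in its event loop and remembers it, roughly like this sketch (grounded only in the d.killRequest checks of Examples #19 and #21):

for {
	select {
	case req := <-d.killRequestCh:
		// remember the request: redispatch() will now funnel every waiting
		// row into deletion instead of launching it, and report on req.ResCh
		d.killRequest = req
		trigger(d.redispatchCh, "redispatch")
		// ... other dispatcher events elided ...
	}
}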
Example #18
// Call this when you want to start your servers and stuff
func EventLoop(ports []Port) {
	defer log.Debug("exiting")

	initPhaseDuration = time.Since(startupTime)

	daemonConfig := config.GetDaemonConfig()

	// service-stats ports
	ports = append(ports, GpbPort("service-stats-gpb", stats_ctx, badoo_service.Gpbrpc))
	ports = append(ports, JsonPort("service-stats-gpb/json", stats_ctx, badoo_service.Gpbrpc))

	// build map of ports and do some sanity checks
	ph := make(map[string]*Port)
	for i := 0; i < len(ports); i++ {
		p := &ports[i]
		ph[p.Name] = p

		// json and gpb ports should have the same context
		//  so try and warn user about passing plain values in (as it makes a copy)
		if reflect.ValueOf(p.Handler).Kind() != reflect.Ptr {
			log.Infof("port[%d].Handler should be a pointer (you want gpbs and json to use the same context, right?) (now: %T)", i, p.Handler)
		}
	}

	getRestartSocket := func(rcd *RestartChildData, portName, portAddr string) (*RestartSocket, *os.File) {
		if rcd == nil {
			return nil, nil
		}

		restartSocket, exists := rcd.GpbrpcSockets[portName]
		if !exists {
			return nil, nil
		}

		restartFile := os.NewFile(restartSocket.Fd, "")

		if restartSocket.Address != portAddr {
			return nil, restartFile
		}

		return &restartSocket, restartFile
	}

	// start 'em all
	for _, lcf := range daemonConfig.GetListen() {
		portName, portAddr := lcf.GetProto(), lcf.GetAddress()
		port := ph[portName]

		if nil == port {
			log.Warnf("ignoring unknown port: %s at %s", portName, portAddr)
			continue
		}

		if port.IsStarted {
			log.Warnf("ignoring double startup for port: %s at %s", portName, portAddr)
			continue
		}

		listener, err := func() (listener net.Listener, err error) { // it's important that this should be a function, see defer inside
			restartSocket, restartFile := getRestartSocket(restartData, portName, portAddr)

			// this whole fd/file affair is very inconvenient:
			//  when getRestartSocket() returns an fd it can't close it yet, as it may still be used by FileListener
			defer restartFile.Close()

			if restartSocket == nil {
				listener, err = net.Listen("tcp", portAddr)
				if err != nil {
					log.Errorf("listen failed for server %s at %s: %s", portName, portAddr, err)
					return
				}
				log.Infof("port %s bound to address %s", portName, listener.Addr())

			} else {

				listener, err = net.FileListener(restartFile) // this dup()-s
				if err != nil {
					log.Errorf("failed to grab parent fd %d for %s at %s: %s", restartSocket.Fd, portName, portAddr, err)
					return
				}

				log.Infof("port %s bound to address %s (parent fd: %d)", portName, listener.Addr(), restartSocket.Fd)
			}
			return
		}()

		if err != nil {
			os.Exit(1)
		}

		// enable pinba only for ports that explicitly request it
		ps := func() gpbrpc.PinbaSender {
			if !lcf.GetPinbaEnabled() {
				return nil // explicit nil here
			}

			if pinbaSender == nil {
				log.Warnf("pinba is not configured, but pinba_enabled IS set for port %s: %s", portName, portAddr)
				return nil // explicit nil here
			}

			log.Infof("pinba configured for port %s:%s -> %s", portName, portAddr, pinbaSender.Address)
			return pinbaSender
		}()

		// slow request log time
		slowRequestTime := time.Duration(daemonConfig.GetSlowRequestMs()) * time.Millisecond

		srv := &Server{
			Name:    lcf.GetProto(),
			Address: lcf.GetAddress(),
			Server:  gpbrpc.NewServer(listener, port.Proto, port.Codec, port.Handler, ps, slowRequestTime),
		}
		go srv.Server.Serve()

		port.IsStarted = true
		StartedServers[port.Name] = srv // save it for laterz
	}

	// kill parent if this is a child of graceful restart
	if restartData != nil {
		syscall.Kill(restartData.PPid, syscall.SIGQUIT)
	}

	log.Infof("entering event loop")

	exitMethod := wait_for_signals()

	if exitMethod == EXIT_GRACEFULLY { // wait for established connections to close and then die

		// FIXME: should stop servers from accepting new connections here!

		const ATTEMPTS_PER_SEC = 2
		maxAttempts := daemonConfig.GetParentWaitTimeout() * ATTEMPTS_PER_SEC

		for i := uint32(0); i < maxAttempts; i++ {
			for _, srv := range StartedServers {
				currConn := atomic.LoadUint64(&srv.Server.Stats.ConnCur)
				if currConn > 0 {
					log.Debugf("%s still has %d connections", srv.Name, currConn)
					time.Sleep(time.Second / ATTEMPTS_PER_SEC)
				}
			}
		}
	} else {
		// do nothing for EXIT_IMMEDIATELY
	}

	// doing cleanups here
	// XXX: can this be moved to defer at the start of this function?
	if pidfile != nil {
		pidfile.CloseAndRemove()
	}
}
Example #19
func (d *DispatcherData) redispatch() {
	returnToWaitingList := make([]*TimetableEntry, 0)
	defer func() {
		for _, row := range returnToWaitingList {
			d.addToWaiting(row)
		}
	}()

	now := uint64(time.Now().Unix())

	newRqList := make([]*RunQueueEntry, 0)
	toDeleteFromWaitingList := make([]*TimetableEntry, 0)

	for l := d.waitingList.Len(); l > 0; l-- {
		row := heap.Pop(&d.waitingList).(*TimetableEntry)
		delete(d.waitingMap, row.id)

		if d.killRequest != nil {
			toDeleteFromWaitingList = append(toDeleteFromWaitingList, row)
			continue
		}

		if uint64(row.NextLaunchTs.Int64) > now {
			d.tickRedispatchCh = time.After(time.Second * time.Duration(uint64(row.NextLaunchTs.Int64)-now))
			returnToWaitingList = append(returnToWaitingList, row)
			break
		}

		if len(d.addedMap) >= row.settings.instance_count {
			returnToWaitingList = append(returnToWaitingList, row)
			break
		}

		if _, ok := d.addedJobData[row.JobData]; ok {
			if !row.reportedDup {
				log.Warningf("Duplicate job %s for class %s and location %s", row.JobData, d.className, row.location)
				row.reportedDup = true
			}

			returnToWaitingList = append(returnToWaitingList, row)
			continue
		}

		if row.method == METHOD_RUN && row.settings.ttl > 0 && now > row.created+uint64(row.settings.ttl) {
			if row.finish_count == 0 {
				log.Warningf("Job expired before being run even once: job %s for class %s and location %s", row.JobData, d.className, row.location)
			}
			toDeleteFromWaitingList = append(toDeleteFromWaitingList, row)
			continue
		}

		// do not try to dispatch next ones if selectHostname failed, and do not forget to return the row as well
		hostname, err := selectHostname(row.location, row.settings.location_type, d.rusage.cpu_usage, d.rusage.max_memory)
		if err != nil {
			logFailedLocation(row.settings, row.location, err.Error())
			d.tickRedispatchCh = time.After(time.Second)
			returnToWaitingList = append(returnToWaitingList, row)
			break
		} else {
			settings := row.settings
			if settings.location_type == LOCATION_TYPE_ANY && (settings.developer.String != "") && (settings.developer.String != "wwwrun") && ((now - uint64(settings.created)) <= DEVELOPER_CUSTOM_PATH_TIMEOUT) {
				hostname = DEVELOPER_DEBUG_HOSTNAME
			}
			log.Debugln("Selected ", hostname, " for ", row.location, " (loc_type=", settings.location_type, ")")
		}

		nullNow := sql.NullInt64{Valid: true, Int64: int64(now)}

		queueRow := &RunQueueEntry{
			ClassName:      d.className,
			timetable_id:   sql.NullInt64{Valid: true, Int64: int64(row.id)},
			generation_id:  row.generation_id,
			hostname:       hostname,
			hostname_idx:   getHostnameIdx(hostname),
			JobData:        row.JobData,
			method:         row.method,
			created:        nullNow,
			RunStatus:      RUN_STATUS_WAITING,
			waiting_ts:     nullNow,
			should_init_ts: nullNow,
			token:          row.token,
			retry_attempt:  row.retry_count,
			settings_id:    row.settings_id,
			settings:       row.settings,
		}

		newRqList = append(newRqList, queueRow)

		row.added_to_queue_ts.Valid = true
		row.added_to_queue_ts.Int64 = int64(now)

		d.addToAdded(row)
	}

	err := db.DoInLazyTransaction(func(tx *db.LazyTrx) error {
		return addToQueueAndDeleteExpired(tx, newRqList, toDeleteFromWaitingList)
	})

	if err == nil {
		for _, row := range toDeleteFromWaitingList {
			d.deletedIds[row.id] = DELETE_IDS_KEEP_GENERATIONS
		}

		// all rows can expire by TTL in this loop, so check if that is the case and notify the job generator about it
		if len(toDeleteFromWaitingList) > 0 {
			d.checkZero("redispatch")
		}

		if len(newRqList) > 0 {
			perHost := make(map[string][]*RunQueueEntry)
			for _, row := range newRqList {
				perHost[row.hostname] = append(perHost[row.hostname], row)
			}

			for hostname, rows := range perHost {
				notifyAboutNewRQRows(hostname, rows, false)
			}
		}

		return
	}

	d.tickRedispatchCh = time.After(time.Second)

	// restore internal structures back in case of error
	log.Warnf("Could not add to run queue for class %s and location %s to database: %s", d.className, d.location, err.Error())

	for _, rqRow := range newRqList {
		row, ok := d.addedMap[uint64(rqRow.timetable_id.Int64)]

		if ok {
			row.added_to_queue_ts.Valid = false
			row.added_to_queue_ts.Int64 = 0
			row.id = uint64(rqRow.timetable_id.Int64)
			d.removeFromAdded(row)
			d.addToWaiting(row)
		} else {
			log.Warnf("Internal consistency error: could not find row with timetable id %d", rqRow.timetable_id.Int64)
		}
	}
}
Example #20
func (server *Server) serveConnection(conn net.Conn) {
	log.Debugf("accepted connection: %s -> %s", conn.LocalAddr(), conn.RemoteAddr())
	defer func() {
		atomic.AddUint64(&server.Stats.ConnCur, ^uint64(0)) // decrements the value, ref: http://golang.org/pkg/sync/atomic/#AddUint64
		log.Debugf("closing connection: %s -> %s", conn.LocalAddr(), conn.RemoteAddr())

		if server.onDisconnect != nil {
			server.onDisconnect(RequestT{
				Server: server,
				Conn:   conn,
			})
		}

		conn.Close()
	}()

	atomic.AddUint64(&server.Stats.ConnCur, 1)
	atomic.AddUint64(&server.Stats.ConnTotal, 1)

	for {
		// need these as var here, since goto might jump over their definition below
		var (
			result      ResultT
			request     RequestT
			startTime   time.Time
			pinbaReq    pinba.Request
			requestTime time.Duration
		)

		msgid, msg, bytes_read, status, err := server.Codec.ReadRequest(server.Proto, conn)
		if err != nil {
			if status == ConnEOF {
				// connection was closed gracefully from client side
				break
			}
			if status == ConnOK {
				// misunderstanding on protobuf level
				result = server.Proto.ErrorGeneric(err)

				// FIXME(antoxa): remove this goto, it's annoying to code around
				goto write_response
			}
			// IO error or similar
			log.Infof("aborting connection: %v", err)
			break
		}

		// FIXME(antoxa): stats will not be incremented when ReadRequest returns err != nil and status == ConnOK
		atomic.AddUint64(&server.Stats.BytesRead, uint64(bytes_read))
		atomic.AddUint64(&server.Stats.Requests, 1)

		request = RequestT{
			Server:    server,
			Conn:      conn,
			RequestId: atomic.AddUint64(&globalRequestId, 1),
			MessageId: msgid,
			Message:   msg,
			PinbaReq:  &pinbaReq,
		}

		if msgid < MaxMsgID {
			atomic.AddUint64(&server.Stats.RequestsIdStat[msgid], 1)
		}

		startTime = time.Now()

		result = server.Proto.Dispatch(request, server.Handler)

		requestTime = time.Since(startTime)

		// log slow request if needed
		if server.slowRequestTime > 0 && server.slowRequestTime <= requestTime {
			if msg != nil {
				requestInfo := func() string {
					reqName := MapRequestIdToName(server.Proto, msgid)
					body, err := json.Marshal(msg)
					if err != nil {
						return fmt.Sprintf("%s %v", reqName, err)
					}
					return fmt.Sprintf("%s %s", reqName, body)
				}()

				log.Warnf("slow request (%d ms): %s; addr: %s <- %s", requestTime/time.Millisecond, requestInfo, conn.LocalAddr(), conn.RemoteAddr())
			}
		}

		server.sendToPinba(&pinbaReq, server.pinbaReqNames[msgid], requestTime)

	write_response:

		if result.Action == ACTION_RESULT {
			writeln, err := server.Codec.WriteResponse(server.Proto, conn, result.Message)
			if err != nil {
				// write error: can't recover
				log.Infof("unrecoverable error while writing: %v", err)
				break
			}
			atomic.AddUint64(&server.Stats.BytesWritten, uint64(writeln))
		}

		if result.Action == ACTION_CLOSE {
			break
		}
	}
}
Example #21
// process finished:
// 1. send an error to ev.errorCh, nil if all is ok
// 2. restore state upon failure
func (d *DispatcherData) processFinished(ev *FinishEvent) {
	var err error
	defer func() { ev.errorCh <- err }()

	row, ok := d.addedMap[ev.timetable_id]
	if !ok {

		if rowWaiting, ok := d.waitingMap[ev.timetable_id]; ok {
			log.Warningf("Got 'finished' event about waiting timetable_id: %d, class=%s, location=%s, row=%+v", ev.timetable_id, d.className, d.location, rowWaiting)
			err = fmt.Errorf("timetable id is waiting: %d, class=%s, location=%s", ev.timetable_id, d.className, d.location)
		} else {
			log.Warningf("Got 'finished' event about unknown timetable_id: %d, class=%s, location=%s", ev.timetable_id, d.className, d.location)
			err = fmt.Errorf("Unknown timetable id: %d, class=%s, location=%s", ev.timetable_id, d.className, d.location)
		}
		return
	}

	now := uint64(time.Now().Unix())

	// restore everything in case of error
	rowCopy := *row
	defer func() {
		if err != nil {
			log.Warnf("Restoring old tt row (error: %s) from %+v => %+v", err.Error(), row, rowCopy)
			*row = rowCopy
		} else {
			// TODO: update rusage estimation
		}
	}()

	if !ev.isInitial {
		if ev.success {
			row.finished_successfully = 1
			row.retry_count = 0
		} else {
			row.finished_successfully = 0
			row.retry_count++
		}

		row.finish_count++
	}

	row.NextLaunchTs.Valid = false
	row.NextLaunchTs.Int64 = 0

	var ttl uint32
	if row.method == METHOD_RUN {
		ttl = row.settings.ttl
	}

	// we should not delete entries that have ttl > 0 and have hit max retries, because the "repeat" field still applies
	shouldDelete := d.killRequest != nil ||
		(ttl == 0 && (ev.success || row.retry_count >= row.settings.max_retries)) ||
		(ttl > 0 && now > row.created+uint64(ttl))

	cb := func(tx *db.LazyTrx) error {
		var err error

		if ev.run_id != 0 {
			if ev.deleteRq {
				err = deleteFromRunQueue(tx, []uint64{ev.run_id}, ev.prevStatus)
			} else {
				err = errors.New("unexpected deleteRq value")
			}

			if err != nil {
				return err
			}
		}

		if shouldDelete {
			return deleteAddedFromTimetable(tx, []uint64{ev.timetable_id})
		}

		return logTTFinish(tx, row, ev.havePrevFinishCount, ev.prevFinishCount)
	}

	if shouldDelete {
		if err = db.DoInLazyTransaction(cb); err == nil {
			if row.id != ev.timetable_id {
				log.Warnf("Inconsistency of addedMap[%d] = row = %+v", ev.timetable_id, row)
				row.id = ev.timetable_id
			}
			d.removeFromAdded(row)
			d.deletedIds[ev.timetable_id] = DELETE_IDS_KEEP_GENERATIONS

			d.checkZero("processFinished")

			trigger(d.redispatchCh, "redispatch")
		} else {
			log.Warnf("could not process finished: %s", err.Error())
		}

		return
	}

	next_launch_ts := int64(now)

	if ev.success && row.added_to_queue_ts.Valid {
		next_launch_ts = row.added_to_queue_ts.Int64 + row.repeat.Int64
	} else if !ev.success {
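		// exponential backoff: the first two retries wait default_retry,
		// then 2x, 4x and, from the fifth failed attempt on, 8x default_retry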
		if row.retry_count < 3 {
			next_launch_ts += int64(row.default_retry)
		} else {
			e := row.retry_count - 2
			if e >= 3 {
				e = 3
			}

			next_launch_ts += (1 << e) * int64(row.default_retry)
		}
	}

	row.NextLaunchTs.Valid = true
	row.NextLaunchTs.Int64 = next_launch_ts

	row.finished_ts.Valid = false
	row.finished_ts.Int64 = 0

	row.added_to_queue_ts.Valid = false
	row.added_to_queue_ts.Int64 = 0

	if err = db.DoInLazyTransaction(cb); err == nil {
		d.removeFromAdded(row)
		d.addToWaiting(row)

		trigger(d.redispatchCh, "redispatch")
	}
}
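trigger is used above and in the kill path but is not itself listed; from its call sites it looks like a coalescing, non-blocking wakeup on a buffered channel, along these lines (the channel type and the role of the name argument are assumptions):

func trigger(ch chan bool, name string) {
	_ = name // presumably only used for debug logging
	select {
	case ch <- true:
	default:
		// a wakeup is already pending; coalesce instead of blocking
	}
}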