// checkDeleted verifies that every timetable row we hold in memory still
// exists in the DB and evicts the ones that have disappeared.
func (d *DispatcherData) checkDeleted() {
	checkIds := make([]uint64, 0, len(d.waitingMap)+len(d.addedMap))

	for _, row := range d.waitingMap {
		checkIds = append(checkIds, row.id)
	}

	for _, row := range d.addedMap {
		checkIds = append(checkIds, row.id)
	}

	existing, err := getExistingTTIds(checkIds)
	if err != nil {
		log.Warnf("Could not check existing tt ids for class=%s and location=%s: %s", d.className, d.location, err.Error())
		return
	}

	for _, row := range d.waitingMap {
		if !existing[row.id] {
			log.Warnf("Timetable row disappeared from waiting state in DB: %+v", row)
			d.removeFromWaiting(row)
		}
	}

	for _, row := range d.addedMap {
		if !existing[row.id] {
			log.Warnf("Timetable row disappeared from added state in DB: %+v", row)
			d.removeFromAdded(row)
		}
	}
}
func (d *LauncherData) addToMaps(row *RunQueueEntry) {
	if row.settings == nil {
		buf := make([]byte, 5000)
		n := runtime.Stack(buf, false)
		log.Warningf("Incorrect row in run queue (addToMaps), settings are invalid: %+v\ntrace:%s", row, buf[0:n])
		return
	}

	if m := d.getMapByStatus(row.RunStatus); m != nil {
		if LAUNCHER_DB_DEBUG {
			log.Printf("RQ row from db: id=%d, class=%s, job_data=%s, hostname=%s", row.Id, row.ClassName, row.JobData, row.hostname)
		}

		if d.allMap[row.Id] != nil {
			log.Warnf("Trying to add already added into run_queue (all map): %+v", row)
			return
		}

		if m[row.Id] != nil {
			log.Warnf("Trying to add already added into run_queue (own map): %+v", row)
			return
		}

		m[row.Id] = row
		d.allMap[row.Id] = row
	} else {
		log.Warnf("Broken run status: %+v", row)
	}
}
// processWaitingInInit polls phproxyd for every run that is still in INIT
// state and decides whether it finished, must be re-queued, or must be killed.
func (d *LauncherData) processWaitingInInit() {
	for run_id, cachedRow := range d.initMap {
		_, rawResp, err := d.call(&badoo_phproxyd.RequestCheck{Hash: proto.Uint64(run_id)})

		if shouldHaveFinished(cachedRow) {
			d.logIncorrectFinish(nil, cachedRow, RUN_STATUS_INIT)
			continue
		}

		if err != nil {
			continue
		}

		row, err := getRunningInfo(run_id)
		if err != nil {
			log.Warnf("Could not get running info from DB in method processWaitingInInit for hostname=%s, class=%s, run id=%d, err: %s", d.hostname, cachedRow.ClassName, run_id, err.Error())
			continue
		}

		if cachedRow.settings_id != row.settings_id {
			log.Warnf("Broken row in cache or db for id=%d, settings_id is different (cache=%d, db=%d)", run_id, cachedRow.settings_id, row.settings_id)
			continue
		}

		row.settings = cachedRow.settings

		switch resp := rawResp.(type) {
		case *badoo_phproxyd.ResponseCheck:
			d.logIncorrectFinish(resp, row, row.RunStatus)
		case *badoo_phproxyd.ResponseGeneric:
			result := -badoo_phproxyd.Errno(resp.GetErrorCode())

			if result == badoo_phproxyd.Errno_ERRNO_NOT_FOUND || result == badoo_phproxyd.Errno_ERRNO_ALREADY_RUNNING {
				// log intelligently when the server is so slow that it does not launch the script in 0.5 sec
				if time.Now().Unix()-INIT_TIMEOUT_SEC > row.should_init_ts.Int64 {
					action := KILL_ACTION_NO_ACTION
					if result == badoo_phproxyd.Errno_ERRNO_NOT_FOUND {
						action = KILL_ACTION_LOG_SCRIPT_FINISH_INIT
					}

					d.terminate(row, action)
				} else if result == badoo_phproxyd.Errno_ERRNO_NOT_FOUND {
					d.terminate(row, KILL_ACTION_SET_WAITING)
				}
			} else if result == badoo_phproxyd.Errno_ERRNO_FAILED_FINISHED {
				log.Warningf("Script %s finished with failure at %s", row.ClassName, d.hostname)
				d.logIncorrectFinish(nil, row, RUN_STATUS_INIT)
			} else if result == badoo_phproxyd.Errno_ERRNO_WAIT_FOR_FREE {
				log.Warningf("Waiting in init: lost results for %s at %s", row.ClassName, d.hostname)
				d.logIncorrectFinish(nil, row, RUN_STATUS_INIT)
			} else {
				log.Warningf("Unexpected return code %d (%s) for check request for class=%s, hostname=%s", result, result, row.ClassName, d.hostname)
			}
		default:
			log.Warningf("Received unexpected result from phproxyd at %s: type %T, result: %v", d.hostname, rawResp, rawResp)
		}
	}
}
// processAllJobs reconciles a fresh DB snapshot of timetable rows with the
// in-memory waiting/added maps: it promotes rows whose state advanced
// externally, records external finishes, and accepts brand new rows.
func (d *DispatcherData) processAllJobs(jobs []*TimetableEntry) {
	newJobs := make([]*TimetableEntry, 0)
	toProcessFinished := make([]*FinishEvent, 0)

	for _, row := range jobs {
		if el, ok := d.waitingMap[row.id]; ok {
			if row.finish_count >= el.finish_count && row.added_to_queue_ts.Valid && !row.finished_ts.Valid {
				d.removeFromWaiting(el)
				row.reportedDup = el.reportedDup
				d.addToAdded(row)

				log.Warnf("external waiting to added promotion OLD:%+v---NEW:%+v", el, row)
			} else if row.finish_count > el.finish_count && !row.added_to_queue_ts.Valid {
				d.removeFromWaiting(el)
				d.addToWaiting(row)

				log.Warnf("external finish_count promotion OLD:%+v---NEW:%+v", el, row)
			}
			continue
		}

		if el, ok := d.addedMap[row.id]; ok {
			// An external job notification can update the timetable by incrementing its finish count.
			// We do not care about any other modifications made by the script except the finished_successfully field.
			if row.finish_count > el.finish_count {
				log.Debugf("External finish for tt row %+v", row)
				toProcessFinished = append(toProcessFinished, &FinishEvent{
					timetable_id:        row.id,
					success:             row.finished_successfully != 0,
					havePrevFinishCount: true,
					prevFinishCount:     row.finish_count,
					errorCh:             make(chan error, 1),
				})
			}
		} else if d.deletedIds[row.id] == 0 {
			log.Debugf("External timetable row %+v", row)
			newJobs = append(newJobs, row)
		}
	}

	if len(newJobs) > 0 {
		d.acceptNewJobs(newJobs)
	}

	for _, el := range toProcessFinished {
		log.Debugf("processFinished: %+v", el)
		d.processFinished(el)
	}

	// deletedIds entries survive a fixed number of snapshots; drop them once the refcount hits zero
	for ttId, refCount := range d.deletedIds {
		if refCount--; refCount <= 0 {
			delete(d.deletedIds, ttId)
		} else {
			d.deletedIds[ttId] = refCount
		}
	}
}
func (d *DispatcherData) addToAdded(row *TimetableEntry) {
	if el, ok := d.addedMap[row.id]; ok {
		log.Warnf("attempted to add already added OLD:%+v NEW:%+v", el, row)
		d.printTrace()
		return
	}

	if el, ok := d.waitingMap[row.id]; ok {
		log.Warnf("attempted to add already waiting OLD:%+v NEW:%+v", el, row)
		d.printTrace()
		return
	}

	d.addedMap[row.id] = row
	d.addedJobData[row.JobData] = row.id
}
func (d *DispatcherData) addToWaiting(row *TimetableEntry) {
	if el, ok := d.addedMap[row.id]; ok {
		log.Warnf("attempted to wait already added OLD:%+v NEW:%+v", el, row)
		d.printTrace()
		return
	}

	if el, ok := d.waitingMap[row.id]; ok {
		log.Warnf("attempted to wait already waiting OLD:%+v NEW:%+v", el, row)
		d.printTrace()
		return
	}

	heap.Push(&d.waitingList, row)
	d.waitingMap[row.id] = row
}
func (d *DispatcherData) removeFromWaiting(row *TimetableEntry) {
	if _, ok := d.waitingMap[row.id]; !ok {
		log.Warnf("attempted to unwait nonexistent ROW:%+v", row)
		d.printTrace()
		return
	}

	if el, ok := d.addedMap[row.id]; ok {
		log.Warnf("attempted to unwait added OLD:%+v NEW:%+v", el, row)
		d.printTrace()
		return
	}

	heap.Remove(&d.waitingList, row.index)
	delete(d.waitingMap, row.id)
}
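// NOTE: heap.Push in addToWaiting and heap.Remove here require d.waitingList
// to implement container/heap's Interface and to keep every entry's index
// field up to date in Swap. The actual implementation is outside this
// section; a minimal sketch of what it presumably looks like (ordering by
// NextLaunchTs is an assumption based on how redispatch() pops entries):
//
//	type TimetableEntryList []*TimetableEntry
//
//	func (l TimetableEntryList) Len() int { return len(l) }
//
//	func (l TimetableEntryList) Less(i, j int) bool {
//		return l[i].NextLaunchTs.Int64 < l[j].NextLaunchTs.Int64
//	}
//
//	func (l TimetableEntryList) Swap(i, j int) {
//		l[i], l[j] = l[j], l[i]
//		l[i].index = i // keeps heap.Remove(&d.waitingList, row.index) valid
//		l[j].index = j
//	}
//
//	func (l *TimetableEntryList) Push(x interface{}) {
//		row := x.(*TimetableEntry)
//		row.index = len(*l)
//		*l = append(*l, row)
//	}
//
//	func (l *TimetableEntryList) Pop() interface{} {
//		old := *l
//		n := len(old)
//		row := old[n-1]
//		*l = old[:n-1]
//		return row
//	}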
func (d *DispatcherData) removeFromAdded(row *TimetableEntry) {
	if _, ok := d.addedMap[row.id]; !ok {
		log.Warnf("attempted to unadd nonexistent ROW:%+v", row)
		d.printTrace()
		return
	}

	if el, ok := d.waitingMap[row.id]; ok {
		log.Warnf("attempted to unadd waiting OLD:%+v NEW:%+v", el, row)
		d.printTrace()
		return
	}

	delete(d.addedMap, row.id)
	delete(d.addedJobData, row.JobData)
}
// acceptWaiting re-reads the given run queue rows from the DB (skipping ones
// we already track), attaches their script settings and adds them to the maps.
func (d *LauncherData) acceptWaiting(jobs []*RunQueueEntry) {
	idsToSelect := make([]uint64, 0)
	settingsMap := make(map[uint64]*ScriptSettings)

	for _, row := range jobs {
		if row.settings == nil {
			log.Warningf("Incorrect row in run queue (waitingCh), settings are invalid: %+v", row)
			continue
		}

		if d.allMap[row.Id] != nil {
			continue
		}

		settingsMap[row.Id] = row.settings
		idsToSelect = append(idsToSelect, row.Id)
	}

	rqs, err := getRunningInfos(idsToSelect)
	if err != nil {
		log.Warnf("acceptWaiting could not select run_queue entries: %s", err.Error())
		return
	}

	for _, row := range rqs {
		row.settings = settingsMap[row.Id]
		d.addToMaps(row)
	}

	d.processWaiting()
}
func (d *DispatcherData) processNewJobsEv(ev *NewJobs) {
	ids := make([]uint64, 0, len(ev.rows))
	settingsMap := make(map[uint64]*ScriptSettings)

	for _, row := range ev.rows {
		if row.settings == nil {
			log.Warningf("Incorrect row in timetable (processNewJobsEv), settings are invalid: %+v", row)
			continue
		}

		if d.waitingMap[row.id] != nil || d.addedMap[row.id] != nil {
			continue
		}

		settingsMap[row.id] = row.settings
		ids = append(ids, row.id)
	}

	rows, err := selectTimetableByIds(ids)
	if err != nil {
		log.Warnf("could not select tt_entries for ids:%+v err:%s", ids, err.Error())
		return
	}

	for _, row := range rows {
		row.settings = settingsMap[row.id]
	}

	d.acceptNewJobs(rows)
}
// WriteLogsThread runs forever: it appends FinishResult events from the
// rqhLog channel to the given file as JSON lines, batching bursts into a
// single flush, and reopens the file every 10 seconds so it can be rotated.
func WriteLogsThread(filename string) {
	log.Infof("Started write logs thread to file=%s", filename)

	reopenTick := time.Tick(time.Second * 10)

	fp, err := os.OpenFile(filename, os.O_CREATE|os.O_WRONLY|os.O_APPEND, 0666)
	var wr *bufio.Writer
	if err != nil {
		log.Errorf("Could not open %s: %s", filename, err.Error())
	} else {
		wr = bufio.NewWriterSize(fp, 65536)
	}

	for {
		select {
		case <-reopenTick:
			if fp != nil {
				fp.Close()
			}

			fp, err = os.OpenFile(filename, os.O_CREATE|os.O_WRONLY|os.O_APPEND, 0666)
			if err != nil {
				log.Warnf("Could not reopen %s: %s", filename, err.Error())
				wr = nil
				fp = nil
			} else {
				wr = bufio.NewWriterSize(fp, 65536)
			}
		case ev := <-rqhLog:
			// drain whatever else has accumulated in the channel so the whole batch is written in one flush
			l := len(rqhLog)
			evs := make([]*FinishResult, 0, l+1)
			evs = append(evs, ev)

			for i := 0; i < l; i++ {
				evs = append(evs, <-rqhLog)
			}

			if wr != nil {
				encoder := json.NewEncoder(wr)
				for _, e := range evs {
					if err = encoder.Encode(e); err != nil {
						log.Errorf("Could not write to %s: %s", filename, err.Error())
					}
				}

				if err = wr.Flush(); err != nil {
					log.Errorf("Could not flush contents to %s: %s", filename, err.Error())
				}
			} else {
				log.Errorf("Failed to write %d events to rqh log because file %s could not be opened", len(evs), filename)
			}
		}
	}
}
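// WriteLogsThread is intended to run as a single long-lived goroutine;
// producers just send into the rqhLog channel. A minimal usage sketch (the
// channel capacity and file path here are illustrative, not from this code):
//
//	var rqhLog = make(chan *FinishResult, 1000)
//
//	func main() {
//		go WriteLogsThread("/local/logs/scripts/rqh.log")
//		// ... handlers then do: rqhLog <- &FinishResult{...}
//	}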
func FinalizeRestartWithTimeout() {
	currentRestart.Child.Kill()

	child_state, err := currentRestart.Child.Wait()
	log.Warnf("child %d failed to start and was killed, state: %v, err: %v", currentRestart.Child.Pid, child_state, err)

	currentRestart.MovePidfileBack()
	currentRestart = nil
}
func (d *LauncherData) delFromMaps(id uint64) {
	row := d.allMap[id]
	if row == nil {
		// row is nil here, so log the id rather than the (empty) row
		log.Warnf("Trying to delete nonexistent id=%d from run_queue", id)
		return
	}

	if m := d.getMapByStatus(row.RunStatus); m != nil {
		if m[id] == nil {
			log.Warnf("Run queue entry missing in its map when deleting: %+v", row)
			return
		}
		delete(m, id)
	} else {
		log.Warnf("Broken run status when deleting: %+v", row)
		return
	}

	delete(d.allMap, id)
	d.deletedIds[id] = DELETE_IDS_KEEP_GENERATIONS
}
func getHostname() string {
	hostname, err := os.Hostname()
	if err != nil {
		log.Warnf("can't get local hostname: %v", err)
		return "-"
	}

	cname, err := net.LookupCNAME(hostname)
	if err != nil {
		log.Warnf("can't get FQDN for %s (using as is): %v", hostname, err)
		return hostname
	}

	// LookupCNAME returns the canonical name with a trailing dot; strip it
	if cname[len(cname)-1] == '.' {
		cname = cname[:len(cname)-1]
	}

	return cname
}
func FinalizeRestartWithError(proc_status RestartProcStatus) {
	if proc_status.Err != nil {
		log.Errorf("couldn't collect state for child %d, %v", currentRestart.Child.Pid, proc_status.Err)
	}

	log.Warnf("child %d failed to start, collected %v", currentRestart.Child.Pid, proc_status.State)

	// not waiting for the child, so we have to release it
	currentRestart.Child.Release()
	currentRestart.MovePidfileBack()
	currentRestart = nil
}
func (server *Server) sendToPinba(req *pinba.Request, script_name string, duration time.Duration) {
	if server.pinbaSender == nil {
		return
	}

	req.ScriptName = script_name
	req.RequestCount = 1
	req.RequestTime = duration

	err := server.pinbaSender.Send(req)
	if err != nil {
		// this is pretty unlikely, therefore don't care about performance inside this 'if'
		shouldLog := func() bool {
			opErr, ok := err.(*net.OpError)
			if !ok {
				return true
			}

			syscallErr, ok := opErr.Err.(*os.SyscallError)
			if !ok {
				return true
			}

			errno, ok := syscallErr.Err.(syscall.Errno)
			if !ok {
				return true
			}

			if errno == syscall.ECONNREFUSED {
				// NOTE(antoxa): see CT-2394
				// just ignore connection refused errors; those happen when pinba is configured but unavailable (since we use UDP)
				// we kinda lose on error reporting that nothing is logged to pinba, but in practice the message is just annoying
				return false
			}

			return true
		}()

		if shouldLog {
			log.Warnf("sendToPinba() failed: %v", err)
		}
	}
}
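// On Go 1.13+ the type-assertion ladder above can be collapsed with errors.Is,
// which walks the net.OpError -> os.SyscallError -> syscall.Errno unwrap
// chain automatically. An equivalent hedged sketch (not what this file
// currently uses):
//
//	if err := server.pinbaSender.Send(req); err != nil {
//		if !errors.Is(err, syscall.ECONNREFUSED) {
//			log.Warnf("sendToPinba() failed: %v", err)
//		}
//	}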
// startKilling sends a kill request to every dispatch thread of the given
// class that is not already being killed; the send is non-blocking, so a busy
// dispatcher is simply retried on a later call.
func startKilling(className string) {
	killerThreads.Lock()
	defer killerThreads.Unlock()

	dispatchThreads.Lock()
	defer dispatchThreads.Unlock()

	locs, ok := dispatchThreads.v[className]
	if !ok {
		return
	}

	killMap := killerThreads.v[className]
	if killMap == nil {
		killMap = make(map[string]bool)
		killerThreads.v[className] = killMap
	}

	for loc, dt := range locs {
		if killMap[loc] {
			continue
		}

		req := &KillRequest{
			ResCh: make(chan error, 1),
		}

		log.Printf("Sending kill request to class=%s, location=%s", className, loc)

		select {
		case dt.killRequestCh <- req:
			killMap[loc] = true
		default:
			log.Warnf("Could not send kill request to class=%s, location=%s, kill channel was busy", className, loc)
		}
	}
}
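// The select/default above is the standard non-blocking send idiom: with a
// default branch, a send that would block falls through immediately instead
// of stalling while both mutexes are held. The dispatcher side is expected to
// answer on req.ResCh; a hedged sketch of a caller waiting for that answer
// (ResCh has capacity 1, so the reply never blocks the dispatcher):
//
//	req := &KillRequest{ResCh: make(chan error, 1)}
//	dt.killRequestCh <- req
//	if err := <-req.ResCh; err != nil {
//		log.Warnf("kill request failed: %v", err)
//	}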
// EventLoop starts all configured servers and runs until a signal tells the
// daemon to exit. Call this when you want to start your servers and stuff.
func EventLoop(ports []Port) {
	defer log.Debug("exiting")

	initPhaseDuration = time.Since(startupTime)

	daemonConfig := config.GetDaemonConfig()

	// service-stats ports
	ports = append(ports, GpbPort("service-stats-gpb", stats_ctx, badoo_service.Gpbrpc))
	ports = append(ports, JsonPort("service-stats-gpb/json", stats_ctx, badoo_service.Gpbrpc))

	// build a map of ports and do some sanity checks
	ph := make(map[string]*Port)
	for i := 0; i < len(ports); i++ {
		p := &ports[i]
		ph[p.Name] = p

		// json and gpb ports should have the same context,
		// so try to warn the user about passing plain values in (as that makes a copy)
		if reflect.ValueOf(p.Handler).Kind() != reflect.Ptr {
			log.Infof("port[%d].Handler should be a pointer (you want gpbs and json to use the same context, right?) (now: %T)", i, p.Handler)
		}
	}

	getRestartSocket := func(rcd *RestartChildData, portName, portAddr string) (*RestartSocket, *os.File) {
		if rcd == nil {
			return nil, nil
		}

		restartSocket, exists := rcd.GpbrpcSockets[portName]
		if !exists {
			return nil, nil
		}

		restartFile := os.NewFile(restartSocket.Fd, "")

		if restartSocket.Address != portAddr {
			return nil, restartFile
		}

		return &restartSocket, restartFile
	}

	// start 'em all
	for _, lcf := range daemonConfig.GetListen() {
		portName, portAddr := lcf.GetProto(), lcf.GetAddress()
		port := ph[portName]

		if port == nil {
			log.Warnf("ignoring unknown port: %s at %s", portName, portAddr)
			continue
		}

		if port.IsStarted {
			log.Warnf("ignoring double startup for port: %s at %s", portName, portAddr)
		}

		listener, err := func() (listener net.Listener, err error) { // it's important that this is a function, see the defer inside
			restartSocket, restartFile := getRestartSocket(restartData, portName, portAddr)

			// this whole fd/file affair is very inconvenient:
			// when getRestartSocket() returns an fd, we can't close it yet, since it may still be used by FileListener
			defer restartFile.Close()

			if restartSocket == nil {
				listener, err = net.Listen("tcp", portAddr)
				if err != nil {
					log.Errorf("listen failed for server %s at %s: %s", portName, portAddr, err)
					return
				}
				log.Infof("port %s bound to address %s", portName, listener.Addr())
			} else {
				listener, err = net.FileListener(restartFile) // this dup()-s
				if err != nil {
					log.Errorf("failed to grab parent fd %d for %s at %s: %s", restartSocket.Fd, portName, portAddr, err)
					return
				}
				log.Infof("port %s bound to address %s (parent fd: %d)", portName, listener.Addr(), restartSocket.Fd)
			}
			return
		}()

		if err != nil {
			os.Exit(1)
		}

		// enable pinba only for ports that explicitly request it
		ps := func() gpbrpc.PinbaSender {
			if !lcf.GetPinbaEnabled() {
				return nil // explicit nil here
			}

			if pinbaSender == nil {
				log.Warnf("pinba is not configured, but pinba_enabled IS set for port %s: %s", portName, portAddr)
				return nil // explicit nil here
			}

			log.Infof("pinba configured for port %s:%s -> %s", portName, portAddr, pinbaSender.Address)
			return pinbaSender
		}()

		// slow request log time
		slowRequestTime := time.Duration(daemonConfig.GetSlowRequestMs()) * time.Millisecond

		srv := &Server{
			Name:    lcf.GetProto(),
			Address: lcf.GetAddress(),
			Server:  gpbrpc.NewServer(listener, port.Proto, port.Codec, port.Handler, ps, slowRequestTime),
		}
		go srv.Server.Serve()

		port.IsStarted = true
		StartedServers[port.Name] = srv // save it for laterz
	}

	// kill the parent if this is a child of a graceful restart
	if restartData != nil {
		syscall.Kill(restartData.PPid, syscall.SIGQUIT)
	}

	log.Infof("entering event loop")

	exitMethod := wait_for_signals()

	if exitMethod == EXIT_GRACEFULLY {
		// wait for established connections to close and then die
		// FIXME: should stop servers from accepting new connections here!
		const ATTEMPTS_PER_SEC = 2
		maxAttempts := daemonConfig.GetParentWaitTimeout() * ATTEMPTS_PER_SEC

		for i := uint32(0); i < maxAttempts; i++ {
			for _, srv := range StartedServers {
				currConn := atomic.LoadUint64(&srv.Server.Stats.ConnCur)
				if currConn > 0 {
					log.Debugf("%s still has %d connections", srv.Name, currConn)
					time.Sleep(time.Second / ATTEMPTS_PER_SEC)
				}
			}
		}
	} else {
		// do nothing for EXIT_IMMEDIATELY
	}

	// doing cleanups here
	// XXX: can this be moved to a defer at the start of this function?
	if pidfile != nil {
		pidfile.CloseAndRemove()
	}
}
// redispatch drains the waiting list: entries whose launch time has come are
// moved to the run queue (respecting instance_count and duplicate job data),
// expired entries are deleted, and everything else is returned to waiting.
func (d *DispatcherData) redispatch() {
	returnToWaitingList := make([]*TimetableEntry, 0)
	defer func() {
		for _, row := range returnToWaitingList {
			d.addToWaiting(row)
		}
	}()

	now := uint64(time.Now().Unix())

	newRqList := make([]*RunQueueEntry, 0)
	toDeleteFromWaitingList := make([]*TimetableEntry, 0)

	for l := d.waitingList.Len(); l > 0; l-- {
		row := heap.Pop(&d.waitingList).(*TimetableEntry)
		delete(d.waitingMap, row.id)

		if d.killRequest != nil {
			toDeleteFromWaitingList = append(toDeleteFromWaitingList, row)
			continue
		}

		if uint64(row.NextLaunchTs.Int64) > now {
			d.tickRedispatchCh = time.After(time.Second * time.Duration(uint64(row.NextLaunchTs.Int64)-now))
			returnToWaitingList = append(returnToWaitingList, row)
			break
		}

		if len(d.addedMap) >= row.settings.instance_count {
			returnToWaitingList = append(returnToWaitingList, row)
			break
		}

		if _, ok := d.addedJobData[row.JobData]; ok {
			if !row.reportedDup {
				log.Warningf("Duplicate job %s for class %s and location %s", row.JobData, d.className, row.location)
				row.reportedDup = true
			}
			returnToWaitingList = append(returnToWaitingList, row)
			continue
		}

		if row.method == METHOD_RUN && row.settings.ttl > 0 && now > row.created+uint64(row.settings.ttl) {
			if row.finish_count == 0 {
				log.Warningf("Job expired before being run even once: job %s for class %s and location %s", row.JobData, d.className, row.location)
			}
			toDeleteFromWaitingList = append(toDeleteFromWaitingList, row)
			continue
		}

		// do not try to dispatch the next entries if selectHostname failed, and do not forget to return the row as well
		hostname, err := selectHostname(row.location, row.settings.location_type, d.rusage.cpu_usage, d.rusage.max_memory)
		if err != nil {
			logFailedLocation(row.settings, row.location, err.Error())
			d.tickRedispatchCh = time.After(time.Second)
			returnToWaitingList = append(returnToWaitingList, row)
			break
		} else {
			settings := row.settings
			if settings.location_type == LOCATION_TYPE_ANY && (settings.developer.String != "") && (settings.developer.String != "wwwrun") && ((now - uint64(settings.created)) <= DEVELOPER_CUSTOM_PATH_TIMEOUT) {
				hostname = DEVELOPER_DEBUG_HOSTNAME
			}
			log.Debugln("Selected ", hostname, " for ", row.location, " (loc_type=", settings.location_type, ")")
		}

		nullNow := sql.NullInt64{Valid: true, Int64: int64(now)}

		queueRow := &RunQueueEntry{
			ClassName:      d.className,
			timetable_id:   sql.NullInt64{Valid: true, Int64: int64(row.id)},
			generation_id:  row.generation_id,
			hostname:       hostname,
			hostname_idx:   getHostnameIdx(hostname),
			JobData:        row.JobData,
			method:         row.method,
			created:        nullNow,
			RunStatus:      RUN_STATUS_WAITING,
			waiting_ts:     nullNow,
			should_init_ts: nullNow,
			token:          row.token,
			retry_attempt:  row.retry_count,
			settings_id:    row.settings_id,
			settings:       row.settings,
		}

		newRqList = append(newRqList, queueRow)

		row.added_to_queue_ts.Valid = true
		row.added_to_queue_ts.Int64 = int64(now)

		d.addToAdded(row)
	}

	err := db.DoInLazyTransaction(func(tx *db.LazyTrx) error {
		return addToQueueAndDeleteExpired(tx, newRqList, toDeleteFromWaitingList)
	})

	if err == nil {
		for _, row := range toDeleteFromWaitingList {
			d.deletedIds[row.id] = DELETE_IDS_KEEP_GENERATIONS
		}

		// all rows can expire by TTL in this loop, so check if that is the case and notify the job generator about it
		if len(toDeleteFromWaitingList) > 0 {
			d.checkZero("redispatch")
		}

		if len(newRqList) > 0 {
			perHost := make(map[string][]*RunQueueEntry)
			for _, row := range newRqList {
				perHost[row.hostname] = append(perHost[row.hostname], row)
			}

			for hostname, rows := range perHost {
				notifyAboutNewRQRows(hostname, rows, false)
			}
		}

		return
	}

	d.tickRedispatchCh = time.After(time.Second)

	// restore internal structures in case of error
	log.Warnf("Could not add to run queue for class %s and location %s to database: %s", d.className, d.location, err.Error())

	for _, rqRow := range newRqList {
		row, ok := d.addedMap[uint64(rqRow.timetable_id.Int64)]
		if ok {
			row.added_to_queue_ts.Valid = false
			row.added_to_queue_ts.Int64 = 0
			row.id = uint64(rqRow.timetable_id.Int64)

			d.removeFromAdded(row)
			d.addToWaiting(row)
		} else {
			log.Warnf("Internal consistency error: could not find row with timetable id %d", rqRow.timetable_id.Int64)
		}
	}
}
func (server *Server) serveConnection(conn net.Conn) {
	log.Debugf("accepted connection: %s -> %s", conn.LocalAddr(), conn.RemoteAddr())

	defer func() {
		atomic.AddUint64(&server.Stats.ConnCur, ^uint64(0)) // decrements the value, ref: http://golang.org/pkg/sync/atomic/#AddUint64
		log.Debugf("closing connection: %s -> %s", conn.LocalAddr(), conn.RemoteAddr())

		if server.onDisconnect != nil {
			server.onDisconnect(RequestT{
				Server: server,
				Conn:   conn,
			})
		}

		conn.Close()
	}()

	atomic.AddUint64(&server.Stats.ConnCur, 1)
	atomic.AddUint64(&server.Stats.ConnTotal, 1)

	for {
		// need these as vars here, since the goto below might jump over their definition
		var (
			result      ResultT
			request     RequestT
			startTime   time.Time
			pinbaReq    pinba.Request
			requestTime time.Duration
		)

		msgid, msg, bytes_read, status, err := server.Codec.ReadRequest(server.Proto, conn)
		if err != nil {
			if status == ConnEOF {
				// connection was closed gracefully from the client side
				break
			}

			if status == ConnOK {
				// misunderstanding on the protobuf level
				result = server.Proto.ErrorGeneric(err)
				// FIXME(antoxa): remove this goto, it's annoying to code around
				goto write_response
			}

			// IO error or similar
			log.Infof("aborting connection: %v", err)
			break
		}

		// FIXME(antoxa): stats will not be incremented when ReadRequest returns err != nil and status == ConnOK
		atomic.AddUint64(&server.Stats.BytesRead, uint64(bytes_read))
		atomic.AddUint64(&server.Stats.Requests, 1)

		request = RequestT{
			Server:    server,
			Conn:      conn,
			RequestId: atomic.AddUint64(&globalRequestId, 1),
			MessageId: msgid,
			Message:   msg,
			PinbaReq:  &pinbaReq,
		}

		if msgid < MaxMsgID {
			atomic.AddUint64(&server.Stats.RequestsIdStat[msgid], 1)
		}

		startTime = time.Now()
		result = server.Proto.Dispatch(request, server.Handler)
		requestTime = time.Since(startTime)

		// log slow request if needed
		if server.slowRequestTime > 0 && server.slowRequestTime <= requestTime {
			if msg != nil {
				requestInfo := func() string {
					reqName := MapRequestIdToName(server.Proto, msgid)
					body, err := json.Marshal(msg)
					if err != nil {
						return fmt.Sprintf("%s %v", reqName, err)
					}
					return fmt.Sprintf("%s %s", reqName, body)
				}()

				log.Warnf("slow request (%d ms): %s; addr: %s <- %s", requestTime/time.Millisecond, requestInfo, conn.LocalAddr(), conn.RemoteAddr())
			}
		}

		server.sendToPinba(&pinbaReq, server.pinbaReqNames[msgid], requestTime)

	write_response:
		if result.Action == ACTION_RESULT {
			writeln, err := server.Codec.WriteResponse(server.Proto, conn, result.Message)
			if err != nil {
				// write error: can't recover
				log.Infof("unrecoverable error while writing: %v", err)
				break
			}
			atomic.AddUint64(&server.Stats.BytesWritten, uint64(writeln))
		}

		if result.Action == ACTION_CLOSE {
			break
		}
	}
}
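// The ^uint64(0) used in serveConnection's defer is the sync/atomic idiom for
// decrementing an unsigned counter: per the AddUint64 docs, subtracting a
// positive constant c is done by adding its two's complement. In general:
//
//	atomic.AddUint64(&x, ^uint64(0))   // x-- (c == 1)
//	atomic.AddUint64(&x, ^uint64(c-1)) // x -= c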
// processFinished handles a 'finished' event:
// 1. it sends an error to ev.errorCh, nil if all is ok
// 2. it restores state upon failure
func (d *DispatcherData) processFinished(ev *FinishEvent) {
	var err error
	defer func() { ev.errorCh <- err }()

	row, ok := d.addedMap[ev.timetable_id]
	if !ok {
		if rowWaiting, ok := d.waitingMap[ev.timetable_id]; ok {
			log.Warningf("Got 'finished' event about waiting timetable_id: %d, class=%s, location=%s, row=%+v", ev.timetable_id, d.className, d.location, rowWaiting)
			err = fmt.Errorf("timetable id is waiting: %d, class=%s, location=%s", ev.timetable_id, d.className, d.location)
		} else {
			log.Warningf("Got 'finished' event about unknown timetable_id: %d, class=%s, location=%s", ev.timetable_id, d.className, d.location)
			err = fmt.Errorf("unknown timetable id: %d, class=%s, location=%s", ev.timetable_id, d.className, d.location)
		}
		return
	}

	now := uint64(time.Now().Unix())

	// restore everything in case of error
	rowCopy := *row
	defer func() {
		if err != nil {
			log.Warnf("Restoring old tt row (error: %s) from %+v => %+v", err.Error(), row, rowCopy)
			*row = rowCopy
		} else {
			// TODO: update rusage estimation
		}
	}()

	if !ev.isInitial {
		if ev.success {
			row.finished_successfully = 1
		} else {
			row.finished_successfully = 0
		}

		row.finish_count++

		if !ev.success {
			row.retry_count++
		} else {
			row.retry_count = 0
		}
	}

	row.NextLaunchTs.Valid = false
	row.NextLaunchTs.Int64 = 0

	var ttl uint32
	if row.method == METHOD_RUN {
		ttl = row.settings.ttl
	}

	// we should not delete entries that have ttl > 0 and have hit max retries, because the "repeat" field can still reschedule them
	shouldDelete := d.killRequest != nil ||
		(ttl == 0 && (ev.success || row.retry_count >= row.settings.max_retries)) ||
		(ttl > 0 && now > row.created+uint64(ttl))

	cb := func(tx *db.LazyTrx) error {
		var err error

		if ev.run_id != 0 {
			if ev.deleteRq {
				err = deleteFromRunQueue(tx, []uint64{ev.run_id}, ev.prevStatus)
			} else {
				err = errors.New("unexpected deleteRq value")
			}

			if err != nil {
				return err
			}
		}

		if shouldDelete {
			return deleteAddedFromTimetable(tx, []uint64{ev.timetable_id})
		}

		return logTTFinish(tx, row, ev.havePrevFinishCount, ev.prevFinishCount)
	}

	if shouldDelete {
		if err = db.DoInLazyTransaction(cb); err == nil {
			if row.id != ev.timetable_id {
				log.Warnf("Inconsistency of addedMap[%d] = row = %+v", ev.timetable_id, row)
				row.id = ev.timetable_id
			}

			d.removeFromAdded(row)
			d.deletedIds[ev.timetable_id] = DELETE_IDS_KEEP_GENERATIONS

			d.checkZero("processFinished")
			trigger(d.redispatchCh, "redispatch")
		} else {
			log.Warnf("could not process finished: %s", err.Error())
		}

		return
	}

	next_launch_ts := int64(now)

	if ev.success && row.added_to_queue_ts.Valid {
		next_launch_ts = row.added_to_queue_ts.Int64 + row.repeat.Int64
	} else if !ev.success {
		if row.retry_count < 3 {
			next_launch_ts += int64(row.default_retry)
		} else {
			e := row.retry_count - 2
			if e >= 3 {
				e = 3
			}
			next_launch_ts += (1 << e) * int64(row.default_retry)
		}
	}

	row.NextLaunchTs.Valid = true
	row.NextLaunchTs.Int64 = next_launch_ts

	row.finished_ts.Valid = false
	row.finished_ts.Int64 = 0

	row.added_to_queue_ts.Valid = false
	row.added_to_queue_ts.Int64 = 0

	if err = db.DoInLazyTransaction(cb); err == nil {
		d.removeFromAdded(row)
		d.addToWaiting(row)
		trigger(d.redispatchCh, "redispatch")
	}
}
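// The failure branch in processFinished above implements a capped exponential
// backoff. A hedged, self-contained helper reproducing the same arithmetic
// (illustration only, not part of the original code):

func nextRetryDelay(retryCount uint32, defaultRetry int64) int64 {
	if retryCount < 3 {
		return defaultRetry
	}
	e := retryCount - 2 // retry_count 3 -> e=1, 4 -> e=2, 5+ capped below
	if e >= 3 {
		e = 3
	}
	return (1 << e) * defaultRetry
}

// With defaultRetry = 60 (seconds, illustrative value): retries 1-2 wait 60s,
// retry 3 waits 120s, retry 4 waits 240s, and retry 5 onwards is capped at 480s.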