func handleError(err *error, logStats *SQLQueryStats, queryServiceStats *QueryServiceStats) { if x := recover(); x != nil { terr, ok := x.(*TabletError) if !ok { log.Errorf("Uncaught panic:\n%v\n%s", x, tb.Stack(4)) *err = NewTabletError(ErrFail, "%v: uncaught panic", x) queryServiceStats.InternalErrors.Add("Panic", 1) return } *err = terr terr.RecordStats(queryServiceStats) if terr.ErrorType == ErrRetry { // Retry errors are too spammy return } if terr.ErrorType == ErrTxPoolFull { logTxPoolFull.Errorf("%v", terr) } else { log.Errorf("%v", terr) } } if logStats != nil { logStats.Error = *err logStats.Send() } }
// HandlePanic should be called using 'defer' in the RPC code that executes the command. func HandlePanic(component string, err *error) { if x := recover(); x != nil { // gRPC 0.13 chokes when you return a streaming error that contains newlines. *err = fmt.Errorf("uncaught %v panic: %v, %s", component, x, strings.Replace(string(tb.Stack(4)), "\n", ";", -1)) } }
func (rci *RowcacheInvalidator) run() { for { // We wrap this code in a func so we can catch all panics. // If an error is returned, we log it, wait 1 second, and retry. // This loop can only be stopped by calling Close. err := func() (inner error) { defer func() { if x := recover(); x != nil { inner = fmt.Errorf("%v: uncaught panic:\n%s", x, tb.Stack(4)) } }() rp, err := rci.mysqld.BinlogInfo(rci.GroupId.Get()) if err != nil { return err } return rci.evs.Stream(rp.MasterLogFile, int64(rp.MasterLogPosition), func(reply *blproto.StreamEvent) error { rci.processEvent(reply) return nil }) }() if err == nil { break } log.Errorf("binlog.ServeUpdateStream returned err '%v', retrying in 1 second.", err.Error()) internalErrors.Add("Invalidation", 1) time.Sleep(1 * time.Second) } log.Infof("Rowcache invalidator stopped") }
func handlePanic(err *error) { if x := recover(); x != nil { log.Errorf("Uncaught panic:\n%v\n%s", x, tb.Stack(4)) *err = fmt.Errorf("uncaught panic: %v", x) internalErrors.Add("Panic", 1) } }
// handleExecError handles panics during query execution and sets // the supplied error return value. func handleExecError(query *proto.Query, err *error, logStats *SQLQueryStats) { if logStats != nil { logStats.Send() } if x := recover(); x != nil { terr, ok := x.(*TabletError) if !ok { log.Errorf("Uncaught panic for %v:\n%v\n%s", query, x, tb.Stack(4)) *err = NewTabletError(FAIL, "%v: uncaught panic for %v", x, query) internalErrors.Add("Panic", 1) return } *err = terr terr.RecordStats() // suppress these errors in logs if terr.ErrorType == RETRY || terr.ErrorType == TX_POOL_FULL || terr.SqlError == mysql.DUP_ENTRY { return } if terr.ErrorType == FATAL { log.Errorf("%v: %v", terr, query) } else { log.Warningf("%v: %v", terr, query) } } }
// handleExecError handles panics during query execution and sets // the supplied error return value. func (sq *SqlQuery) handleExecError(query *proto.Query, err *error, logStats *SQLQueryStats) { if x := recover(); x != nil { terr, ok := x.(*TabletError) if !ok { log.Errorf("Uncaught panic for %v:\n%v\n%s", query, x, tb.Stack(4)) *err = NewTabletError(ErrFail, "%v: uncaught panic for %v", x, query) sq.qe.queryServiceStats.InternalErrors.Add("Panic", 1) return } if sq.config.TerseErrors && terr.SqlError != 0 { *err = fmt.Errorf("%s(errno %d) during query: %s", terr.Prefix(), terr.SqlError, query.Sql) } else { *err = terr } terr.RecordStats(sq.qe.queryServiceStats) // suppress these errors in logs if terr.ErrorType == ErrRetry || terr.ErrorType == ErrTxPoolFull || terr.SqlError == mysql.ErrDupEntry { return } if terr.ErrorType == ErrFatal { log.Errorf("%v: %v", terr, query) } else { log.Warningf("%v: %v", terr, query) } } if logStats != nil { logStats.Error = *err logStats.Send() } }
// HandlePanic recovers from panics, and logs / increment counters func (vtg *VTGate) HandlePanic(err *error) { if x := recover(); x != nil { log.Errorf("Uncaught panic:\n%v\n%s", x, tb.Stack(4)) *err = fmt.Errorf("uncaught panic: %v, vtgate: %v", x, servenv.ListeningURL.String()) internalErrors.Add("Panic", 1) } }
func handleError(err *error, logStats *SQLQueryStats, queryServiceStats *QueryServiceStats) { if x := recover(); x != nil { terr, ok := x.(*TabletError) if !ok { log.Errorf("Uncaught panic:\n%v\n%s", x, tb.Stack(4)) *err = NewTabletError(ErrFail, vtrpc.ErrorCode_UNKNOWN_ERROR, "%v: uncaught panic", x) queryServiceStats.InternalErrors.Add("Panic", 1) return } *err = terr terr.RecordStats(queryServiceStats) switch terr.ErrorType { case ErrRetry: // Retry errors are too spammy return case ErrTxPoolFull: logTxPoolFull.Errorf("%v", terr) default: switch terr.SqlError { // MySQL deadlock errors are (usually) due to client behavior, not server // behavior, and therefore logged at the INFO level. case mysql.ErrLockWaitTimeout, mysql.ErrLockDeadlock: log.Infof("%v", terr) default: log.Errorf("%v", terr) } } } if logStats != nil { logStats.Error = *err logStats.Send() } }
func (rci *RowcacheInvalidator) run(ctx *sync2.ServiceContext) error { for { evs := binlog.NewEventStreamer(rci.dbname, rci.mysqld, rci.Position(), rci.processEvent) // We wrap this code in a func so we can catch all panics. // If an error is returned, we log it, wait 1 second, and retry. // This loop can only be stopped by calling Close. err := func() (inner error) { defer func() { if x := recover(); x != nil { inner = fmt.Errorf("%v: uncaught panic:\n%s", x, tb.Stack(4)) } }() return evs.Stream(ctx) }() if err == nil || !ctx.IsRunning() { break } if IsConnErr(err) { rci.checker.CheckMySQL() } log.Errorf("binlog.ServeUpdateStream returned err '%v', retrying in 1 second.", err.Error()) rci.qe.queryServiceStats.InternalErrors.Add("Invalidation", 1) time.Sleep(1 * time.Second) } log.Infof("Rowcache invalidator stopped") return nil }
func logError() { if x := recover(); x != nil { terr, ok := x.(*TabletError) if !ok { log.Errorf("Uncaught panic:\n%v\n%s", x, tb.Stack(4)) internalErrors.Add("Panic", 1) return } log.Errorf("%v", terr) } }
func handleInvalidationError(event *blproto.StreamEvent) { if x := recover(); x != nil { terr, ok := x.(*TabletError) if !ok { log.Errorf("Uncaught panic for %+v:\n%v\n%s", event, x, tb.Stack(4)) internalErrors.Add("Panic", 1) return } log.Errorf("%v: %+v", terr, event) internalErrors.Add("Invalidation", 1) } }
func handleInvalidationError(request interface{}) { if x := recover(); x != nil { terr, ok := x.(*TabletError) if !ok { log.Errorf("Uncaught panic for %v:\n%v\n%s", request, x, tb.Stack(4)) internalErrors.Add("Panic", 1) return } log.Errorf("%s: %v", terr.Message, request) internalErrors.Add("Invalidation", 1) } }
func (rci *RowcacheInvalidator) handleInvalidationError(event *binlogdatapb.StreamEvent) { if x := recover(); x != nil { terr, ok := x.(*TabletError) if !ok { log.Errorf("Uncaught panic for %+v:\n%v\n%s", event, x, tb.Stack(4)) rci.qe.queryServiceStats.InternalErrors.Add("Panic", 1) return } log.Errorf("%v: %+v", terr, event) rci.qe.queryServiceStats.InternalErrors.Add("Invalidation", 1) } }
func (tsv *TabletServer) handleExecErrorNoPanic(query *proto.Query, err interface{}, logStats *LogStats) error { var terr *TabletError defer func() { if logStats != nil { logStats.Error = terr } }() terr, ok := err.(*TabletError) if !ok { log.Errorf("Uncaught panic for %v:\n%v\n%s", query, err, tb.Stack(4)) tsv.qe.queryServiceStats.InternalErrors.Add("Panic", 1) terr = NewTabletError(ErrFail, vtrpc.ErrorCode_UNKNOWN_ERROR, "%v: uncaught panic for %v", err, query) return terr } var myError error if tsv.config.TerseErrors && terr.SQLError != 0 && len(query.BindVariables) != 0 { myError = &TabletError{ ErrorType: terr.ErrorType, SQLError: terr.SQLError, ErrorCode: terr.ErrorCode, Message: fmt.Sprintf("(errno %d) during query: %s", terr.SQLError, query.Sql), } } else { myError = terr } terr.RecordStats(tsv.qe.queryServiceStats) logMethod := log.Warningf // Suppress or demote some errors in logs switch terr.ErrorType { case ErrRetry, ErrTxPoolFull: return myError case ErrFatal: logMethod = log.Errorf } // We want to suppress/demote some MySQL error codes (regardless of the ErrorType) switch terr.SQLError { case mysql.ErrDupEntry: return myError case mysql.ErrLockWaitTimeout, mysql.ErrLockDeadlock, mysql.ErrDataTooLong, mysql.ErrDataOutOfRange, mysql.ErrBadNullError: logMethod = log.Infof case 0: if strings.Contains(terr.Error(), "Row count exceeded") { logMethod = log.Infof } } logMethod("%v: %v", terr, query) return myError }
func logError(queryServiceStats *QueryServiceStats) { if x := recover(); x != nil { terr, ok := x.(*TabletError) if !ok { log.Errorf("Uncaught panic:\n%v\n%s", x, tb.Stack(4)) queryServiceStats.InternalErrors.Add("Panic", 1) return } if terr.ErrorCode == vtrpcpb.ErrorCode_RESOURCE_EXHAUSTED { logTxPoolFull.Errorf("%v", terr) } else { log.Errorf("%v", terr) } } }
func handleExecError(query *proto.Query, err *error, logStats *sqlQueryStats) { if logStats != nil { logStats.Send() } if x := recover(); x != nil { terr, ok := x.(*TabletError) if !ok { log.Errorf("Uncaught panic for %v:\n%v\n%s", query, x, tb.Stack(4)) *err = NewTabletError(FAIL, "%v: uncaught panic for %v", x, query) errorStats.Add("Panic", 1) return } *err = terr terr.RecordStats() if terr.ErrorType == RETRY || terr.SqlError == DUPLICATE_KEY { // suppress these errors in logs return } log.Errorf("%s: %v", terr.Message, query) } }
func handleError(err *error, logStats *sqlQueryStats) { if logStats != nil { logStats.Send() } if x := recover(); x != nil { terr, ok := x.(*TabletError) if !ok { log.Errorf("Uncaught panic:\n%v\n%s", x, tb.Stack(4)) *err = NewTabletError(FAIL, "%v: uncaught panic", x) internalErrors.Add("Panic", 1) return } *err = terr terr.RecordStats() if terr.ErrorType == RETRY { // Retry errors are too spammy return } log.Errorf("%s", terr.Message) } }
func handleError(err *error, logStats *LogStats, queryServiceStats *QueryServiceStats) { var terr *TabletError defer func() { if logStats != nil { logStats.Error = terr logStats.Send() } }() if x := recover(); x != nil { terr, ok := x.(*TabletError) if !ok { log.Errorf("Uncaught panic:\n%v\n%s", x, tb.Stack(4)) terr = NewTabletError(vtrpcpb.ErrorCode_UNKNOWN_ERROR, "%v: uncaught panic", x) *err = terr queryServiceStats.InternalErrors.Add("Panic", 1) return } *err = terr terr.RecordStats(queryServiceStats) switch terr.ErrorCode { case vtrpcpb.ErrorCode_QUERY_NOT_SERVED: // Retry errors are too spammy return case vtrpcpb.ErrorCode_RESOURCE_EXHAUSTED: logTxPoolFull.Errorf("%v", terr) default: switch terr.SQLError { // MySQL deadlock errors are (usually) due to client behavior, not server // behavior, and therefore logged at the INFO level. case mysql.ErrLockWaitTimeout, mysql.ErrLockDeadlock, mysql.ErrDataTooLong, mysql.ErrDataOutOfRange, mysql.ErrBadNullError: log.Infof("%v", terr) default: log.Errorf("%v", terr) } } } }
func (sq *SqlQuery) handleExecErrorNoPanic(query *proto.Query, err interface{}, logStats *SQLQueryStats) error { terr, ok := err.(*TabletError) if !ok { log.Errorf("Uncaught panic for %v:\n%v\n%s", query, err, tb.Stack(4)) sq.qe.queryServiceStats.InternalErrors.Add("Panic", 1) return NewTabletError(ErrFail, "%v: uncaught panic for %v", err, query) } var myError error if sq.config.TerseErrors && terr.SqlError != 0 && len(query.BindVariables) != 0 { myError = fmt.Errorf("%s(errno %d) during query: %s", terr.Prefix(), terr.SqlError, query.Sql) } else { myError = terr } terr.RecordStats(sq.qe.queryServiceStats) logMethod := log.Warningf // Suppress or demote some errors in logs switch terr.ErrorType { case ErrRetry, ErrTxPoolFull: return myError case ErrFatal: logMethod = log.Errorf } // We want to suppress/demote some MySQL error codes (regardless of the ErrorType) switch terr.SqlError { case mysql.ErrDupEntry: return myError case mysql.ErrLockWaitTimeout, mysql.ErrLockDeadlock, mysql.ErrDataTooLong: logMethod = log.Infof case 0: if strings.Contains(terr.Error(), "Row count exceeded") { logMethod = log.Infof } } logMethod("%v: %v", terr, query) return myError }
func (sq *SqlQuery) handleExecErrorNoPanic(query *proto.Query, err interface{}, logStats *SQLQueryStats) error { terr, ok := err.(*TabletError) if !ok { log.Errorf("Uncaught panic for %v:\n%v\n%s", query, err, tb.Stack(4)) sq.qe.queryServiceStats.InternalErrors.Add("Panic", 1) return NewTabletError(ErrFail, "%v: uncaught panic for %v", err, query) } var myError error if sq.config.TerseErrors && terr.SqlError != 0 { myError = fmt.Errorf("%s(errno %d) during query: %s", terr.Prefix(), terr.SqlError, query.Sql) } else { myError = terr } terr.RecordStats(sq.qe.queryServiceStats) // suppress these errors in logs if terr.ErrorType == ErrRetry || terr.ErrorType == ErrTxPoolFull || terr.SqlError == mysql.ErrDupEntry { return myError } if terr.ErrorType == ErrFatal { log.Errorf("%v: %v", terr, query) } log.Warningf("%v: %v", terr, query) return myError }
// Iteration is a single iteration for the player: get the current status, // try to play, and plays until interrupted, or until an error occurs. func (bpc *BinlogPlayerController) Iteration() (err error) { defer func() { if x := recover(); x != nil { log.Errorf("%v: caught panic: %v\n%s", bpc, x, tb.Stack(4)) err = fmt.Errorf("panic: %v", x) } }() // Apply any special settings necessary for playback of binlogs. // We do it on every iteration to be sure, in case MySQL was restarted. if err := bpc.mysqld.EnableBinlogPlayback(); err != nil { // We failed to apply the required settings, so we shouldn't keep going. return err } // create the db connection, connect it vtClient := bpc.vtClientFactory() if err := vtClient.Connect(); err != nil { return fmt.Errorf("can't connect to database: %v", err) } defer vtClient.Close() // Read the start position startPosition, flags, err := binlogplayer.ReadStartPosition(vtClient, bpc.sourceShard.Uid) if err != nil { return fmt.Errorf("can't read startPosition: %v", err) } // if we shouldn't start, we just error out and try again later if strings.Index(flags, binlogplayer.BlpFlagDontStart) != -1 { return fmt.Errorf("not starting because flag '%v' is set", binlogplayer.BlpFlagDontStart) } // wait for the endpoint set (usefull for the first run at least, fast for next runs) if err := discovery.WaitForEndPoints(bpc.healthCheck, bpc.cell, bpc.sourceShard.Keyspace, bpc.sourceShard.Shard, []topodatapb.TabletType{topodatapb.TabletType_REPLICA}); err != nil { return fmt.Errorf("error waiting for endpoints for %v %v %v: %v", bpc.cell, bpc.sourceShard.String(), topodatapb.TabletType_REPLICA, err) } // Find the server list from the health check addrs := bpc.healthCheck.GetEndPointStatsFromTarget(bpc.sourceShard.Keyspace, bpc.sourceShard.Shard, topodatapb.TabletType_REPLICA) if len(addrs) == 0 { return fmt.Errorf("can't find any source tablet for %v %v %v", bpc.cell, bpc.sourceShard.String(), topodatapb.TabletType_REPLICA) } newServerIndex := 
rand.Intn(len(addrs)) endPoint := addrs[newServerIndex].EndPoint // save our current server bpc.playerMutex.Lock() bpc.sourceTablet = &topodatapb.TabletAlias{ Cell: bpc.cell, Uid: endPoint.Uid, } bpc.lastError = nil bpc.playerMutex.Unlock() // check which kind of replication we're doing, tables or keyrange if len(bpc.sourceShard.Tables) > 0 { // tables, first resolve wildcards tables, err := mysqlctl.ResolveTables(bpc.mysqld, bpc.dbName, bpc.sourceShard.Tables) if err != nil { return fmt.Errorf("failed to resolve table names: %v", err) } // tables, just get them player, err := binlogplayer.NewBinlogPlayerTables(vtClient, endPoint, tables, bpc.sourceShard.Uid, startPosition, bpc.stopPosition, bpc.binlogPlayerStats) if err != nil { return fmt.Errorf("NewBinlogPlayerTables failed: %v", err) } return player.ApplyBinlogEvents(bpc.ctx) } // the data we have to replicate is the intersection of the // source keyrange and our keyrange overlap, err := key.KeyRangesOverlap(bpc.sourceShard.KeyRange, bpc.keyRange) if err != nil { return fmt.Errorf("Source shard %v doesn't overlap destination shard %v", bpc.sourceShard.KeyRange, bpc.keyRange) } player, err := binlogplayer.NewBinlogPlayerKeyRange(vtClient, endPoint, overlap, bpc.sourceShard.Uid, startPosition, bpc.stopPosition, bpc.binlogPlayerStats) if err != nil { return fmt.Errorf("NewBinlogPlayerKeyRange failed: %v", err) } return player.ApplyBinlogEvents(bpc.ctx) }
// setAndStartWorker will set the current worker. // We always log to both memory logger (for display on the web) and // console logger (for records / display of command line worker). func (wi *Instance) setAndStartWorker(wrk Worker, wr *wrangler.Wrangler) (chan struct{}, error) { wi.currentWorkerMutex.Lock() defer wi.currentWorkerMutex.Unlock() if wi.currentContext != nil { return nil, vterrors.FromError(vtrpcpb.ErrorCode_TRANSIENT_ERROR, fmt.Errorf("A worker job is already in progress: %v", wi.currentWorker)) } if wi.currentWorker != nil { // During the grace period, we answer with a retryable error. const gracePeriod = 1 * time.Minute gracePeriodEnd := time.Now().Add(gracePeriod) if wi.lastRunStopTime.Before(gracePeriodEnd) { return nil, vterrors.FromError(vtrpcpb.ErrorCode_TRANSIENT_ERROR, fmt.Errorf("A worker job was recently stopped (%f seconds ago): %v", time.Now().Sub(wi.lastRunStopTime).Seconds(), wi.currentWorker)) } // QUERY_NOT_SERVED = FailedPrecondition => manual resolution required. return nil, vterrors.FromError(vtrpcpb.ErrorCode_QUERY_NOT_SERVED, fmt.Errorf("The worker job was stopped %.1f minutes ago, but not reset. You have to reset it manually. Job: %v", time.Now().Sub(wi.lastRunStopTime).Minutes(), wi.currentWorker)) } wi.currentWorker = wrk wi.currentMemoryLogger = logutil.NewMemoryLogger() wi.currentContext, wi.currentCancelFunc = context.WithCancel(wi.backgroundContext) wi.lastRunError = nil wi.lastRunStopTime = time.Unix(0, 0) done := make(chan struct{}) wranglerLogger := wr.Logger() if wr == wi.wr { // If it's the default wrangler, do not reuse its logger because it may have been set before. // Resuing it would result into an endless recursion. 
wranglerLogger = logutil.NewConsoleLogger() } wr.SetLogger(logutil.NewTeeLogger(wi.currentMemoryLogger, wranglerLogger)) // one go function runs the worker, changes state when done go func() { log.Infof("Starting worker...") var err error // Catch all panics and always save the execution state at the end. defer func() { // The recovery code is a copy of servenv.HandlePanic(). if x := recover(); x != nil { log.Errorf("uncaught vtworker panic: %v\n%s", x, tb.Stack(4)) err = fmt.Errorf("uncaught vtworker panic: %v", x) } wi.currentWorkerMutex.Lock() wi.currentContext = nil wi.currentCancelFunc = nil wi.lastRunError = err wi.lastRunStopTime = time.Now() wi.currentWorkerMutex.Unlock() close(done) }() // run will take a long time err = wrk.Run(wi.currentContext) }() return done, nil }
// HandlePanic is part of the UpdateStream interface func (updateStream *UpdateStreamImpl) HandlePanic(err *error) { if x := recover(); x != nil { log.Errorf("Uncaught panic:\n%v\n%s", x, tb.Stack(4)) *err = fmt.Errorf("uncaught panic: %v", x) } }
// logError is meant to be invoked with 'defer'. It recovers from a panic and
// logs the panic value together with a stack trace; nothing is propagated to
// the caller.
func logError() {
	x := recover()
	if x == nil {
		return
	}
	// A panic value is not necessarily an error (e.g. panic("msg")), so it
	// is formatted with %v rather than asserted to error: a failed assertion
	// here would panic again from inside the recovery handler. For error
	// values %v prints the same text as Error().
	log.Errorf("%v at\n%s", x, tb.Stack(4))
}
// HandlePanic is meant to be invoked with 'defer'. It recovers from a panic,
// logs it with a stack trace and stores an error describing the panic in
// *err.
func (c *errorClient) HandlePanic(err *error) {
	x := recover()
	if x == nil {
		return
	}
	log.Errorf("Uncaught panic:\n%v\n%s", x, tb.Stack(4))
	*err = fmt.Errorf("uncaught panic: %v", x)
}
func (tsv *TabletServer) handleExecErrorNoPanic(sql string, bindVariables map[string]interface{}, err interface{}, logStats *LogStats) error { var terr *TabletError defer func() { if logStats != nil { logStats.Error = terr } }() terr, ok := err.(*TabletError) if !ok { log.Errorf("Uncaught panic for %v:\n%v\n%s", querytypes.QueryAsString(sql, bindVariables), err, tb.Stack(4)) tsv.qe.queryServiceStats.InternalErrors.Add("Panic", 1) terr = NewTabletError(vtrpcpb.ErrorCode_UNKNOWN_ERROR, "%v: uncaught panic for %v", err, querytypes.QueryAsString(sql, bindVariables)) return terr } var myError error if tsv.config.TerseErrors && terr.SQLError != 0 && len(bindVariables) != 0 { myError = &TabletError{ SQLError: terr.SQLError, SQLState: terr.SQLState, ErrorCode: terr.ErrorCode, Message: fmt.Sprintf("(errno %d) (sqlstate %s) during query: %s", terr.SQLError, terr.SQLState, sql), } } else { myError = terr } terr.RecordStats(tsv.qe.queryServiceStats) logMethod := log.Warningf // Suppress or demote some errors in logs switch terr.ErrorCode { case vtrpcpb.ErrorCode_QUERY_NOT_SERVED, vtrpcpb.ErrorCode_RESOURCE_EXHAUSTED: return myError case vtrpcpb.ErrorCode_INTERNAL_ERROR: logMethod = log.Errorf case vtrpcpb.ErrorCode_NOT_IN_TX: // keep as warning default: // default is when we think the query itself is // problematic. This doesn't indicate a system or // component wide degradation, so we log to INFO. logMethod = log.Infof } // We want to suppress/demote some MySQL error codes // (regardless of the ErrorType) switch terr.SQLError { case mysql.ErrDupEntry: return myError case mysql.ErrLockWaitTimeout, mysql.ErrLockDeadlock, mysql.ErrDataTooLong, mysql.ErrDataOutOfRange, mysql.ErrBadNullError: logMethod = log.Infof case 0: if strings.Contains(terr.Error(), "Row count exceeded") { logMethod = log.Infof } } logMethod("%v: %v", terr, querytypes.QueryAsString(sql, bindVariables)) return myError }
// rpcWrapper handles all the logic for rpc calls. func (agent *ActionAgent) rpcWrapper(ctx context.Context, name string, args, reply interface{}, verbose bool, f func() error, lock, runAfterAction bool) (err error) { defer func() { if x := recover(); x != nil { log.Errorf("TabletManager.%v(%v) on %v panic: %v\n%s", name, args, topo.TabletAliasString(agent.TabletAlias), x, tb.Stack(4)) err = fmt.Errorf("caught panic during %v: %v", name, x) } }() from := "" ci, ok := callinfo.FromContext(ctx) if ok { from = ci.Text() } if lock { beforeLock := time.Now() agent.actionMutex.Lock() defer agent.actionMutex.Unlock() if time.Now().Sub(beforeLock) > rpcTimeout { return fmt.Errorf("server timeout for " + name) } } if err = f(); err != nil { log.Warningf("TabletManager.%v(%v)(on %v from %v) error: %v", name, args, topo.TabletAliasString(agent.TabletAlias), from, err.Error()) return fmt.Errorf("TabletManager.%v on %v error: %v", name, topo.TabletAliasString(agent.TabletAlias), err) } if verbose { log.Infof("TabletManager.%v(%v)(on %v from %v): %#v", name, args, topo.TabletAliasString(agent.TabletAlias), from, reply) } if runAfterAction { err = agent.refreshTablet(ctx, "RPC("+name+")") } return }
// Iteration is a single iteration for the player: get the current status, // try to play, and plays until interrupted, or until an error occurs. func (bpc *BinlogPlayerController) Iteration() (err error) { defer func() { if x := recover(); x != nil { log.Errorf("%v: caught panic: %v\n%s", bpc, x, tb.Stack(4)) err = fmt.Errorf("panic: %v", x) } }() // Apply any special settings necessary for playback of binlogs. // We do it on every iteration to be sure, in case MySQL was restarted. if err := bpc.mysqld.EnableBinlogPlayback(); err != nil { // We failed to apply the required settings, so we shouldn't keep going. return err } // create the db connection, connect it vtClient := bpc.vtClientFactory() if err := vtClient.Connect(); err != nil { return fmt.Errorf("can't connect to database: %v", err) } defer vtClient.Close() // Read the start position startPosition, flags, err := binlogplayer.ReadStartPosition(vtClient, bpc.sourceShard.Uid) if err != nil { return fmt.Errorf("can't read startPosition: %v", err) } // if we shouldn't start, we just error out and try again later if strings.Index(flags, binlogplayer.BlpFlagDontStart) != -1 { return fmt.Errorf("not starting because flag '%v' is set", binlogplayer.BlpFlagDontStart) } // Find the server list for the source shard in our cell addrs, _, err := bpc.ts.GetEndPoints(bpc.ctx, bpc.cell, bpc.sourceShard.Keyspace, bpc.sourceShard.Shard, pb.TabletType_REPLICA) if err != nil { // If this calls fails because the context was canceled, // we need to return nil. 
select { case <-bpc.ctx.Done(): if bpc.ctx.Err() == context.Canceled { return nil } default: } return fmt.Errorf("can't find any source tablet for %v %v %v: %v", bpc.cell, bpc.sourceShard.String(), topo.TYPE_REPLICA, err) } if len(addrs.Entries) == 0 { return fmt.Errorf("empty source tablet list for %v %v %v", bpc.cell, bpc.sourceShard.String(), topo.TYPE_REPLICA) } newServerIndex := rand.Intn(len(addrs.Entries)) endPoint := addrs.Entries[newServerIndex] // save our current server bpc.playerMutex.Lock() bpc.sourceTablet = &pb.TabletAlias{ Cell: bpc.cell, Uid: addrs.Entries[newServerIndex].Uid, } bpc.lastError = nil bpc.playerMutex.Unlock() // check which kind of replication we're doing, tables or keyrange if len(bpc.sourceShard.Tables) > 0 { // tables, first resolve wildcards tables, err := mysqlctl.ResolveTables(bpc.mysqld, bpc.dbName, bpc.sourceShard.Tables) if err != nil { return fmt.Errorf("failed to resolve table names: %v", err) } // tables, just get them player := binlogplayer.NewBinlogPlayerTables(vtClient, endPoint, tables, startPosition, bpc.stopPosition, bpc.binlogPlayerStats) return player.ApplyBinlogEvents(bpc.ctx) } // the data we have to replicate is the intersection of the // source keyrange and our keyrange overlap, err := key.KeyRangesOverlap(bpc.sourceShard.KeyRange, bpc.keyRange) if err != nil { return fmt.Errorf("Source shard %v doesn't overlap destination shard %v", bpc.sourceShard.KeyRange, bpc.keyRange) } player := binlogplayer.NewBinlogPlayerKeyRange(vtClient, endPoint, bpc.keyspaceIDType, overlap, startPosition, bpc.stopPosition, bpc.binlogPlayerStats) return player.ApplyBinlogEvents(bpc.ctx) }
// HandlePanic should be called using 'defer' in the RPC code that executes the command. func HandlePanic(component string, err *error) { if x := recover(); x != nil { *err = fmt.Errorf("uncaught %v panic: %v\n%s", component, x, tb.Stack(4)) } }
// Iteration is a single iteration for the player: get the current status, // try to play, and plays until interrupted, or until an error occurs. func (bpc *BinlogPlayerController) Iteration() (err error) { defer func() { if x := recover(); x != nil { log.Errorf("%v: caught panic: %v\n%s", bpc, x, tb.Stack(4)) err = fmt.Errorf("panic: %v", x) } }() // Check if the context is still good. select { case <-bpc.ctx.Done(): if bpc.ctx.Err() == context.Canceled { // We were stopped. Break out of Loop(). return nil } return fmt.Errorf("giving up since the context is done: %v", bpc.ctx.Err()) default: } // Apply any special settings necessary for playback of binlogs. // We do it on every iteration to be sure, in case MySQL was restarted. if err := bpc.mysqld.EnableBinlogPlayback(); err != nil { // We failed to apply the required settings, so we shouldn't keep going. return err } // create the db connection, connect it vtClient := bpc.vtClientFactory() if err := vtClient.Connect(); err != nil { return fmt.Errorf("can't connect to database: %v", err) } defer vtClient.Close() // Read the start position startPosition, flags, err := binlogplayer.ReadStartPosition(vtClient, bpc.sourceShard.Uid) if err != nil { return fmt.Errorf("can't read startPosition: %v", err) } // if we shouldn't start, we just error out and try again later if strings.Index(flags, binlogplayer.BlpFlagDontStart) != -1 { return fmt.Errorf("not starting because flag '%v' is set", binlogplayer.BlpFlagDontStart) } // wait for the tablet set (usefull for the first run at least, fast for next runs) if err := bpc.tabletStatsCache.WaitForTablets(bpc.ctx, bpc.cell, bpc.sourceShard.Keyspace, bpc.sourceShard.Shard, []topodatapb.TabletType{topodatapb.TabletType_REPLICA}); err != nil { return fmt.Errorf("error waiting for tablets for %v %v %v: %v", bpc.cell, bpc.sourceShard.String(), topodatapb.TabletType_REPLICA, err) } // Find the server list from the health check. 
// Note: We cannot use tsc.GetHealthyTabletStats() here because it does // not return non-serving tablets. We must include non-serving tablets because // REPLICA source tablets may not be serving anymore because their traffic was // already migrated to the destination shards. addrs := discovery.RemoveUnhealthyTablets(bpc.tabletStatsCache.GetTabletStats(bpc.sourceShard.Keyspace, bpc.sourceShard.Shard, topodatapb.TabletType_REPLICA)) if len(addrs) == 0 { return fmt.Errorf("can't find any healthy source tablet for %v %v %v", bpc.cell, bpc.sourceShard.String(), topodatapb.TabletType_REPLICA) } newServerIndex := rand.Intn(len(addrs)) tablet := addrs[newServerIndex].Tablet // save our current server bpc.playerMutex.Lock() bpc.sourceTablet = tablet.Alias bpc.lastError = nil bpc.playerMutex.Unlock() // check which kind of replication we're doing, tables or keyrange if len(bpc.sourceShard.Tables) > 0 { // tables, first resolve wildcards tables, err := mysqlctl.ResolveTables(bpc.mysqld, bpc.dbName, bpc.sourceShard.Tables) if err != nil { return fmt.Errorf("failed to resolve table names: %v", err) } // tables, just get them player, err := binlogplayer.NewBinlogPlayerTables(vtClient, tablet, tables, bpc.sourceShard.Uid, startPosition, bpc.stopPosition, bpc.binlogPlayerStats) if err != nil { return fmt.Errorf("NewBinlogPlayerTables failed: %v", err) } return player.ApplyBinlogEvents(bpc.ctx) } // the data we have to replicate is the intersection of the // source keyrange and our keyrange overlap, err := key.KeyRangesOverlap(bpc.sourceShard.KeyRange, bpc.keyRange) if err != nil { return fmt.Errorf("Source shard %v doesn't overlap destination shard %v", bpc.sourceShard.KeyRange, bpc.keyRange) } player, err := binlogplayer.NewBinlogPlayerKeyRange(vtClient, tablet, overlap, bpc.sourceShard.Uid, startPosition, bpc.stopPosition, bpc.binlogPlayerStats) if err != nil { return fmt.Errorf("NewBinlogPlayerKeyRange failed: %v", err) } return player.ApplyBinlogEvents(bpc.ctx) }