// getKeyspaceShards fetches the SrvKeyspace for the given keyspace, follows a
// ServedFrom redirect for tabletType if one exists, and returns the shard
// references of the matching partition.
func getKeyspaceShards(ctx context.Context, topoServ SrvTopoServer, cell, keyspace string, tabletType pb.TabletType) (string, *pb.SrvKeyspace, []*pb.ShardReference, error) {
	srvKeyspace, err := topoServ.GetSrvKeyspace(ctx, cell, keyspace)
	if err != nil {
		return "", nil, nil, vterrors.NewVitessError(
			vtrpc.ErrorCode_INTERNAL_ERROR, err,
			"keyspace %v fetch error: %v", keyspace, err,
		)
	}

	// Check if the keyspace has been redirected for this tabletType.
	for _, sf := range srvKeyspace.ServedFrom {
		if sf.TabletType == tabletType {
			keyspace = sf.Keyspace
			srvKeyspace, err = topoServ.GetSrvKeyspace(ctx, cell, keyspace)
			if err != nil {
				return "", nil, nil, vterrors.NewVitessError(
					vtrpc.ErrorCode_INTERNAL_ERROR, err,
					"keyspace %v fetch error: %v", keyspace, err,
				)
			}
		}
	}

	partition := topoproto.SrvKeyspaceGetPartition(srvKeyspace, tabletType)
	if partition == nil {
		return "", nil, nil, vterrors.NewVitessError(
			vtrpc.ErrorCode_INTERNAL_ERROR, err,
			"No partition found for tabletType %v in keyspace %v",
			strings.ToLower(tabletType.String()), keyspace,
		)
	}
	return keyspace, srvKeyspace, partition.ShardReferences, nil
}
// getKeyspaceShards is the equivalent lookup against the map-based
// topo.SrvKeyspace representation, where ServedFrom and Partitions are keyed
// by the converted topo.TabletType.
func getKeyspaceShards(ctx context.Context, topoServ SrvTopoServer, cell, keyspace string, tabletType pb.TabletType) (string, *topo.SrvKeyspace, []topo.ShardReference, error) {
	srvKeyspace, err := topoServ.GetSrvKeyspace(ctx, cell, keyspace)
	if err != nil {
		return "", nil, nil, vterrors.NewVitessError(
			vtrpc.ErrorCode_INTERNAL_ERROR, err,
			"keyspace %v fetch error: %v", keyspace, err,
		)
	}

	// Check if the keyspace has been redirected for this tabletType.
	tt := topo.ProtoToTabletType(tabletType)
	if servedFrom, ok := srvKeyspace.ServedFrom[tt]; ok {
		keyspace = servedFrom
		srvKeyspace, err = topoServ.GetSrvKeyspace(ctx, cell, keyspace)
		if err != nil {
			return "", nil, nil, vterrors.NewVitessError(
				vtrpc.ErrorCode_INTERNAL_ERROR, err,
				"keyspace %v fetch error: %v", keyspace, err,
			)
		}
	}

	partition, ok := srvKeyspace.Partitions[tt]
	if !ok {
		return "", nil, nil, vterrors.NewVitessError(
			vtrpc.ErrorCode_INTERNAL_ERROR, err,
			"No partition found for tabletType %v in keyspace %v",
			strings.ToLower(tabletType.String()), keyspace,
		)
	}
	return keyspace, srvKeyspace, partition.ShardReferences, nil
}
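// Illustrative only: a minimal sketch of how a caller might use
// getKeyspaceShards to turn a keyspace name into the list of shard names that
// serve the requested tablet type. The wrapper name shardNamesForKeyspace and
// its shape are assumptions for the example, not part of the original code.
func shardNamesForKeyspace(ctx context.Context, topoServ SrvTopoServer, cell, keyspace string, tabletType pb.TabletType) ([]string, error) {
	// The first return value is the possibly redirected keyspace; callers that
	// build routing plans keep it, but this sketch only needs the shard names.
	_, _, allShards, err := getKeyspaceShards(ctx, topoServ, cell, keyspace, tabletType)
	if err != nil {
		return nil, err
	}
	names := make([]string, 0, len(allShards))
	for _, shard := range allShards {
		names = append(names, shard.Name)
	}
	return names, nil
}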
// getNewConn creates a new tablet connection with a separate per-conn timeout.
// It limits the overall timeout to connTimeoutTotal by checking the elapsed
// time after each blocking call.
func (sdc *ShardConn) getNewConn(ctx context.Context) (conn tabletconn.TabletConn, endPoint *topodatapb.EndPoint, isTimeout bool, err error) {
	startTime := time.Now()

	endPoints, err := sdc.balancer.Get()
	if err != nil {
		// Error when getting an endpoint.
		return nil, nil, false, err
	}
	if len(endPoints) == 0 {
		// No valid endpoint.
		return nil, nil, false, vterrors.FromError(
			vtrpcpb.ErrorCode_INTERNAL_ERROR,
			fmt.Errorf("no valid endpoint"),
		)
	}
	if time.Now().Sub(startTime) >= sdc.connTimeoutTotal {
		return nil, nil, true, vterrors.FromError(
			vtrpcpb.ErrorCode_DEADLINE_EXCEEDED,
			fmt.Errorf("timeout when getting endpoints"),
		)
	}

	// Iterate through all endpoints to create a connection.
	perConnTimeout := sdc.getConnTimeoutPerConn(len(endPoints))
	allErrors := new(concurrency.AllErrorRecorder)
	for _, endPoint := range endPoints {
		perConnStartTime := time.Now()
		conn, err = tabletconn.GetDialer()(ctx, endPoint, sdc.keyspace, sdc.shard, topodatapb.TabletType_UNKNOWN, perConnTimeout)
		if err == nil {
			sdc.connectTimings.Record([]string{sdc.keyspace, sdc.shard, strings.ToLower(sdc.tabletType.String())}, perConnStartTime)
			sdc.mu.Lock()
			defer sdc.mu.Unlock()
			sdc.conn = conn
			return conn, endPoint, false, nil
		}
		// Mark down the endpoint if it failed to connect.
		sdc.balancer.MarkDown(endPoint.Uid, err.Error())
		vtErr := vterrors.NewVitessError(
			// TODO(aaijazi): what about OperationalErrors here?
			vterrors.RecoverVtErrorCode(err), err,
			"%v %+v", err, endPoint,
		)
		allErrors.RecordError(vtErr)
		if time.Now().Sub(startTime) >= sdc.connTimeoutTotal {
			err = vterrors.FromError(
				vtrpcpb.ErrorCode_DEADLINE_EXCEEDED,
				fmt.Errorf("timeout when connecting to %+v", endPoint),
			)
			allErrors.RecordError(err)
			return nil, nil, true, allErrors.AggrError(AggregateVtGateErrors)
		}
	}
	return nil, nil, false, allErrors.Error()
}
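// Illustrative only: getNewConn relies on a getConnTimeoutPerConn helper that
// is not shown above. This is a plausible sketch of it, assuming that a single
// endpoint may spend the whole connTimeoutTotal budget while multiple
// endpoints are each limited to the smaller connTimeoutPerConn slice.
func (sdc *ShardConn) getConnTimeoutPerConn(endPointCount int) time.Duration {
	if endPointCount <= 1 {
		return sdc.connTimeoutTotal
	}
	return sdc.connTimeoutPerConn
}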
// gRPCVtworkerClientFactory dials a vtworker gRPC server at addr and returns
// a vtworkerclient.Client wrapping the connection.
func gRPCVtworkerClientFactory(addr string, dialTimeout time.Duration) (vtworkerclient.Client, error) {
	// Create the RPC client.
	opt, err := grpcutils.ClientSecureDialOption(*cert, *key, *ca, *name)
	if err != nil {
		return nil, err
	}
	cc, err := grpc.Dial(addr, opt, grpc.WithBlock(), grpc.WithTimeout(dialTimeout))
	if err != nil {
		return nil, vterrors.NewVitessError(vtrpcpb.ErrorCode_DEADLINE_EXCEEDED, err,
			"grpc.Dial() err: %v", err)
	}
	c := vtworkerservicepb.NewVtworkerClient(cc)

	return &gRPCVtworkerClient{
		cc: cc,
		c:  c,
	}, nil
}
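// Illustrative only: the factory is typically wired up in an init hook so that
// callers can select the "grpc" protocol by name. This assumes vtworkerclient
// exposes a RegisterFactory function (mirroring the vtctlclient package); the
// exact registration API is an assumption, not confirmed by the code above.
func init() {
	vtworkerclient.RegisterFactory("grpc", gRPCVtworkerClientFactory)
}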
// NewShardConn creates a new ShardConn. It creates a Balancer using
// serv, cell, keyspace, tabletType and retryDelay. retryCount is the max
// number of retries before a ShardConn returns an error on an operation.
func NewShardConn(ctx context.Context, serv topo.SrvTopoServer, cell, keyspace, shard string, tabletType topodatapb.TabletType, retryDelay time.Duration, retryCount int, connTimeoutTotal, connTimeoutPerConn, connLife time.Duration, tabletConnectTimings *stats.MultiTimings) *ShardConn {
	getAddresses := func() (*topodatapb.EndPoints, error) {
		endpoints, _, err := serv.GetEndPoints(ctx, cell, keyspace, shard, tabletType)
		if err != nil {
			return nil, vterrors.NewVitessError(
				vtrpcpb.ErrorCode_INTERNAL_ERROR, err,
				"endpoints fetch error: %v", err,
			)
		}
		return endpoints, nil
	}
	blc := NewBalancer(getAddresses, retryDelay)
	var ticker *timer.RandTicker
	if tabletType != topodatapb.TabletType_MASTER {
		ticker = timer.NewRandTicker(connLife, connLife/2)
	}
	sdc := &ShardConn{
		keyspace:           keyspace,
		shard:              shard,
		tabletType:         tabletType,
		retryDelay:         retryDelay,
		retryCount:         retryCount,
		connTimeoutTotal:   connTimeoutTotal,
		connTimeoutPerConn: connTimeoutPerConn,
		connLife:           connLife,
		balancer:           blc,
		ticker:             ticker,
		consolidator:       sync2.NewConsolidator(),
		connectTimings:     tabletConnectTimings,
	}
	if ticker != nil {
		go func() {
			for range ticker.C {
				sdc.closeCurrent()
			}
		}()
	}
	return sdc
}
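// Illustrative only: a minimal sketch of constructing a ShardConn. The cell,
// keyspace, shard, and timeout values are made up for the example; a real
// caller would pass its own configuration and a shared stats.MultiTimings.
func exampleNewShardConn(ctx context.Context, serv topo.SrvTopoServer, timings *stats.MultiTimings) *ShardConn {
	return NewShardConn(ctx, serv,
		"test_cell", "test_keyspace", "0", topodatapb.TabletType_REPLICA,
		1*time.Second,    // retryDelay
		3,                // retryCount
		3*time.Second,    // connTimeoutTotal
		1*time.Second,    // connTimeoutPerConn
		365*24*time.Hour, // connLife
		timings)
}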
// setAndStartWorker will set the current worker.
// We always log to both memory logger (for display on the web) and
// console logger (for records / display of the command line worker).
func (wi *Instance) setAndStartWorker(ctx context.Context, wrk Worker, wr *wrangler.Wrangler) (chan struct{}, error) {
	wi.currentWorkerMutex.Lock()
	defer wi.currentWorkerMutex.Unlock()

	if wi.currentContext != nil {
		return nil, vterrors.FromError(vtrpcpb.ErrorCode_TRANSIENT_ERROR,
			fmt.Errorf("A worker job is already in progress: %v", wi.currentWorker.StatusAsText()))
	}

	if wi.currentWorker != nil {
		// During the grace period, we answer with a retryable error.
		const gracePeriod = 1 * time.Minute
		gracePeriodEnd := wi.lastRunStopTime.Add(gracePeriod)
		if time.Now().Before(gracePeriodEnd) {
			return nil, vterrors.FromError(vtrpcpb.ErrorCode_TRANSIENT_ERROR,
				fmt.Errorf("A worker job was recently stopped (%f seconds ago): %v",
					time.Now().Sub(wi.lastRunStopTime).Seconds(),
					wi.currentWorker))
		}

		// QUERY_NOT_SERVED = FailedPrecondition => manual resolution required.
		return nil, vterrors.FromError(vtrpcpb.ErrorCode_QUERY_NOT_SERVED,
			fmt.Errorf("The worker job was stopped %.1f minutes ago, but not reset. You have to reset it manually. Job: %v",
				time.Now().Sub(wi.lastRunStopTime).Minutes(),
				wi.currentWorker))
	}

	wi.currentWorker = wrk
	wi.currentMemoryLogger = logutil.NewMemoryLogger()
	wi.currentContext, wi.currentCancelFunc = context.WithCancel(ctx)
	wi.lastRunError = nil
	wi.lastRunStopTime = time.Unix(0, 0)
	done := make(chan struct{})
	wranglerLogger := wr.Logger()
	if wr == wi.wr {
		// If it's the default wrangler, do not reuse its logger because it may have been set before.
		// Reusing it would result in an endless recursion.
		wranglerLogger = logutil.NewConsoleLogger()
	}
	wr.SetLogger(logutil.NewTeeLogger(wi.currentMemoryLogger, wranglerLogger))

	// One goroutine runs the worker and changes the state when it is done.
	go func() {
		log.Infof("Starting worker...")
		var err error

		// Catch all panics and always save the execution state at the end.
		defer func() {
			// The recovery code is a copy of servenv.HandlePanic().
			if x := recover(); x != nil {
				log.Errorf("uncaught vtworker panic: %v\n%s", x, tb.Stack(4))
				err = fmt.Errorf("uncaught vtworker panic: %v", x)
			}

			wi.currentWorkerMutex.Lock()
			wi.currentContext = nil
			wi.currentCancelFunc = nil
			wi.lastRunError = err
			wi.lastRunStopTime = time.Now()
			wi.currentWorkerMutex.Unlock()
			close(done)
		}()

		// Run() will take a long time.
		err = wrk.Run(wi.currentContext)

		// If the context was canceled, include the respective error code.
		select {
		case <-wi.currentContext.Done():
			// Context is done, i.e. probably canceled.
			if wi.currentContext.Err() == context.Canceled {
				err = vterrors.NewVitessError(vtrpcpb.ErrorCode_CANCELLED, err,
					"vtworker command was canceled: %v", err)
			}
		default:
		}
	}()

	return done, nil
}
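// Illustrative only: a hypothetical caller that starts a worker and blocks
// until it has finished, then reports the stored run error. runAndWait is an
// assumption for the example, not part of the original code.
func (wi *Instance) runAndWait(ctx context.Context, wrk Worker, wr *wrangler.Wrangler) error {
	done, err := wi.setAndStartWorker(ctx, wrk, wr)
	if err != nil {
		return err
	}
	// setAndStartWorker closes done only after the worker goroutine has saved
	// its final state, so waiting on it is enough to observe lastRunError.
	<-done

	wi.currentWorkerMutex.Lock()
	defer wi.currentWorkerMutex.Unlock()
	return wi.lastRunError
}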