Example #1
func getKeyspaceShards(ctx context.Context, topoServ SrvTopoServer, cell, keyspace string, tabletType pb.TabletType) (string, *pb.SrvKeyspace, []*pb.ShardReference, error) {
	srvKeyspace, err := topoServ.GetSrvKeyspace(ctx, cell, keyspace)
	if err != nil {
		return "", nil, nil, vterrors.NewVitessError(
			vtrpc.ErrorCode_INTERNAL_ERROR, err,
			"keyspace %v fetch error: %v", keyspace, err,
		)
	}

	// check if the keyspace has been redirected for this tabletType.
	for _, sf := range srvKeyspace.ServedFrom {
		if sf.TabletType == tabletType {
			keyspace = sf.Keyspace
			srvKeyspace, err = topoServ.GetSrvKeyspace(ctx, cell, keyspace)
			if err != nil {
				return "", nil, nil, vterrors.NewVitessError(
					vtrpc.ErrorCode_INTERNAL_ERROR, err,
					"keyspace %v fetch error: %v", keyspace, err,
				)
			}
		}
	}

	partition := topoproto.SrvKeyspaceGetPartition(srvKeyspace, tabletType)
	if partition == nil {
		return "", nil, nil, vterrors.NewVitessError(
			vtrpc.ErrorCode_INTERNAL_ERROR, err,
			"No partition found for tabletType %v in keyspace %v", strings.ToLower(tabletType.String()), keyspace,
		)
	}
	return keyspace, srvKeyspace, partition.ShardReferences, nil
}
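
A minimal sketch of how a caller in the same package might use this helper; the resolveShardsExample function, the "test_keyspace" name, and the REPLICA tablet type are illustrative, not taken from the original code:

func resolveShardsExample(ctx context.Context, topoServ SrvTopoServer, cell string) error {
	// Hypothetical call; errors come back already wrapped with a vtrpc error code.
	keyspace, _, shardRefs, err := getKeyspaceShards(ctx, topoServ, cell, "test_keyspace", pb.TabletType_REPLICA)
	if err != nil {
		return err
	}
	log.Infof("keyspace %v resolved to %v shard(s)", keyspace, len(shardRefs))
	return nil
}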
Example #2
func getKeyspaceShards(ctx context.Context, topoServ SrvTopoServer, cell, keyspace string, tabletType pb.TabletType) (string, *topo.SrvKeyspace, []topo.ShardReference, error) {
	srvKeyspace, err := topoServ.GetSrvKeyspace(ctx, cell, keyspace)
	if err != nil {
		return "", nil, nil, vterrors.NewVitessError(
			vtrpc.ErrorCode_INTERNAL_ERROR, err,
			"keyspace %v fetch error: %v", keyspace, err,
		)
	}

	// check if the keyspace has been redirected for this tabletType.
	tt := topo.ProtoToTabletType(tabletType)
	if servedFrom, ok := srvKeyspace.ServedFrom[tt]; ok {
		keyspace = servedFrom
		srvKeyspace, err = topoServ.GetSrvKeyspace(ctx, cell, keyspace)
		if err != nil {
			return "", nil, nil, vterrors.NewVitessError(
				vtrpc.ErrorCode_INTERNAL_ERROR, err,
				"keyspace %v fetch error: %v", keyspace, err,
			)
		}
	}

	partition, ok := srvKeyspace.Partitions[tt]
	if !ok {
		return "", nil, nil, vterrors.NewVitessError(
			vtrpc.ErrorCode_INTERNAL_ERROR, err,
			"No partition found for tabletType %v in keyspace %v", strings.ToLower(tabletType.String()), keyspace,
		)
	}
	return keyspace, srvKeyspace, partition.ShardReferences, nil
}
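
The code embedded by NewVitessError can be recovered by callers later on. A sketch of branching on it, assuming vterrors.RecoverVtErrorCode behaves as its use in Example #3 suggests; classifyShardError itself is hypothetical:

func classifyShardError(ctx context.Context, topoServ SrvTopoServer, cell, keyspace string, tabletType pb.TabletType) error {
	_, _, _, err := getKeyspaceShards(ctx, topoServ, cell, keyspace, tabletType)
	if err != nil && vterrors.RecoverVtErrorCode(err) == vtrpc.ErrorCode_INTERNAL_ERROR {
		// An INTERNAL_ERROR here means a topology fetch or partition
		// lookup failed, not a bad query from the client.
		log.Errorf("topology problem for %v/%v: %v", cell, keyspace, err)
	}
	return err
}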
Example #3
// getNewConn creates a new tablet connection with a separate per conn timeout.
// It limits the overall timeout to connTimeoutTotal by checking elapsed time after each blocking call.
func (sdc *ShardConn) getNewConn(ctx context.Context) (conn tabletconn.TabletConn, endPoint *topodatapb.EndPoint, isTimeout bool, err error) {
	startTime := time.Now()

	endPoints, err := sdc.balancer.Get()
	if err != nil {
		// Error while getting endpoints.
		return nil, nil, false, err
	}
	if len(endPoints) == 0 {
		// No valid endpoint
		return nil, nil, false, vterrors.FromError(
			vtrpcpb.ErrorCode_INTERNAL_ERROR,
			fmt.Errorf("no valid endpoint"),
		)
	}
	if time.Since(startTime) >= sdc.connTimeoutTotal {
		return nil, nil, true, vterrors.FromError(
			vtrpcpb.ErrorCode_DEADLINE_EXCEEDED,
			fmt.Errorf("timeout when getting endpoints"),
		)
	}

	// Iterate through all endpoints to create a connection
	perConnTimeout := sdc.getConnTimeoutPerConn(len(endPoints))
	allErrors := new(concurrency.AllErrorRecorder)
	for _, endPoint := range endPoints {
		perConnStartTime := time.Now()
		conn, err = tabletconn.GetDialer()(ctx, endPoint, sdc.keyspace, sdc.shard, topodatapb.TabletType_UNKNOWN, perConnTimeout)
		if err == nil {
			sdc.connectTimings.Record([]string{sdc.keyspace, sdc.shard, strings.ToLower(sdc.tabletType.String())}, perConnStartTime)
			sdc.mu.Lock()
			defer sdc.mu.Unlock()
			sdc.conn = conn
			return conn, endPoint, false, nil
		}
		// Mark the endpoint as down since it failed to connect.
		sdc.balancer.MarkDown(endPoint.Uid, err.Error())
		vtErr := vterrors.NewVitessError(
			// TODO(aaijazi): what about OperationalErrors here?
			vterrors.RecoverVtErrorCode(err), err,
			"%v %+v", err, endPoint,
		)
		allErrors.RecordError(vtErr)
		if time.Since(startTime) >= sdc.connTimeoutTotal {
			err = vterrors.FromError(
				vtrpcpb.ErrorCode_DEADLINE_EXCEEDED,
				fmt.Errorf("timeout when connecting to %+v", endPoint),
			)
			allErrors.RecordError(err)
			return nil, nil, true, allErrors.AggrError(AggregateVtGateErrors)
		}
	}
	return nil, nil, false, allErrors.Error()
}
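
The AllErrorRecorder pattern above generalizes to any first-success loop: record every failure, stop at the first success, and collapse the failures into one error only if nothing worked. A stripped-down sketch, where tryConnect stands in for the dialer call and is purely hypothetical:

func dialAny(endPoints []*topodatapb.EndPoint, tryConnect func(*topodatapb.EndPoint) error) error {
	allErrors := new(concurrency.AllErrorRecorder)
	for _, endPoint := range endPoints {
		if err := tryConnect(endPoint); err != nil {
			// Remember this failure and keep trying the remaining endpoints.
			allErrors.RecordError(err)
			continue
		}
		return nil // first successful connection wins
	}
	// Error() collapses everything recorded into a single error (nil if none).
	return allErrors.Error()
}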
Example #4
func gRPCVtworkerClientFactory(addr string, dialTimeout time.Duration) (vtworkerclient.Client, error) {
	// create the RPC client
	opt, err := grpcutils.ClientSecureDialOption(*cert, *key, *ca, *name)
	if err != nil {
		return nil, err
	}
	cc, err := grpc.Dial(addr, opt, grpc.WithBlock(), grpc.WithTimeout(dialTimeout))
	if err != nil {
		return nil, vterrors.NewVitessError(vtrpcpb.ErrorCode_DEADLINE_EXCEEDED, err, "grpc.Dial() err: %v", err)
	}
	c := vtworkerservicepb.NewVtworkerClient(cc)

	return &gRPCVtworkerClient{
		cc: cc,
		c:  c,
	}, nil
}
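
A factory like this is typically wired in once at startup. A sketch, assuming vtworkerclient exposes a RegisterFactory hook keyed by name; the registration API is an assumption based on the usual Vitess client-registration pattern, not shown in the example:

func init() {
	// Assumed registration hook: makes the gRPC implementation selectable
	// by the name "grpc" wherever a vtworkerclient.Client is constructed.
	vtworkerclient.RegisterFactory("grpc", gRPCVtworkerClientFactory)
}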
Example #5
// NewShardConn creates a new ShardConn. It creates a Balancer using
// serv, cell, keyspace, tabletType and retryDelay. retryCount is the max
// number of retries before a ShardConn returns an error on an operation.
func NewShardConn(ctx context.Context, serv topo.SrvTopoServer, cell, keyspace, shard string, tabletType topodatapb.TabletType, retryDelay time.Duration, retryCount int, connTimeoutTotal, connTimeoutPerConn, connLife time.Duration, tabletConnectTimings *stats.MultiTimings) *ShardConn {
	getAddresses := func() (*topodatapb.EndPoints, error) {
		endpoints, _, err := serv.GetEndPoints(ctx, cell, keyspace, shard, tabletType)
		if err != nil {
			return nil, vterrors.NewVitessError(
				vtrpcpb.ErrorCode_INTERNAL_ERROR, err,
				"endpoints fetch error: %v", err,
			)
		}
		return endpoints, nil
	}
	blc := NewBalancer(getAddresses, retryDelay)
	var ticker *timer.RandTicker
	if tabletType != topodatapb.TabletType_MASTER {
		ticker = timer.NewRandTicker(connLife, connLife/2)
	}
	sdc := &ShardConn{
		keyspace:           keyspace,
		shard:              shard,
		tabletType:         tabletType,
		retryDelay:         retryDelay,
		retryCount:         retryCount,
		connTimeoutTotal:   connTimeoutTotal,
		connTimeoutPerConn: connTimeoutPerConn,
		connLife:           connLife,
		balancer:           blc,
		ticker:             ticker,
		consolidator:       sync2.NewConsolidator(),
		connectTimings:     tabletConnectTimings,
	}
	if ticker != nil {
		go func() {
			for range ticker.C {
				sdc.closeCurrent()
			}
		}()
	}
	return sdc
}
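
A sketch of constructing a ShardConn with these knobs. Every literal below is an illustrative value rather than a recommendation, and the timings argument stands in for a shared *stats.MultiTimings whose construction is omitted:

func newReplicaShardConn(ctx context.Context, serv topo.SrvTopoServer, timings *stats.MultiTimings) *ShardConn {
	// All names and durations here are example values.
	return NewShardConn(ctx, serv, "cell1", "test_keyspace", "-80",
		topodatapb.TabletType_REPLICA,
		1*time.Second, // retryDelay between retries
		3,             // retryCount before an operation errors out
		3*time.Second, // connTimeoutTotal across all endpoints
		1*time.Second, // connTimeoutPerConn
		1*time.Hour,   // connLife before non-master conns are recycled
		timings)
}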
Example #6
// setAndStartWorker will set the current worker.
// We always log to both memory logger (for display on the web) and
// console logger (for records / display of command line worker).
func (wi *Instance) setAndStartWorker(ctx context.Context, wrk Worker, wr *wrangler.Wrangler) (chan struct{}, error) {
	wi.currentWorkerMutex.Lock()
	defer wi.currentWorkerMutex.Unlock()

	if wi.currentContext != nil {
		return nil, vterrors.FromError(vtrpcpb.ErrorCode_TRANSIENT_ERROR,
			fmt.Errorf("A worker job is already in progress: %v", wi.currentWorker.StatusAsText()))
	}

	if wi.currentWorker != nil {
		// During the grace period, we answer with a retryable error.
		const gracePeriod = 1 * time.Minute
		gracePeriodEnd := wi.lastRunStopTime.Add(gracePeriod)
		if time.Now().Before(gracePeriodEnd) {
			return nil, vterrors.FromError(vtrpcpb.ErrorCode_TRANSIENT_ERROR,
				fmt.Errorf("A worker job was recently stopped (%f seconds ago): %v",
					time.Since(wi.lastRunStopTime).Seconds(),
					wi.currentWorker))
		}

		// QUERY_NOT_SERVED = FailedPrecondition => manual resolution required.
		return nil, vterrors.FromError(vtrpcpb.ErrorCode_QUERY_NOT_SERVED,
			fmt.Errorf("The worker job was stopped %.1f minutes ago, but not reset. You have to reset it manually. Job: %v",
				time.Since(wi.lastRunStopTime).Minutes(),
				wi.currentWorker))
	}

	wi.currentWorker = wrk
	wi.currentMemoryLogger = logutil.NewMemoryLogger()
	wi.currentContext, wi.currentCancelFunc = context.WithCancel(ctx)
	wi.lastRunError = nil
	wi.lastRunStopTime = time.Unix(0, 0)
	done := make(chan struct{})
	wranglerLogger := wr.Logger()
	if wr == wi.wr {
		// If it's the default wrangler, do not reuse its logger because it may have been set before.
		// Reusing it would result in an endless recursion.
		wranglerLogger = logutil.NewConsoleLogger()
	}
	wr.SetLogger(logutil.NewTeeLogger(wi.currentMemoryLogger, wranglerLogger))

	// One goroutine runs the worker and changes the state when done.
	go func() {
		log.Infof("Starting worker...")
		var err error

		// Catch all panics and always save the execution state at the end.
		defer func() {
			// The recovery code is a copy of servenv.HandlePanic().
			if x := recover(); x != nil {
				log.Errorf("uncaught vtworker panic: %v\n%s", x, tb.Stack(4))
				err = fmt.Errorf("uncaught vtworker panic: %v", x)
			}

			wi.currentWorkerMutex.Lock()
			wi.currentContext = nil
			wi.currentCancelFunc = nil
			wi.lastRunError = err
			wi.lastRunStopTime = time.Now()
			wi.currentWorkerMutex.Unlock()
			close(done)
		}()

		// run will take a long time
		err = wrk.Run(wi.currentContext)

		// If the context was canceled, include the respective error code.
		select {
		case <-wi.currentContext.Done():
			// Context is done, i.e. probably canceled.
			if wi.currentContext.Err() == context.Canceled {
				err = vterrors.NewVitessError(vtrpcpb.ErrorCode_CANCELLED, err, "vtworker command was canceled: %v", err)
			}
		default:
		}
	}()

	return done, nil
}
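
Callers learn that the job has finished by blocking on the returned channel. A minimal sketch; runAndWait is hypothetical, but it only touches fields shown above:

func (wi *Instance) runAndWait(ctx context.Context, wrk Worker, wr *wrangler.Wrangler) error {
	done, err := wi.setAndStartWorker(ctx, wrk, wr)
	if err != nil {
		return err // e.g. TRANSIENT_ERROR while another job holds the slot
	}
	<-done // closed by the deferred cleanup in the worker goroutine

	wi.currentWorkerMutex.Lock()
	defer wi.currentWorkerMutex.Unlock()
	return wi.lastRunError
}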