// trimmedRequestToError returns an error for a trimmed request by looking at the
// requested error type. It assumes that prefix checking has already been done.
// If the received string doesn't match a known error, returns an unknown error.
func trimmedRequestToError(received string) error {
	switch received {
	case "integrity error":
		return vterrors.FromError(
			vtrpc.ErrorCode_INTEGRITY_ERROR,
			errors.New("vtgate test client forced error: integrity error (errno 1062)"),
		)
	// request backlog and general throttling type errors
	case "transient error":
		return vterrors.FromError(
			vtrpc.ErrorCode_TRANSIENT_ERROR,
			errors.New("request_backlog: too many requests in flight: vtgate test client forced error"),
		)
	case "unknown error":
		return vterrors.FromError(
			vtrpc.ErrorCode_UNKNOWN_ERROR,
			errors.New("vtgate test client forced error: unknown error"),
		)
	default:
		return vterrors.FromError(
			vtrpc.ErrorCode_UNKNOWN_ERROR,
			fmt.Errorf("vtgate test client error request unrecognized: %v", received),
		)
	}
}
// Commit commits the current transaction. There are no retries on this operation.
func (stc *ScatterConn) Commit(ctx context.Context, session *SafeSession) (err error) {
	if session == nil {
		return vterrors.FromError(
			vtrpcpb.ErrorCode_BAD_INPUT,
			fmt.Errorf("cannot commit: empty session"),
		)
	}
	if !session.InTransaction() {
		return vterrors.FromError(
			vtrpcpb.ErrorCode_NOT_IN_TX,
			fmt.Errorf("cannot commit: not in transaction"),
		)
	}
	committing := true
	for _, shardSession := range session.ShardSessions {
		if !committing {
			stc.gateway.Rollback(ctx, shardSession.Target.Keyspace, shardSession.Target.Shard, shardSession.Target.TabletType, shardSession.TransactionId)
			continue
		}
		if err = stc.gateway.Commit(ctx, shardSession.Target.Keyspace, shardSession.Target.Shard, shardSession.Target.TabletType, shardSession.TransactionId); err != nil {
			committing = false
		}
	}
	session.Reset()
	return err
}
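// The loop above flips from committing to rolling back after the first failed
// shard commit. A minimal, hedged sketch of that same pattern in isolation --
// the shardTx interface and commitAll helper are illustrative, not part of
// vtgate:
type shardTx interface {
	Commit() error
	Rollback()
}

// commitAll commits transactions in order; once one commit fails, it rolls
// back every remaining transaction and returns the first commit error.
func commitAll(txs []shardTx) error {
	var firstErr error
	for _, tx := range txs {
		if firstErr != nil {
			tx.Rollback()
			continue
		}
		if err := tx.Commit(); err != nil {
			firstErr = err
		}
	}
	return firstErr
}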
// trimmedRequestToError returns an error for a trimmed request by looking at the
// requested error type. It assumes that prefix checking has already been done.
// If the received string doesn't match a known error, returns an unknown error.
func trimmedRequestToError(received string) error {
	switch received {
	case "bad input":
		return vterrors.FromError(
			vtrpcpb.ErrorCode_BAD_INPUT,
			errors.New("vtgate test client forced error: bad input"),
		)
	case "deadline exceeded":
		return vterrors.FromError(
			vtrpcpb.ErrorCode_DEADLINE_EXCEEDED,
			errors.New("vtgate test client forced error: deadline exceeded"),
		)
	case "integrity error":
		return vterrors.FromError(
			vtrpcpb.ErrorCode_INTEGRITY_ERROR,
			errors.New("vtgate test client forced error: integrity error (errno 1062) (sqlstate 23000)"),
		)
	// request backlog and general throttling type errors
	case "transient error":
		return vterrors.FromError(
			vtrpcpb.ErrorCode_TRANSIENT_ERROR,
			errors.New("request_backlog: too many requests in flight: vtgate test client forced error: transient error"),
		)
	case "throttled error":
		return vterrors.FromError(
			vtrpcpb.ErrorCode_TRANSIENT_ERROR,
			errors.New("request_backlog: exceeded XXX quota, rate limiting: vtgate test client forced error: transient error"),
		)
	case "unauthenticated":
		return vterrors.FromError(
			vtrpcpb.ErrorCode_UNAUTHENTICATED,
			errors.New("vtgate test client forced error: unauthenticated"),
		)
	case "aborted":
		return vterrors.FromError(
			vtrpcpb.ErrorCode_NOT_IN_TX,
			errors.New("vtgate test client forced error: aborted"),
		)
	case "query not served":
		return vterrors.FromError(
			vtrpcpb.ErrorCode_QUERY_NOT_SERVED,
			errors.New("vtgate test client forced error: query not served"),
		)
	case "unknown error":
		return vterrors.FromError(
			vtrpcpb.ErrorCode_UNKNOWN_ERROR,
			errors.New("vtgate test client forced error: unknown error"),
		)
	default:
		return vterrors.FromError(
			vtrpcpb.ErrorCode_UNKNOWN_ERROR,
			fmt.Errorf("vtgate test client error request unrecognized: %v", received),
		)
	}
}
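// trimmedRequestToError assumes its caller already checked and stripped a
// fixed prefix from the incoming request. A hedged sketch of such a caller --
// the "error://" prefix and the requestToError name are assumptions about the
// surrounding test-client code, not verified against it:
const errorPrefix = "error://"

func requestToError(request string) error {
	if !strings.HasPrefix(request, errorPrefix) {
		return nil
	}
	return trimmedRequestToError(strings.TrimPrefix(request, errorPrefix))
}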
// withRetry gets available connections and executes the action. If there are retryable errors,
// it retries retryCount times before failing. It does not retry if the connection is in
// the middle of a transaction. Before returning the error, it checks whether the error may be
// the result of a resharding event, and if so sets the re-resolve bit so the upper layers can
// re-resolve and retry.
func (dg *discoveryGateway) withRetry(ctx context.Context, keyspace, shard string, tabletType topodatapb.TabletType, action func(conn tabletconn.TabletConn, target *querypb.Target) error, transactionID int64, isStreaming bool) error {
	var tabletLastUsed *topodatapb.Tablet
	var err error
	inTransaction := (transactionID != 0)
	invalidTablets := make(map[string]bool)
	for i := 0; i < dg.retryCount+1; i++ {
		tablets := dg.getTablets(keyspace, shard, tabletType)
		if len(tablets) == 0 {
			// fail fast if there is no tablet
			err = vterrors.FromError(vtrpcpb.ErrorCode_INTERNAL_ERROR, fmt.Errorf("no valid tablet"))
			break
		}
		shuffleTablets(tablets)

		// skip tablets we tried before
		var ts *discovery.TabletStats
		for _, t := range tablets {
			if _, ok := invalidTablets[discovery.TabletToMapKey(t.Tablet)]; !ok {
				ts = t
				break
			}
		}
		if ts == nil {
			if err == nil {
				// do not override error from last attempt.
				err = vterrors.FromError(vtrpcpb.ErrorCode_INTERNAL_ERROR, fmt.Errorf("no available connection"))
			}
			break
		}

		// execute
		tabletLastUsed = ts.Tablet
		conn := dg.hc.GetConnection(ts.Tablet)
		if conn == nil {
			err = vterrors.FromError(vtrpcpb.ErrorCode_INTERNAL_ERROR, fmt.Errorf("no connection for %+v", ts.Tablet))
			invalidTablets[discovery.TabletToMapKey(ts.Tablet)] = true
			continue
		}

		// Potentially buffer this request.
		if bufferErr := masterbuffer.FakeBuffer(keyspace, shard, tabletType, inTransaction, i); bufferErr != nil {
			return bufferErr
		}

		err = action(conn, ts.Target)
		if dg.canRetry(ctx, err, transactionID, isStreaming) {
			invalidTablets[discovery.TabletToMapKey(ts.Tablet)] = true
			continue
		}
		break
	}
	return NewShardError(err, keyspace, shard, tabletType, tabletLastUsed, inTransaction)
}
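// Stripped of the gateway details, the retry loop above has a simple shape:
// try up to n distinct candidates, remembering which ones already failed. A
// self-contained sketch of that shape (all names here are illustrative):
func retryDistinct(candidates []string, attempts int, try func(string) error) error {
	tried := make(map[string]bool)
	var err error
	for i := 0; i < attempts; i++ {
		picked := ""
		for _, c := range candidates {
			if !tried[c] {
				picked = c
				break
			}
		}
		if picked == "" {
			break // every candidate has already failed
		}
		if err = try(picked); err == nil {
			return nil
		}
		tried[picked] = true
	}
	return err
}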
// getNewConn creates a new tablet connection with a separate per-conn timeout.
// It limits the overall timeout to connTimeoutTotal by checking elapsed time after each blocking call.
func (sdc *ShardConn) getNewConn(ctx context.Context) (conn tabletconn.TabletConn, endPoint *topodatapb.EndPoint, isTimeout bool, err error) {
	startTime := time.Now()

	endPoints, err := sdc.balancer.Get()
	if err != nil {
		// Error when getting endpoint
		return nil, nil, false, err
	}
	if len(endPoints) == 0 {
		// No valid endpoint
		return nil, nil, false, vterrors.FromError(
			vtrpcpb.ErrorCode_INTERNAL_ERROR,
			fmt.Errorf("no valid endpoint"),
		)
	}
	if time.Now().Sub(startTime) >= sdc.connTimeoutTotal {
		return nil, nil, true, vterrors.FromError(
			vtrpcpb.ErrorCode_DEADLINE_EXCEEDED,
			fmt.Errorf("timeout when getting endpoints"),
		)
	}

	// Iterate through all endpoints to create a connection
	perConnTimeout := sdc.getConnTimeoutPerConn(len(endPoints))
	allErrors := new(concurrency.AllErrorRecorder)
	for _, endPoint := range endPoints {
		perConnStartTime := time.Now()
		conn, err = tabletconn.GetDialer()(ctx, endPoint, sdc.keyspace, sdc.shard, topodatapb.TabletType_UNKNOWN, perConnTimeout)
		if err == nil {
			sdc.connectTimings.Record([]string{sdc.keyspace, sdc.shard, strings.ToLower(sdc.tabletType.String())}, perConnStartTime)
			sdc.mu.Lock()
			defer sdc.mu.Unlock()
			sdc.conn = conn
			return conn, endPoint, false, nil
		}
		// Mark down the endpoint if it failed to connect
		sdc.balancer.MarkDown(endPoint.Uid, err.Error())
		vtErr := vterrors.NewVitessError(
			// TODO(aaijazi): what about OperationalErrors here?
			vterrors.RecoverVtErrorCode(err), err,
			"%v %+v", err, endPoint,
		)
		allErrors.RecordError(vtErr)
		if time.Now().Sub(startTime) >= sdc.connTimeoutTotal {
			err = vterrors.FromError(
				vtrpcpb.ErrorCode_DEADLINE_EXCEEDED,
				fmt.Errorf("timeout when connecting to %+v", endPoint),
			)
			allErrors.RecordError(err)
			return nil, nil, true, allErrors.AggrError(AggregateVtGateErrors)
		}
	}
	return nil, nil, false, allErrors.Error()
}
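// getConnTimeoutPerConn is not shown in this excerpt. A hedged sketch of a
// plausible implementation, dividing the total budget across candidates but
// never going below a configured per-conn floor -- the connTimeoutPerConn
// field is an assumption, not the verified ShardConn code:
func (sdc *ShardConn) getConnTimeoutPerConn(endPointCount int) time.Duration {
	if endPointCount <= 1 {
		return sdc.connTimeoutTotal
	}
	perConn := sdc.connTimeoutTotal / time.Duration(endPointCount)
	if sdc.connTimeoutPerConn > perConn {
		return sdc.connTimeoutPerConn
	}
	return perConn
}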
// withRetry gets available connections and executes the action. If there are retryable errors,
// it retries retryCount times before failing. It does not retry if the connection is in
// the middle of a transaction. Before returning the error, it checks whether the error may be
// the result of a resharding event, and if so sets the re-resolve bit so the upper layers can
// re-resolve and retry.
func (dg *discoveryGateway) withRetry(ctx context.Context, keyspace, shard string, tabletType pbt.TabletType, action func(conn tabletconn.TabletConn) error, transactionID int64, isStreaming bool) error {
	var endPointLastUsed *pbt.EndPoint
	var err error
	inTransaction := (transactionID != 0)
	invalidEndPoints := make(map[string]bool)
	for i := 0; i < dg.retryCount+1; i++ {
		var endPoint *pbt.EndPoint
		endPoints := dg.getEndPoints(keyspace, shard, tabletType)
		if len(endPoints) == 0 {
			// fail fast if there is no endpoint
			err = vterrors.FromError(vtrpc.ErrorCode_INTERNAL_ERROR, fmt.Errorf("no valid endpoint"))
			break
		}
		shuffleEndPoints(endPoints)

		// skip endpoints we tried before
		for _, ep := range endPoints {
			if _, ok := invalidEndPoints[discovery.EndPointToMapKey(ep)]; !ok {
				endPoint = ep
				break
			}
		}
		if endPoint == nil {
			if err == nil {
				// do not override error from last attempt.
				err = vterrors.FromError(vtrpc.ErrorCode_INTERNAL_ERROR, fmt.Errorf("no available connection"))
			}
			break
		}

		// execute
		endPointLastUsed = endPoint
		conn := dg.hc.GetConnection(endPoint)
		if conn == nil {
			err = vterrors.FromError(vtrpc.ErrorCode_INTERNAL_ERROR, fmt.Errorf("no connection for %+v", endPoint))
			invalidEndPoints[discovery.EndPointToMapKey(endPoint)] = true
			continue
		}
		err = action(conn)
		if dg.canRetry(ctx, err, transactionID, isStreaming) {
			invalidEndPoints[discovery.EndPointToMapKey(endPoint)] = true
			continue
		}
		break
	}
	return WrapError(err, keyspace, shard, tabletType, endPointLastUsed, inTransaction)
}
func getShardForKeyspaceID(allShards []*pb.ShardReference, keyspaceID []byte) (string, error) {
	if len(allShards) == 0 {
		return "", vterrors.FromError(vtrpc.ErrorCode_BAD_INPUT,
			fmt.Errorf("No shards found for this tabletType"),
		)
	}

	for _, shardReference := range allShards {
		if key.KeyRangeContains(shardReference.KeyRange, keyspaceID) {
			return shardReference.Name, nil
		}
	}
	return "", vterrors.FromError(vtrpc.ErrorCode_BAD_INPUT,
		fmt.Errorf("KeyspaceId %v didn't match any shards %+v", hex.EncodeToString(keyspaceID), allShards),
	)
}
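// A hedged usage sketch: routing a keyspace id with a two-shard layout. The
// shard references below are illustrative; in Vitess, "-80" covers keyspace
// ids that sort below 0x80 and "80-" covers the rest.
func exampleGetShard() (string, error) {
	allShards := []*pb.ShardReference{
		{Name: "-80", KeyRange: &pb.KeyRange{End: []byte{0x80}}},
		{Name: "80-", KeyRange: &pb.KeyRange{Start: []byte{0x80}}},
	}
	// 0x23... sorts below 0x80, so this resolves to shard "-80".
	return getShardForKeyspaceID(allShards, []byte{0x23, 0x01})
}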
func TestAggregateVtGateErrors(t *testing.T) {
	var testcases = []struct {
		input    []error
		expected error
	}{
		{
			input:    nil,
			expected: nil,
		},
		{
			input: []error{
				errFromCode(vtrpc.ErrorCode_SUCCESS),
				errFromCode(vtrpc.ErrorCode_TRANSIENT_ERROR),
				errFromCode(vtrpc.ErrorCode_BAD_INPUT),
			},
			expected: vterrors.FromError(
				vtrpc.ErrorCode_BAD_INPUT,
				vterrors.ConcatenateErrors([]error{errGeneric, errGeneric, errGeneric}),
			),
		},
	}
	for _, tc := range testcases {
		out := aggregateVtGateErrors(tc.input)
		if !reflect.DeepEqual(out, tc.expected) {
			t.Errorf("aggregateVtGateErrors(%+v) = %+v \nwant: %+v", tc.input, out, tc.expected)
		}
	}
}
func aggregateVtGateErrors(errors []error) error {
	if len(errors) == 0 {
		return nil
	}
	return vterrors.FromError(
		aggregateVtGateErrorCodes(errors),
		vterrors.ConcatenateErrors(errors),
	)
}
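// A hedged usage sketch: collapsing per-shard failures from a scatter call
// into a single error. The shard error values below are illustrative.
func exampleAggregate() error {
	shardErrs := []error{
		vterrors.FromError(vtrpc.ErrorCode_TRANSIENT_ERROR, errors.New("shard -80: tx_pool_full")),
		vterrors.FromError(vtrpc.ErrorCode_BAD_INPUT, errors.New("shard 80-: invalid keyspace id")),
	}
	// The result carries one representative error code (chosen by
	// aggregateVtGateErrorCodes) plus the concatenated messages of every
	// underlying error, as exercised by the test above.
	return aggregateVtGateErrors(shardErrs)
}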
func getAnyShard(ctx context.Context, topoServ topo.SrvTopoServer, cell, keyspace string, tabletType topodatapb.TabletType) (ks, shard string, err error) {
	keyspace, _, allShards, err := getKeyspaceShards(ctx, topoServ, cell, keyspace, tabletType)
	if err != nil {
		return "", "", err
	}
	if len(allShards) == 0 {
		return "", "", vterrors.FromError(vtrpcpb.ErrorCode_BAD_INPUT,
			fmt.Errorf("No shards found for this tabletType"),
		)
	}
	return keyspace, allShards[0].Name, nil
}
// ExecuteKeyspaceIds executes a non-streaming query based on KeyspaceIds.
// It retries the query if new keyspace/shards are re-resolved after a retryable error.
// It returns an error if a DML spans multiple keyspace_ids, because resharding
// depends on being able to uniquely route a write.
func (res *Resolver) ExecuteKeyspaceIds(ctx context.Context, sql string, bindVariables map[string]interface{}, keyspace string, keyspaceIds [][]byte, tabletType pb.TabletType, session *proto.Session, notInTransaction bool) (*mproto.QueryResult, error) {
	if isDml(sql) && len(keyspaceIds) > 1 {
		return nil, vterrors.FromError(
			vtrpc.ErrorCode_BAD_INPUT,
			fmt.Errorf("DML should not span multiple keyspace_ids"),
		)
	}
	mapToShards := func(k string) (string, []string, error) {
		return mapKeyspaceIdsToShards(
			ctx,
			res.toposerv,
			res.cell,
			k,
			tabletType,
			keyspaceIds)
	}
	return res.Execute(ctx, sql, bindVariables, keyspace, tabletType, session, mapToShards, notInTransaction)
}
// ExecuteKeyspaceIds executes a non-streaming query based on KeyspaceIds.
// It retries the query if new keyspace/shards are re-resolved after a retryable error.
// It returns an error if a DML spans multiple keyspace_ids, because resharding
// depends on being able to uniquely route a write.
func (res *Resolver) ExecuteKeyspaceIds(ctx context.Context, sql string, bindVariables map[string]interface{}, keyspace string, keyspaceIds [][]byte, tabletType topodatapb.TabletType, session *vtgatepb.Session, notInTransaction bool, options *querypb.ExecuteOptions) (*sqltypes.Result, error) {
	if sqlannotation.IsDML(sql) && len(keyspaceIds) > 1 {
		return nil, vterrors.FromError(
			vtrpcpb.ErrorCode_BAD_INPUT,
			fmt.Errorf("DML should not span multiple keyspace_ids"),
		)
	}
	mapToShards := func(k string) (string, []string, error) {
		return mapKeyspaceIdsToShards(
			ctx,
			res.toposerv,
			res.cell,
			k,
			tabletType,
			keyspaceIds)
	}
	return res.Execute(ctx, sql, bindVariables, keyspace, tabletType, session, mapToShards, notInTransaction, options)
}
func (blc *Balancer) refresh() error {
	endPoints, err := blc.getEndPoints()
	if err != nil {
		return err
	}
	// Add new addressNodes
	if endPoints != nil {
		for _, endPoint := range endPoints.Entries {
			if index := findAddrNode(blc.addressNodes, endPoint.Uid); index == -1 {
				addrNode := &addressStatus{
					endPoint: endPoint,
					balancer: blc,
				}
				blc.addressNodes = append(blc.addressNodes, addrNode)
			} else {
				blc.addressNodes[index].endPoint = endPoint
			}
		}
	}
	// Remove those that went away
	i := 0
	for i < len(blc.addressNodes) {
		if index := findAddress(endPoints, blc.addressNodes[i].endPoint.Uid); index == -1 {
			blc.addressNodes = delAddrNode(blc.addressNodes, i)
			continue
		}
		i++
	}
	if len(blc.addressNodes) == 0 {
		return vterrors.FromError(
			vtrpc.ErrorCode_INTERNAL_ERROR,
			fmt.Errorf("no available addresses"),
		)
	}
	// Sort endpoints by timeRetry (from zero to largest)
	sort.Sort(AddressList(blc.addressNodes))
	// Randomize endpoints with zero timeRetry
	shuffle(blc.addressNodes, findFirstAddrNodeNonZeroTimeRetry(blc.addressNodes))
	return nil
}
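// findAddrNode and findAddress are not shown in this excerpt. A hedged sketch
// consistent with the call sites above -- a linear scan by Uid that returns -1
// when the entry is absent (assumed, not verified against the balancer code):
func findAddrNode(addressNodes []*addressStatus, uid uint32) int {
	for i, addrNode := range addressNodes {
		if addrNode.endPoint.Uid == uid {
			return i
		}
	}
	return -1
}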
// mapExactShards maps a keyrange to shards only if there's a complete
// match. If there's any partial match the function returns no match.
func mapExactShards(ctx context.Context, topoServ SrvTopoServer, cell, keyspace string, tabletType pb.TabletType, kr *pb.KeyRange) (newkeyspace string, shards []string, err error) {
	keyspace, _, allShards, err := getKeyspaceShards(ctx, topoServ, cell, keyspace, tabletType)
	if err != nil {
		return "", nil, err
	}
	shardnum := 0
	for shardnum < len(allShards) {
		if bytes.Equal(kr.Start, []byte(allShards[shardnum].KeyRange.Start)) {
			break
		}
		shardnum++
	}
	for shardnum < len(allShards) {
		shards = append(shards, allShards[shardnum].Name)
		if bytes.Equal(kr.End, []byte(allShards[shardnum].KeyRange.End)) {
			return keyspace, shards, nil
		}
		shardnum++
	}
	return keyspace, nil, vterrors.FromError(vtrpc.ErrorCode_BAD_INPUT,
		fmt.Errorf("keyrange %v does not exactly match shards", key.KeyRangeString(kr)),
	)
}
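// A hedged usage sketch (the cell, keyspace, and shard layout are assumed):
// with shards -40, 40-80, and 80-, the keyrange -80 lines up exactly with the
// boundaries of {-40, 40-80}; a range like 20-80 starts inside a shard and
// would get the BAD_INPUT error above instead.
func exampleMapExactShards(ctx context.Context, topoServ SrvTopoServer) ([]string, error) {
	kr := &pb.KeyRange{End: []byte{0x80}} // the range -80
	_, shards, err := mapExactShards(ctx, topoServ, "cell", "ks", pb.TabletType_REPLICA, kr)
	// shards == ["-40", "40-80"] when the layout above holds.
	return shards, err
}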
const errOutOfRange = "errno 1264"
const errTxPoolFull = "tx_pool_full"

var (
	rpcVTGate *VTGate

	qpsByOperation *stats.Rates
	qpsByKeyspace  *stats.Rates
	qpsByDbType    *stats.Rates

	errorsByOperation *stats.Rates
	errorsByKeyspace  *stats.Rates
	errorsByDbType    *stats.Rates

	errTooManyInFlight = vterrors.FromError(
		vtrpc.ErrorCode_TRANSIENT_ERROR,
		errors.New("request_backlog: too many requests in flight"),
	)

	// Error counters should be global so they can be set from anywhere
	normalErrors   *stats.MultiCounters
	infoErrors     *stats.Counters
	internalErrors *stats.Counters
)

// VTGate is the rpc interface to vtgate. Only one instance
// can be created. It implements vtgateservice.VTGateService
type VTGate struct {
	resolver     *Resolver
	router       *Router
	timings      *stats.MultiTimings
	rowsReturned *stats.MultiCounters
func errFromCode(c vtrpc.ErrorCode) error {
	return vterrors.FromError(c, errGeneric)
}
	maxBufferSize   = flag.Int("max_buffer_size", 10, "The maximum number of master requests to buffer at a time.")
	fakeBufferDelay = flag.Duration("fake_buffer_delay", 1*time.Second, "The amount of time that we should delay all master requests for, to fake a buffer.")

	bufferedRequestsAttempted  = stats.NewInt("BufferedRequestsAttempted")
	bufferedRequestsSuccessful = stats.NewInt("BufferedRequestsSuccessful")
	// Use this lock when adding to the number of currently buffered requests.
	bufferMu         sync.Mutex
	bufferedRequests = stats.NewInt("BufferedRequests")
)

// timeSleep can be mocked out in unit tests
var timeSleep = time.Sleep

// errBufferFull is the error returned when a buffer request is rejected because the buffer is full.
var errBufferFull = vterrors.FromError(
	vtrpcpb.ErrorCode_TRANSIENT_ERROR,
	errors.New("master request buffer full, rejecting request"),
)

// FakeBuffer will pretend to buffer master requests in VTGate.
// Requests *will NOT actually be buffered*, they will just be delayed.
// This can be useful to understand what the impact of master request buffering will be
// on upstream callers. Once the impact is measured, it can be used to tweak parameter values
// for the best behavior.
// FakeBuffer should be called before a potential VtTablet Begin, otherwise it will increase transaction times.
func FakeBuffer(keyspace, shard string, tabletType topodatapb.TabletType, inTransaction bool, attemptNumber int) error {
	if !*enableFakeMasterBuffer {
		return nil
	}
	// Don't buffer non-master traffic, requests that are inside transactions, or retries.
	if tabletType != topodatapb.TabletType_MASTER || inTransaction || attemptNumber != 0 {
		return nil
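// The FakeBuffer excerpt above is cut off. A hedged sketch of how the rest of
// the fake buffering could plausibly work, reusing the vars declared above: a
// capacity check under bufferMu, then a fixed delay. Illustrative only, not
// the verified implementation:
func fakeBufferRemainderSketch() error {
	bufferedRequestsAttempted.Add(1)
	bufferMu.Lock()
	if bufferedRequests.Get() >= int64(*maxBufferSize) {
		bufferMu.Unlock()
		return errBufferFull
	}
	bufferedRequests.Add(1)
	bufferMu.Unlock()
	defer bufferedRequests.Add(-1)
	timeSleep(*fakeBufferDelay)
	bufferedRequestsSuccessful.Add(1)
	return nil
}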
// setAndStartWorker will set the current worker.
// We always log to both the memory logger (for display on the web) and the
// console logger (for records / display of a command-line worker).
func (wi *Instance) setAndStartWorker(wrk Worker, wr *wrangler.Wrangler) (chan struct{}, error) {
	wi.currentWorkerMutex.Lock()
	defer wi.currentWorkerMutex.Unlock()
	if wi.currentContext != nil {
		return nil, vterrors.FromError(vtrpcpb.ErrorCode_TRANSIENT_ERROR,
			fmt.Errorf("A worker job is already in progress: %v", wi.currentWorker))
	}
	if wi.currentWorker != nil {
		// During the grace period, we answer with a retryable error.
		const gracePeriod = 1 * time.Minute
		gracePeriodEnd := wi.lastRunStopTime.Add(gracePeriod)
		if time.Now().Before(gracePeriodEnd) {
			return nil, vterrors.FromError(vtrpcpb.ErrorCode_TRANSIENT_ERROR,
				fmt.Errorf("A worker job was recently stopped (%f seconds ago): %v",
					time.Now().Sub(wi.lastRunStopTime).Seconds(),
					wi.currentWorker))
		}

		// QUERY_NOT_SERVED = FailedPrecondition => manual resolution required.
		return nil, vterrors.FromError(vtrpcpb.ErrorCode_QUERY_NOT_SERVED,
			fmt.Errorf("The worker job was stopped %.1f minutes ago, but not reset. You have to reset it manually. Job: %v",
				time.Now().Sub(wi.lastRunStopTime).Minutes(),
				wi.currentWorker))
	}

	wi.currentWorker = wrk
	wi.currentMemoryLogger = logutil.NewMemoryLogger()
	wi.currentContext, wi.currentCancelFunc = context.WithCancel(wi.backgroundContext)
	wi.lastRunError = nil
	wi.lastRunStopTime = time.Unix(0, 0)
	done := make(chan struct{})
	wranglerLogger := wr.Logger()
	if wr == wi.wr {
		// If it's the default wrangler, do not reuse its logger because it may have been set before.
		// Reusing it would result in an endless recursion.
		wranglerLogger = logutil.NewConsoleLogger()
	}
	wr.SetLogger(logutil.NewTeeLogger(wi.currentMemoryLogger, wranglerLogger))

	// one go function runs the worker, changes state when done
	go func() {
		log.Infof("Starting worker...")
		var err error

		// Catch all panics and always save the execution state at the end.
		defer func() {
			// The recovery code is a copy of servenv.HandlePanic().
			if x := recover(); x != nil {
				log.Errorf("uncaught vtworker panic: %v\n%s", x, tb.Stack(4))
				err = fmt.Errorf("uncaught vtworker panic: %v", x)
			}

			wi.currentWorkerMutex.Lock()
			wi.currentContext = nil
			wi.currentCancelFunc = nil
			wi.lastRunError = err
			wi.lastRunStopTime = time.Now()
			wi.currentWorkerMutex.Unlock()
			close(done)
		}()

		// run will take a long time
		err = wrk.Run(wi.currentContext)
	}()

	return done, nil
}
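// A hedged usage sketch: a caller starts a worker and blocks on the returned
// channel, which the goroutine above closes only after saving the run state.
// The runWorkerAndWait helper is illustrative, not part of the vtworker API:
func runWorkerAndWait(wi *Instance, wrk Worker, wr *wrangler.Wrangler) error {
	done, err := wi.setAndStartWorker(wrk, wr)
	if err != nil {
		return err
	}
	<-done // wait until the worker goroutine finishes and state is saved
	wi.currentWorkerMutex.Lock()
	defer wi.currentWorkerMutex.Unlock()
	return wi.lastRunError
}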