// TODO(mberlin): Discuss with the team if it should go to the vterrors package. // TODO(mberlin): Add other error codes here as well? func isRetryable(err error) bool { switch vterrors.RecoverVtErrorCode(err) { case vtrpcpb.ErrorCode_TRANSIENT_ERROR: return true default: return false } }
func (stc *ScatterConn) rollbackIfNeeded(ctx context.Context, err error, session *SafeSession) { if session.InTransaction() { ec := vterrors.RecoverVtErrorCode(err) if ec == vtrpcpb.ErrorCode_RESOURCE_EXHAUSTED || ec == vtrpcpb.ErrorCode_NOT_IN_TX { // We cannot recover from these errors stc.Rollback(ctx, session) } } }
// rpcErrFromTabletError translate an error from VTGate to an *mproto.RPCError func rpcErrFromVtGateError(err error) *mproto.RPCError { if err == nil { return nil } return &mproto.RPCError{ Code: int64(vterrors.RecoverVtErrorCode(err)), Message: err.Error(), } }
// aggregateVtGateErrorCodes aggregates a list of errors into a single error code. // It does so by finding the highest priority error code in the list. func aggregateVtGateErrorCodes(errors []error) vtrpc.ErrorCode { highCode := vtrpc.ErrorCode_SUCCESS for _, e := range errors { code := vterrors.RecoverVtErrorCode(e) if errorPriorities[code] > errorPriorities[highCode] { highCode = code } } return highCode }
// Verifies the returned error has the properties that we expect. func verifyError(t *testing.T, err error, method string) { if err == nil { t.Errorf("%s was expecting an error, didn't get one", method) return } code := vterrors.RecoverVtErrorCode(err) if code != expectedCode { t.Errorf("Unexpected server code from %s: got %v, wanted %v", method, code, expectedCode) } verifyErrorExceptServerCode(t, err, method) }
// getNewConn creates a new tablet connection with a separate per conn timeout. // It limits the overall timeout to connTimeoutTotal by checking elapsed time after each blocking call. func (sdc *ShardConn) getNewConn(ctx context.Context) (conn tabletconn.TabletConn, endPoint *topodatapb.EndPoint, isTimeout bool, err error) { startTime := time.Now() endPoints, err := sdc.balancer.Get() if err != nil { // Error when getting endpoint return nil, nil, false, err } if len(endPoints) == 0 { // No valid endpoint return nil, nil, false, vterrors.FromError( vtrpcpb.ErrorCode_INTERNAL_ERROR, fmt.Errorf("no valid endpoint"), ) } if time.Now().Sub(startTime) >= sdc.connTimeoutTotal { return nil, nil, true, vterrors.FromError( vtrpcpb.ErrorCode_DEADLINE_EXCEEDED, fmt.Errorf("timeout when getting endpoints"), ) } // Iterate through all endpoints to create a connection perConnTimeout := sdc.getConnTimeoutPerConn(len(endPoints)) allErrors := new(concurrency.AllErrorRecorder) for _, endPoint := range endPoints { perConnStartTime := time.Now() conn, err = tabletconn.GetDialer()(ctx, endPoint, sdc.keyspace, sdc.shard, topodatapb.TabletType_UNKNOWN, perConnTimeout) if err == nil { sdc.connectTimings.Record([]string{sdc.keyspace, sdc.shard, strings.ToLower(sdc.tabletType.String())}, perConnStartTime) sdc.mu.Lock() defer sdc.mu.Unlock() sdc.conn = conn return conn, endPoint, false, nil } // Markdown the endpoint if it failed to connect sdc.balancer.MarkDown(endPoint.Uid, err.Error()) vtErr := vterrors.NewVitessError( // TODO(aaijazi): what about OperationalErrors here? vterrors.RecoverVtErrorCode(err), err, "%v %+v", err, endPoint, ) allErrors.RecordError(vtErr) if time.Now().Sub(startTime) >= sdc.connTimeoutTotal { err = vterrors.FromError( vtrpcpb.ErrorCode_DEADLINE_EXCEEDED, fmt.Errorf("timeout when connecting to %+v", endPoint), ) allErrors.RecordError(err) return nil, nil, true, allErrors.AggrError(AggregateVtGateErrors) } } return nil, nil, false, allErrors.Error() }
func verifyShardConnError(t *testing.T, err error, wantErr string, wantCode vtrpcpb.ErrorCode) { if err == nil || err.Error() != wantErr { t.Errorf("wanted error: %s, got error: %v", wantErr, err) } if _, ok := err.(*ShardConnError); !ok { t.Errorf("wanted error type *ShardConnError, got error type: %v", reflect.TypeOf(err)) } code := vterrors.RecoverVtErrorCode(err) if code != wantCode { t.Errorf("wanted error code: %s, got: %v", wantCode, code) } }
func (l *L2VTGate) endAction(startTime time.Time, statsKey []string, err *error) { if *err != nil { // Don't increment the error counter for duplicate // keys or bad queries, as those errors are caused by // client queries and are not VTGate's fault. ec := vterrors.RecoverVtErrorCode(*err) if ec != vtrpcpb.ErrorCode_INTEGRITY_ERROR && ec != vtrpcpb.ErrorCode_BAD_INPUT { l.tabletCallErrorCount.Add(statsKey, 1) } } l.timings.Record(statsKey, startTime) }
// testErrorHelper will check one instance of each error type, // to make sure we propagate the errors properly. func testErrorHelper(t *testing.T, f *FakeQueryService, name string, ef func(context.Context) error) { errors := []*tabletserver.TabletError{ // A few generic errors tabletserver.NewTabletError(vtrpcpb.ErrorCode_BAD_INPUT, "generic error"), tabletserver.NewTabletError(vtrpcpb.ErrorCode_UNKNOWN_ERROR, "uncaught panic"), tabletserver.NewTabletError(vtrpcpb.ErrorCode_UNAUTHENTICATED, "missing caller id"), tabletserver.NewTabletError(vtrpcpb.ErrorCode_PERMISSION_DENIED, "table acl error: nil acl"), // Client will retry on this specific error tabletserver.NewTabletError(vtrpcpb.ErrorCode_QUERY_NOT_SERVED, "Query disallowed due to rule: %v", "cool rule"), // Client may retry on another server on this specific error tabletserver.NewTabletError(vtrpcpb.ErrorCode_INTERNAL_ERROR, "Could not verify strict mode"), // This is usually transaction pool full tabletserver.NewTabletError(vtrpcpb.ErrorCode_RESOURCE_EXHAUSTED, "Transaction pool connection limit exceeded"), // Transaction expired or was unknown tabletserver.NewTabletError(vtrpcpb.ErrorCode_NOT_IN_TX, "Transaction 12"), } for _, e := range errors { f.TabletError = e ctx := context.Background() err := ef(ctx) if err == nil { t.Errorf("error wasn't returned for %v?", name) continue } // First we check the recoverable vtrpc code is right. code := vterrors.RecoverVtErrorCode(err) if code != e.ErrorCode { t.Errorf("unexpected server code from %v: got %v, wanted %v", name, code, e.ErrorCode) } // Double-check we always get a ServerError, although // we don't really care that much. if !f.TestingGateway { if _, ok := err.(*tabletconn.ServerError); !ok { t.Errorf("error wasn't a tabletconn.ServerError for %v?", name) continue } } // and last we check we preserve the text, with the right prefix if !strings.Contains(err.Error(), e.Prefix()+e.Message) { t.Errorf("client error message '%v' for %v doesn't contain expected server text message '%v'", err.Error(), name, e.Prefix()+e.Message) } } f.TabletError = nil }
// VtGateErrorToVtRPCError converts a vtgate error into a vtrpc error. // TODO(aaijazi): rename this guy, and correct the usage of it everywhere. As it's currently used, // it will almost never return the correct error code, as it's only getting executeErr and reply.Error. // It should actually just use reply.Err. func VtGateErrorToVtRPCError(err error, errString string) *vtrpc.RPCError { if err == nil && errString == "" { return nil } message := "" if err != nil { message = err.Error() } else { message = errString } return &vtrpc.RPCError{ Code: vterrors.RecoverVtErrorCode(err), Message: message, } }
// NewShardError returns a ShardError which preserves the original // error code if possible, adds the connection context and adds a bit // to determine whether the keyspace/shard needs to be re-resolved for // a potential sharding event (namely, if we were in a transaction). func NewShardError(in error, keyspace, shard string, tabletType topodatapb.TabletType, tablet *topodatapb.Tablet, inTransaction bool) error { if in == nil { return nil } var shardIdentifier string if tablet != nil { shardIdentifier = fmt.Sprintf("%s.%s.%s, %+v", keyspace, shard, topoproto.TabletTypeLString(tabletType), tablet) } else { shardIdentifier = fmt.Sprintf("%s.%s.%s", keyspace, shard, topoproto.TabletTypeLString(tabletType)) } return &ShardError{ ShardIdentifier: shardIdentifier, InTransaction: inTransaction, Err: in, ErrorCode: vterrors.RecoverVtErrorCode(in), } }
// WrapError returns ShardConnError which preserves the original error code if possible, // adds the connection context // and adds a bit to determine whether the keyspace/shard needs to be // re-resolved for a potential sharding event. func WrapError(in error, keyspace, shard string, tabletType pbt.TabletType, endPoint *pbt.EndPoint, inTransaction bool) (wrapped error) { if in == nil { return nil } shardIdentifier := fmt.Sprintf("%s.%s.%s, %+v", keyspace, shard, strings.ToLower(tabletType.String()), endPoint) code := tabletconn.ERR_NORMAL serverError, ok := in.(*tabletconn.ServerError) if ok { code = serverError.Code } shardConnErr := &ShardConnError{ Code: code, ShardIdentifier: shardIdentifier, InTransaction: inTransaction, Err: in, EndPointCode: vterrors.RecoverVtErrorCode(in), } return shardConnErr }
func handleExecuteError(err error, statsKey []string, query map[string]interface{}, logger *logutil.ThrottledLogger) error { // First we log in the right category. ec := vterrors.RecoverVtErrorCode(err) switch ec { case vtrpcpb.ErrorCode_INTEGRITY_ERROR: // Duplicate key error, no need to log. infoErrors.Add("DupKey", 1) case vtrpcpb.ErrorCode_RESOURCE_EXHAUSTED, vtrpcpb.ErrorCode_BAD_INPUT: // Tx pool full error, or bad input, no need to log. normalErrors.Add(statsKey, 1) default: // Regular error, we will log if caused by vtgate. normalErrors.Add(statsKey, 1) logError(err, query, logger) } // Then we suffix the error with our address. s := fmt.Sprintf(", vtgate: %v", servenv.ListeningURL.String()) return vterrors.WithSuffix(err, s) }
// verifyErrorCode checks the error code for an error func verifyErrorCode(t *testing.T, err error, wantCode vtrpc.ErrorCode) { code := vterrors.RecoverVtErrorCode(err) if err == nil || code != wantCode { t.Errorf("vterrors.RecoverVtErrorCode(%v) => %v, want %v", err, code, wantCode) } }
// commandErrorsBecauseBusy tests that concurrent commands are rejected with // TRANSIENT_ERROR while a command is already running. // It also tests the correct propagation of the CANCELED error code. func commandErrorsBecauseBusy(t *testing.T, client vtworkerclient.Client, serverSideCancelation bool) { // Run the vtworker "Block" command which blocks until we cancel the context. var wg sync.WaitGroup ctx, cancel := context.WithCancel(context.Background()) // blockCommandStarted will be closed after we're sure that vtworker is // running the "Block" command. blockCommandStarted := make(chan struct{}) var errorCodeCheck error wg.Add(1) go func() { stream, err := client.ExecuteVtworkerCommand(ctx, []string{"Block"}) if err != nil { t.Fatalf("Block command should not have failed: %v", err) } firstLineReceived := false for { if _, err := stream.Recv(); err != nil { // We see CANCELED from the RPC client (client side cancelation) or // from vtworker itself (server side cancelation). if vterrors.RecoverVtErrorCode(err) != vtrpcpb.ErrorCode_CANCELLED { errorCodeCheck = fmt.Errorf("Block command should only error due to canceled context: %v", err) } // Stream has finished. break } if !firstLineReceived { firstLineReceived = true // The first log line will come from the "Block" command, so we are sure // now that vtworker is actually executing it. close(blockCommandStarted) } } wg.Done() }() // Try to run a second, concurrent vtworker command. // vtworker should send an error back that it's busy and we should retry later. <-blockCommandStarted gotErr := runVtworkerCommand(client, []string{"Ping", "Are you busy?"}) wantCode := vtrpcpb.ErrorCode_TRANSIENT_ERROR if gotCode := vterrors.RecoverVtErrorCode(gotErr); gotCode != wantCode { t.Fatalf("wrong error code for second cmd: got = %v, want = %v, err: %v", gotCode, wantCode, gotErr) } // Cancel running "Block" command. if serverSideCancelation { if err := runVtworkerCommand(client, []string{"Cancel"}); err != nil { t.Fatal(err) } } // Always cancel the context to not leak it (regardless of client or server // side cancelation). cancel() wg.Wait() if errorCodeCheck != nil { t.Fatalf("Block command did not return the CANCELED error code: %v", errorCodeCheck) } // vtworker is now in a special state where the current job is already // canceled but not reset yet. New commands are still failing with a // retryable error. gotErr2 := runVtworkerCommand(client, []string{"Ping", "canceled and still busy?"}) wantCode2 := vtrpcpb.ErrorCode_TRANSIENT_ERROR if gotCode2 := vterrors.RecoverVtErrorCode(gotErr2); gotCode2 != wantCode2 { t.Fatalf("wrong error code for second cmd before reset: got = %v, want = %v, err: %v", gotCode2, wantCode2, gotErr2) } // Reset vtworker for the next test function. if err := resetVtworker(t, client); err != nil { t.Fatal(err) } // Second vtworker command should succeed now after the first has finished. if err := runVtworkerCommand(client, []string{"Ping", "You should not be busy anymore!"}); err != nil { t.Fatalf("second cmd should not have failed: %v", err) } // Reset vtworker for the next test function. if err := runVtworkerCommand(client, []string{"Reset"}); err != nil { t.Fatal(err) } }