// BenchmarkTokenizeEnglishText measures tokenizer throughput on a large
// English sample; the timer is reset after setup so only Tokenize is timed.
func BenchmarkTokenizeEnglishText(b *testing.B) {
	tokenizer := character.NewCharacterTokenizer(notSpace)
	b.ResetTimer()
	for i := 0; i < b.N; i++ {
		tokenizer.Tokenize(sampleLargeInput)
	}
}
// TokenizerConstructor satisfies the registry's tokenizer constructor
// signature. Both config and cache are unused: the character tokenizer is
// fully determined by its rune predicate.
func TokenizerConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.Tokenizer, error) {
	return character.NewCharacterTokenizer(notSpace), nil
}
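// ExampleTokenizerConstructor is a minimal usage sketch, not part of the
// original suite: it builds the tokenizer through the constructor above
// (config and cache may be nil since both are unused) and prints each term.
// Requires "fmt" in the import block.
func ExampleTokenizerConstructor() {
	tokenizer, err := TokenizerConstructor(nil, nil)
	if err != nil {
		panic(err)
	}
	for _, token := range tokenizer.Tokenize([]byte("hello wide world")) {
		fmt.Println(string(token.Term))
	}
	// Output:
	// hello
	// wide
	// world
}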
// TestBoundary verifies token boundaries for ASCII, multi-byte UTF-8, mixed,
// and empty inputs. Start and End are byte offsets, so each Japanese rune
// contributes three bytes (e.g. "こんにちは世界" ends at 21).
func TestBoundary(t *testing.T) {
	tests := []struct {
		input  []byte
		output analysis.TokenStream
	}{
		{
			[]byte("Hello World."),
			analysis.TokenStream{
				{
					Start:    0,
					End:      5,
					Term:     []byte("Hello"),
					Position: 1,
					Type:     analysis.AlphaNumeric,
				},
				{
					Start:    6,
					End:      12,
					Term:     []byte("World."),
					Position: 2,
					Type:     analysis.AlphaNumeric,
				},
			},
		},
		{
			[]byte("こんにちは世界"),
			analysis.TokenStream{
				{
					Start:    0,
					End:      21,
					Term:     []byte("こんにちは世界"),
					Position: 1,
					Type:     analysis.AlphaNumeric,
				},
			},
		},
		{
			[]byte(""),
			analysis.TokenStream{},
		},
		{
			[]byte("abc界"),
			analysis.TokenStream{
				{
					Start:    0,
					End:      6,
					Term:     []byte("abc界"),
					Position: 1,
					Type:     analysis.AlphaNumeric,
				},
			},
		},
	}

	for _, test := range tests {
		tokenizer := character.NewCharacterTokenizer(notSpace)
		actual := tokenizer.Tokenize(test.input)
		if !reflect.DeepEqual(actual, test.output) {
			t.Errorf("Expected %v, got %v for %s", test.output, actual, string(test.input))
		}
	}
}
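// The functions above assume two helpers. If they are not already defined
// elsewhere in this file, the sketches below are sufficient: notSpace is the
// natural predicate for a whitespace-splitting character tokenizer, and
// sampleLargeInput is an arbitrary placeholder corpus, not the original
// benchmark fixture. Requires "bytes" and "unicode" in the import block.
func notSpace(r rune) bool {
	return !unicode.IsSpace(r)
}

var sampleLargeInput = bytes.Repeat([]byte("The quick brown fox jumps over the lazy dog. "), 1000)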