func BenchmarkBatch(b *testing.B) {
	cache := registry.NewCache()
	analyzer, err := cache.AnalyzerNamed(standard_analyzer.Name)
	if err != nil {
		b.Fatal(err)
	}

	analysisQueue := index.NewAnalysisQueue(1)
	idx, err := NewUpsideDownCouch(null.Name, nil, analysisQueue)
	if err != nil {
		b.Fatal(err)
	}
	err = idx.Open()
	if err != nil {
		b.Fatal(err)
	}

	batch := index.NewBatch()
	for i := 0; i < 100; i++ {
		d := document.NewDocument(strconv.Itoa(i))
		f := document.NewTextFieldWithAnalyzer("desc", nil, bleveWikiArticle1K, analyzer)
		d.AddField(f)
		batch.Update(d)
	}

	b.ResetTimer()
	for i := 0; i < b.N; i++ {
		err = idx.Batch(batch)
		if err != nil {
			b.Fatal(err)
		}
	}
}
func TestFrenchElision(t *testing.T) {
	tests := []struct {
		input  analysis.TokenStream
		output analysis.TokenStream
	}{
		{
			input: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("l'avion"),
				},
			},
			output: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("avion"),
				},
			},
		},
	}

	cache := registry.NewCache()
	elisionFilter, err := cache.TokenFilterNamed(ElisionName)
	if err != nil {
		t.Fatal(err)
	}

	for _, test := range tests {
		actual := elisionFilter.Filter(test.input)
		if !reflect.DeepEqual(actual, test.output) {
			t.Errorf("expected %s, got %s", test.output[0].Term, actual[0].Term)
		}
	}
}
func BenchmarkAnalyze(b *testing.B) {
	cache := registry.NewCache()
	analyzer, err := cache.AnalyzerNamed(standard_analyzer.Name)
	if err != nil {
		b.Fatal(err)
	}

	analysisQueue := index.NewAnalysisQueue(1)
	idx, err := NewFirestorm(null.Name, nil, analysisQueue)
	if err != nil {
		b.Fatal(err)
	}

	d := document.NewDocument("1")
	f := document.NewTextFieldWithAnalyzer("desc", nil, bleveWikiArticle1K, analyzer)
	d.AddField(f)

	b.ResetTimer()
	for i := 0; i < b.N; i++ {
		rv := idx.Analyze(d)
		if len(rv.Rows) < 92 || len(rv.Rows) > 93 {
			b.Fatalf("expected 92-93 rows, got %d", len(rv.Rows))
		}
	}
}
func TestElisionFilter(t *testing.T) { tests := []struct { input analysis.TokenStream output analysis.TokenStream }{ { input: analysis.TokenStream{ &analysis.Token{ Term: []byte("ar" + string(Apostrophe) + "word"), }, }, output: analysis.TokenStream{ &analysis.Token{ Term: []byte("word"), }, }, }, { input: analysis.TokenStream{ &analysis.Token{ Term: []byte("ar" + string(RightSingleQuotationMark) + "word"), }, }, output: analysis.TokenStream{ &analysis.Token{ Term: []byte("word"), }, }, }, } cache := registry.NewCache() articleListConfig := map[string]interface{}{ "type": token_map.Name, "tokens": []interface{}{"ar"}, } _, err := cache.DefineTokenMap("articles_test", articleListConfig) if err != nil { t.Fatal(err) } elisionConfig := map[string]interface{}{ "type": "elision", "articles_token_map": "articles_test", } elisionFilter, err := cache.DefineTokenFilter("elision_test", elisionConfig) if err != nil { t.Fatal(err) } for _, test := range tests { actual := elisionFilter.Filter(test.input) if !reflect.DeepEqual(actual, test.output) { t.Errorf("expected %s, got %s", test.output[0].Term, actual[0].Term) } } }
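// The test above exercises the two-step registration pattern used throughout
// this section: a token map is defined first, then an elision filter refers to
// it by name via "articles_token_map". Below is a minimal sketch wrapping that
// pattern in a helper; the helper name, the "articles_fr"/"elision_fr" names,
// and the article list are illustrative assumptions, not part of the package.
func buildElisionFilter(cache *registry.Cache) (analysis.TokenFilter, error) {
	// register the list of articles to strip (e.g. l'avion -> avion)
	_, err := cache.DefineTokenMap("articles_fr", map[string]interface{}{
		"type":   token_map.Name,
		"tokens": []interface{}{"l", "d"},
	})
	if err != nil {
		return nil, err
	}
	// register an elision filter that looks the articles up by token map name
	return cache.DefineTokenFilter("elision_fr", map[string]interface{}{
		"type":               "elision",
		"articles_token_map": "articles_fr",
	})
}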
func CommonBenchmarkIndexBatch(b *testing.B, storeName string, storeConfig map[string]interface{}, destroy KVStoreDestroy, analysisWorkers, batchSize int) { cache := registry.NewCache() analyzer, err := cache.AnalyzerNamed("standard") if err != nil { b.Fatal(err) } b.ResetTimer() b.StopTimer() for i := 0; i < b.N; i++ { analysisQueue := index.NewAnalysisQueue(analysisWorkers) idx, err := NewUpsideDownCouch(storeName, storeConfig, analysisQueue) if err != nil { b.Fatal(err) } err = idx.Open() if err != nil { b.Fatal(err) } b.StartTimer() batch := index.NewBatch() for j := 0; j < 1000; j++ { if j%batchSize == 0 { if len(batch.IndexOps) > 0 { err := idx.Batch(batch) if err != nil { b.Fatal(err) } } batch = index.NewBatch() } indexDocument := document.NewDocument(""). AddField(document.NewTextFieldWithAnalyzer("body", []uint64{}, []byte(benchmarkDocBodies[j%10]), analyzer)) indexDocument.ID = strconv.Itoa(i) + "-" + strconv.Itoa(j) batch.Update(indexDocument) } // close last batch if len(batch.IndexOps) > 0 { err := idx.Batch(batch) if err != nil { b.Fatal(err) } } b.StopTimer() err = idx.Close() if err != nil { b.Fatal(err) } err = destroy() if err != nil { b.Fatal(err) } analysisQueue.Close() } }
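// Sketch of how a store-specific benchmark could drive the helper above: pass
// the KV store name and config, a destroy callback for on-disk cleanup, the
// number of analysis workers, and the batch size. Using the null store (as the
// other benchmarks here do) means there is nothing to destroy, so a no-op
// callback suffices; the benchmark name and the worker/batch numbers are
// illustrative only.
func BenchmarkNullIndexing1Workers10Batch(b *testing.B) {
	CommonBenchmarkIndexBatch(b, null.Name, nil, func() error { return nil }, 1, 10)
}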
func TestStopWordsFilterLongestMatch(t *testing.T) { inputTokenStream := analysis.TokenStream{ &analysis.Token{ Term: []byte("softestball"), Start: 0, End: 11, Position: 1, }, } expectedTokenStream := analysis.TokenStream{ &analysis.Token{ Term: []byte("softestball"), Start: 0, End: 11, Position: 1, }, &analysis.Token{ Term: []byte("softest"), Start: 0, End: 7, Position: 1, }, &analysis.Token{ Term: []byte("ball"), Start: 7, End: 11, Position: 1, }, } cache := registry.NewCache() dictListConfig := map[string]interface{}{ "type": token_map.Name, "tokens": []interface{}{"soft", "softest", "ball"}, } _, err := cache.DefineTokenMap("dict_test", dictListConfig) if err != nil { t.Fatal(err) } dictConfig := map[string]interface{}{ "type": "dict_compound", "dict_token_map": "dict_test", "only_longest_match": true, } dictFilter, err := cache.DefineTokenFilter("dict_test", dictConfig) if err != nil { t.Fatal(err) } outputTokenStream := dictFilter.Filter(inputTokenStream) if !reflect.DeepEqual(outputTokenStream, expectedTokenStream) { t.Errorf("expected %#v got %#v", expectedTokenStream, outputTokenStream) } }
func TestStopWordsFilter(t *testing.T) { inputTokenStream := analysis.TokenStream{ &analysis.Token{ Term: []byte("a"), }, &analysis.Token{ Term: []byte("walk"), }, &analysis.Token{ Term: []byte("in"), }, &analysis.Token{ Term: []byte("the"), }, &analysis.Token{ Term: []byte("park"), }, } expectedTokenStream := analysis.TokenStream{ &analysis.Token{ Term: []byte("walk"), }, &analysis.Token{ Term: []byte("park"), }, } cache := registry.NewCache() stopListConfig := map[string]interface{}{ "type": token_map.Name, "tokens": []interface{}{"a", "in", "the"}, } _, err := cache.DefineTokenMap("stop_test", stopListConfig) if err != nil { t.Fatal(err) } stopConfig := map[string]interface{}{ "type": "stop_tokens", "stop_token_map": "stop_test", } stopFilter, err := cache.DefineTokenFilter("stop_test", stopConfig) if err != nil { t.Fatal(err) } outputTokenStream := stopFilter.Filter(inputTokenStream) if !reflect.DeepEqual(outputTokenStream, expectedTokenStream) { t.Errorf("expected %#v got %#v", expectedTokenStream, outputTokenStream) } }
func TestSoraniAnalyzer(t *testing.T) { tests := []struct { input []byte output analysis.TokenStream }{ // stop word removal { input: []byte("ئەم پیاوە"), output: analysis.TokenStream{ &analysis.Token{ Term: []byte("پیاو"), Position: 2, Start: 7, End: 17, }, }, }, { input: []byte("پیاوە"), output: analysis.TokenStream{ &analysis.Token{ Term: []byte("پیاو"), Position: 1, Start: 0, End: 10, }, }, }, { input: []byte("پیاو"), output: analysis.TokenStream{ &analysis.Token{ Term: []byte("پیاو"), Position: 1, Start: 0, End: 8, }, }, }, } cache := registry.NewCache() analyzer, err := cache.AnalyzerNamed(AnalyzerName) if err != nil { t.Fatal(err) } for _, test := range tests { actual := analyzer.Analyze(test.input) if !reflect.DeepEqual(actual, test.output) { t.Errorf("expected %v, got %v", test.output, actual) } } }
// NewIndexMapping creates a new IndexMapping that will use all the default indexing rules
func NewIndexMapping() *IndexMapping {
	return &IndexMapping{
		TypeMapping:           make(map[string]*DocumentMapping),
		DefaultMapping:        NewDocumentMapping(),
		TypeField:             defaultTypeField,
		DefaultType:           defaultType,
		DefaultAnalyzer:       defaultAnalyzer,
		DefaultDateTimeParser: defaultDateTimeParser,
		DefaultField:          defaultField,
		ByteArrayConverter:    defaultByteArrayConverter,
		CustomAnalysis:        newCustomAnalysis(),
		cache:                 registry.NewCache(),
	}
}
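// NewIndexMapping returns a mapping populated entirely with defaults; callers
// then override only the fields they care about. A minimal sketch, where the
// "docType" field and "article" type name are illustrative and "standard" is
// the analyzer name used by the benchmarks elsewhere in this code:
func exampleCustomIndexMapping() *IndexMapping {
	im := NewIndexMapping()
	im.TypeField = "docType"        // read the document type from this field
	im.DefaultAnalyzer = "standard" // analyzer applied when no other is specified
	im.TypeMapping["article"] = NewDocumentMapping()
	return im
}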
func TestJaAnalyzer(t *testing.T) { tests := []struct { input []byte output analysis.TokenStream }{ { input: []byte("こんにちは世界"), output: analysis.TokenStream{ &analysis.Token{ Term: []byte("こんにちは"), Type: analysis.Ideographic, Position: 1, Start: 0, End: 15, }, &analysis.Token{ Term: []byte("世界"), Type: analysis.Ideographic, Position: 2, Start: 15, End: 21, }, }, }, { input: []byte("カタカナ"), output: analysis.TokenStream{ &analysis.Token{ Term: []byte("カタカナ"), Type: analysis.Ideographic, Position: 1, Start: 0, End: 12, }, }, }, } cache := registry.NewCache() for _, test := range tests { analyzer, err := cache.AnalyzerNamed(AnalyzerName) if err != nil { t.Fatal(err) } actual := analyzer.Analyze(test.input) if !reflect.DeepEqual(actual, test.output) { t.Errorf("expected %v, got %v", test.output, actual) } } }
func BenchmarkAnalysis(b *testing.B) {
	for i := 0; i < b.N; i++ {
		cache := registry.NewCache()
		analyzer, err := cache.AnalyzerNamed(standard_analyzer.Name)
		if err != nil {
			b.Fatal(err)
		}

		ts := analyzer.Analyze(bleveWikiArticle)
		freqs := analysis.TokenFrequency(ts, nil, true)
		if len(freqs) != 511 {
			b.Errorf("expected %d freqs, got %d", 511, len(freqs))
		}
	}
}
func TestPortugueseAnalyzer(t *testing.T) { tests := []struct { input []byte output analysis.TokenStream }{ // stemming { input: []byte("quilométricas"), output: analysis.TokenStream{ &analysis.Token{ Term: []byte("quilometric"), }, }, }, { input: []byte("quilométricos"), output: analysis.TokenStream{ &analysis.Token{ Term: []byte("quilometric"), }, }, }, // stop word { input: []byte("não"), output: analysis.TokenStream{}, }, } cache := registry.NewCache() analyzer, err := cache.AnalyzerNamed(AnalyzerName) if err != nil { t.Fatal(err) } for _, test := range tests { actual := analyzer.Analyze(test.input) if len(actual) != len(test.output) { t.Fatalf("expected length: %d, got %d", len(test.output), len(actual)) } for i, tok := range actual { if !reflect.DeepEqual(tok.Term, test.output[i].Term) { t.Errorf("expected term %s (% x) got %s (% x)", test.output[i].Term, test.output[i].Term, tok.Term, tok.Term) } } } }
func CommonBenchmarkIndex(b *testing.B, storeName string, storeConfig map[string]interface{}, destroy KVStoreDestroy, analysisWorkers int) { cache := registry.NewCache() analyzer, err := cache.AnalyzerNamed("standard") if err != nil { b.Fatal(err) } indexDocument := document.NewDocument(""). AddField(document.NewTextFieldWithAnalyzer("body", []uint64{}, []byte(benchmarkDocBodies[0]), analyzer)) b.ResetTimer() b.StopTimer() for i := 0; i < b.N; i++ { analysisQueue := index.NewAnalysisQueue(analysisWorkers) idx, err := NewUpsideDownCouch(storeName, storeConfig, analysisQueue) if err != nil { b.Fatal(err) } err = idx.Open() if err != nil { b.Fatal(err) } indexDocument.ID = strconv.Itoa(i) // just time the indexing portion b.StartTimer() err = idx.Update(indexDocument) if err != nil { b.Fatal(err) } b.StopTimer() err = idx.Close() if err != nil { b.Fatal(err) } err = destroy() if err != nil { b.Fatal(err) } analysisQueue.Close() } }
func TestItalianLightStemmer(t *testing.T) { tests := []struct { input analysis.TokenStream output analysis.TokenStream }{ { input: analysis.TokenStream{ &analysis.Token{ Term: []byte("ragazzo"), }, }, output: analysis.TokenStream{ &analysis.Token{ Term: []byte("ragazz"), }, }, }, { input: analysis.TokenStream{ &analysis.Token{ Term: []byte("ragazzi"), }, }, output: analysis.TokenStream{ &analysis.Token{ Term: []byte("ragazz"), }, }, }, } cache := registry.NewCache() filter, err := cache.TokenFilterNamed(LightStemmerName) if err != nil { t.Fatal(err) } for _, test := range tests { actual := filter.Filter(test.input) if !reflect.DeepEqual(actual, test.output) { t.Errorf("expected %s, got %s", test.output[0].Term, actual[0].Term) } } }
func TestHindiAnalyzer(t *testing.T) { tests := []struct { input []byte output analysis.TokenStream }{ // two ways to write 'hindi' itself { input: []byte("हिन्दी"), output: analysis.TokenStream{ &analysis.Token{ Term: []byte("हिंद"), Position: 1, Start: 0, End: 18, }, }, }, { input: []byte("हिंदी"), output: analysis.TokenStream{ &analysis.Token{ Term: []byte("हिंद"), Position: 1, Start: 0, End: 15, }, }, }, } cache := registry.NewCache() analyzer, err := cache.AnalyzerNamed(AnalyzerName) if err != nil { t.Fatal(err) } for _, test := range tests { actual := analyzer.Analyze(test.input) if !reflect.DeepEqual(actual, test.output) { t.Errorf("expected %v, got %v", test.output, actual) } } }
func TestCJKAnalyzer(t *testing.T) { tests := []struct { input []byte output analysis.TokenStream }{ { input: []byte("こんにちは世界"), output: analysis.TokenStream{ &analysis.Token{ Term: []byte("こん"), Type: analysis.Double, Position: 1, Start: 0, End: 6, }, &analysis.Token{ Term: []byte("んに"), Type: analysis.Double, Position: 2, Start: 3, End: 9, }, &analysis.Token{ Term: []byte("にち"), Type: analysis.Double, Position: 3, Start: 6, End: 12, }, &analysis.Token{ Term: []byte("ちは"), Type: analysis.Double, Position: 4, Start: 9, End: 15, }, &analysis.Token{ Term: []byte("は世"), Type: analysis.Double, Position: 5, Start: 12, End: 18, }, &analysis.Token{ Term: []byte("世界"), Type: analysis.Double, Position: 6, Start: 15, End: 21, }, }, }, { input: []byte("一二三四五六七八九十"), output: analysis.TokenStream{ &analysis.Token{ Term: []byte("一二"), Type: analysis.Double, Position: 1, Start: 0, End: 6, }, &analysis.Token{ Term: []byte("二三"), Type: analysis.Double, Position: 2, Start: 3, End: 9, }, &analysis.Token{ Term: []byte("三四"), Type: analysis.Double, Position: 3, Start: 6, End: 12, }, &analysis.Token{ Term: []byte("四五"), Type: analysis.Double, Position: 4, Start: 9, End: 15, }, &analysis.Token{ Term: []byte("五六"), Type: analysis.Double, Position: 5, Start: 12, End: 18, }, &analysis.Token{ Term: []byte("六七"), Type: analysis.Double, Position: 6, Start: 15, End: 21, }, &analysis.Token{ Term: []byte("七八"), Type: analysis.Double, Position: 7, Start: 18, End: 24, }, &analysis.Token{ Term: []byte("八九"), Type: analysis.Double, Position: 8, Start: 21, End: 27, }, &analysis.Token{ Term: []byte("九十"), Type: analysis.Double, Position: 9, Start: 24, End: 30, }, }, }, { input: []byte("一 二三四 五六七八九 十"), output: analysis.TokenStream{ &analysis.Token{ Term: []byte("一"), Type: analysis.Single, Position: 1, Start: 0, End: 3, }, &analysis.Token{ Term: []byte("二三"), Type: analysis.Double, Position: 2, Start: 4, End: 10, }, &analysis.Token{ Term: []byte("三四"), Type: analysis.Double, Position: 3, Start: 7, End: 13, }, &analysis.Token{ Term: []byte("五六"), Type: analysis.Double, Position: 5, Start: 14, End: 20, }, &analysis.Token{ Term: []byte("六七"), Type: analysis.Double, Position: 6, Start: 17, End: 23, }, &analysis.Token{ Term: []byte("七八"), Type: analysis.Double, Position: 7, Start: 20, End: 26, }, &analysis.Token{ Term: []byte("八九"), Type: analysis.Double, Position: 8, Start: 23, End: 29, }, &analysis.Token{ Term: []byte("十"), Type: analysis.Single, Position: 10, Start: 30, End: 33, }, }, }, { input: []byte("abc defgh ijklmn opqrstu vwxy z"), output: analysis.TokenStream{ &analysis.Token{ Term: []byte("abc"), Type: analysis.AlphaNumeric, Position: 1, Start: 0, End: 3, }, &analysis.Token{ Term: []byte("defgh"), Type: analysis.AlphaNumeric, Position: 2, Start: 4, End: 9, }, &analysis.Token{ Term: []byte("ijklmn"), Type: analysis.AlphaNumeric, Position: 3, Start: 10, End: 16, }, &analysis.Token{ Term: []byte("opqrstu"), Type: analysis.AlphaNumeric, Position: 4, Start: 17, End: 24, }, &analysis.Token{ Term: []byte("vwxy"), Type: analysis.AlphaNumeric, Position: 5, Start: 25, End: 29, }, &analysis.Token{ Term: []byte("z"), Type: analysis.AlphaNumeric, Position: 6, Start: 30, End: 31, }, }, }, { input: []byte("あい"), output: analysis.TokenStream{ &analysis.Token{ Term: []byte("あい"), Type: analysis.Double, Position: 1, Start: 0, End: 6, }, }, }, { input: []byte("あい "), output: analysis.TokenStream{ &analysis.Token{ Term: []byte("あい"), Type: analysis.Double, Position: 1, Start: 0, End: 6, }, }, }, { input: []byte("test"), output: 
analysis.TokenStream{ &analysis.Token{ Term: []byte("test"), Type: analysis.AlphaNumeric, Position: 1, Start: 0, End: 4, }, }, }, { input: []byte("test "), output: analysis.TokenStream{ &analysis.Token{ Term: []byte("test"), Type: analysis.AlphaNumeric, Position: 1, Start: 0, End: 4, }, }, }, { input: []byte("あいtest"), output: analysis.TokenStream{ &analysis.Token{ Term: []byte("あい"), Type: analysis.Double, Position: 1, Start: 0, End: 6, }, &analysis.Token{ Term: []byte("test"), Type: analysis.AlphaNumeric, Position: 3, Start: 6, End: 10, }, }, }, { input: []byte("testあい "), output: analysis.TokenStream{ &analysis.Token{ Term: []byte("test"), Type: analysis.AlphaNumeric, Position: 1, Start: 0, End: 4, }, &analysis.Token{ Term: []byte("あい"), Type: analysis.Double, Position: 2, Start: 4, End: 10, }, }, }, { input: []byte("あいうえおabcかきくけこ"), output: analysis.TokenStream{ &analysis.Token{ Term: []byte("あい"), Type: analysis.Double, Position: 1, Start: 0, End: 6, }, &analysis.Token{ Term: []byte("いう"), Type: analysis.Double, Position: 2, Start: 3, End: 9, }, &analysis.Token{ Term: []byte("うえ"), Type: analysis.Double, Position: 3, Start: 6, End: 12, }, &analysis.Token{ Term: []byte("えお"), Type: analysis.Double, Position: 4, Start: 9, End: 15, }, &analysis.Token{ Term: []byte("abc"), Type: analysis.AlphaNumeric, Position: 6, Start: 15, End: 18, }, &analysis.Token{ Term: []byte("かき"), Type: analysis.Double, Position: 7, Start: 18, End: 24, }, &analysis.Token{ Term: []byte("きく"), Type: analysis.Double, Position: 8, Start: 21, End: 27, }, &analysis.Token{ Term: []byte("くけ"), Type: analysis.Double, Position: 9, Start: 24, End: 30, }, &analysis.Token{ Term: []byte("けこ"), Type: analysis.Double, Position: 10, Start: 27, End: 33, }, }, }, { input: []byte("あいうえおabんcかきくけ こ"), output: analysis.TokenStream{ &analysis.Token{ Term: []byte("あい"), Type: analysis.Double, Position: 1, Start: 0, End: 6, }, &analysis.Token{ Term: []byte("いう"), Type: analysis.Double, Position: 2, Start: 3, End: 9, }, &analysis.Token{ Term: []byte("うえ"), Type: analysis.Double, Position: 3, Start: 6, End: 12, }, &analysis.Token{ Term: []byte("えお"), Type: analysis.Double, Position: 4, Start: 9, End: 15, }, &analysis.Token{ Term: []byte("ab"), Type: analysis.AlphaNumeric, Position: 6, Start: 15, End: 17, }, &analysis.Token{ Term: []byte("ん"), Type: analysis.Single, Position: 7, Start: 17, End: 20, }, &analysis.Token{ Term: []byte("c"), Type: analysis.AlphaNumeric, Position: 8, Start: 20, End: 21, }, &analysis.Token{ Term: []byte("かき"), Type: analysis.Double, Position: 9, Start: 21, End: 27, }, &analysis.Token{ Term: []byte("きく"), Type: analysis.Double, Position: 10, Start: 24, End: 30, }, &analysis.Token{ Term: []byte("くけ"), Type: analysis.Double, Position: 11, Start: 27, End: 33, }, &analysis.Token{ Term: []byte("こ"), Type: analysis.Single, Position: 13, Start: 34, End: 37, }, }, }, { input: []byte("一 روبرت موير"), output: analysis.TokenStream{ &analysis.Token{ Term: []byte("一"), Type: analysis.Single, Position: 1, Start: 0, End: 3, }, &analysis.Token{ Term: []byte("روبرت"), Type: analysis.AlphaNumeric, Position: 2, Start: 4, End: 14, }, &analysis.Token{ Term: []byte("موير"), Type: analysis.AlphaNumeric, Position: 3, Start: 15, End: 23, }, }, }, { input: []byte("一 رُوبرت موير"), output: analysis.TokenStream{ &analysis.Token{ Term: []byte("一"), Type: analysis.Single, Position: 1, Start: 0, End: 3, }, &analysis.Token{ Term: []byte("رُوبرت"), Type: analysis.AlphaNumeric, Position: 2, Start: 4, End: 16, }, &analysis.Token{ Term: 
[]byte("موير"), Type: analysis.AlphaNumeric, Position: 3, Start: 17, End: 25, }, }, }, { input: []byte("𩬅艱鍟䇹愯瀛"), output: analysis.TokenStream{ &analysis.Token{ Term: []byte("𩬅艱"), Type: analysis.Double, Position: 1, Start: 0, End: 7, }, &analysis.Token{ Term: []byte("艱鍟"), Type: analysis.Double, Position: 2, Start: 4, End: 10, }, &analysis.Token{ Term: []byte("鍟䇹"), Type: analysis.Double, Position: 3, Start: 7, End: 13, }, &analysis.Token{ Term: []byte("䇹愯"), Type: analysis.Double, Position: 4, Start: 10, End: 16, }, &analysis.Token{ Term: []byte("愯瀛"), Type: analysis.Double, Position: 5, Start: 13, End: 19, }, }, }, { input: []byte("一"), output: analysis.TokenStream{ &analysis.Token{ Term: []byte("一"), Type: analysis.Single, Position: 1, Start: 0, End: 3, }, }, }, { input: []byte("一丁丂"), output: analysis.TokenStream{ &analysis.Token{ Term: []byte("一丁"), Type: analysis.Double, Position: 1, Start: 0, End: 6, }, &analysis.Token{ Term: []byte("丁丂"), Type: analysis.Double, Position: 2, Start: 3, End: 9, }, }, }, } cache := registry.NewCache() for _, test := range tests { analyzer, err := cache.AnalyzerNamed(AnalyzerName) if err != nil { t.Fatal(err) } actual := analyzer.Analyze(test.input) if !reflect.DeepEqual(actual, test.output) { t.Errorf("expected %v, got %v", test.output, actual) } } }
func TestPersianAnalyzerOthers(t *testing.T) { tests := []struct { input []byte output analysis.TokenStream }{ // nouns { input: []byte("برگ ها"), output: analysis.TokenStream{ &analysis.Token{ Term: []byte("برگ"), }, }, }, { input: []byte("برگها"), output: analysis.TokenStream{ &analysis.Token{ Term: []byte("برگ"), }, }, }, // non persian { input: []byte("English test."), output: analysis.TokenStream{ &analysis.Token{ Term: []byte("english"), }, &analysis.Token{ Term: []byte("test"), }, }, }, // others { input: []byte("خورده مي شده بوده باشد"), output: analysis.TokenStream{ &analysis.Token{ Term: []byte("خورده"), }, }, }, { input: []byte("برگها"), output: analysis.TokenStream{ &analysis.Token{ Term: []byte("برگ"), }, }, }, } cache := registry.NewCache() analyzer, err := cache.AnalyzerNamed(AnalyzerName) if err != nil { t.Fatal(err) } for _, test := range tests { actual := analyzer.Analyze(test.input) if len(actual) != len(test.output) { t.Fatalf("expected length: %d, got %d", len(test.output), len(actual)) } for i, tok := range actual { if !reflect.DeepEqual(tok.Term, test.output[i].Term) { t.Errorf("expected term %s (% x) got %s (% x)", test.output[i].Term, test.output[i].Term, tok.Term, tok.Term) } } } }
func TestPersianAnalyzerVerbsDefective(t *testing.T) { tests := []struct { input []byte output analysis.TokenStream }{ // active present indicative { input: []byte("مي خورد"), output: analysis.TokenStream{ &analysis.Token{ Term: []byte("خورد"), }, }, }, // active preterite indicative { input: []byte("خورد"), output: analysis.TokenStream{ &analysis.Token{ Term: []byte("خورد"), }, }, }, // active imperfective preterite indicative { input: []byte("مي خورد"), output: analysis.TokenStream{ &analysis.Token{ Term: []byte("خورد"), }, }, }, // active future indicative { input: []byte("خواهد خورد"), output: analysis.TokenStream{ &analysis.Token{ Term: []byte("خورد"), }, }, }, // active present progressive indicative { input: []byte("دارد مي خورد"), output: analysis.TokenStream{ &analysis.Token{ Term: []byte("خورد"), }, }, }, // active preterite progressive indicative { input: []byte("داشت مي خورد"), output: analysis.TokenStream{ &analysis.Token{ Term: []byte("خورد"), }, }, }, // active perfect indicative { input: []byte("خورده است"), output: analysis.TokenStream{ &analysis.Token{ Term: []byte("خورده"), }, }, }, // active imperfective perfect indicative { input: []byte("مي خورده است"), output: analysis.TokenStream{ &analysis.Token{ Term: []byte("خورده"), }, }, }, // active pluperfect indicative { input: []byte("خورده بود"), output: analysis.TokenStream{ &analysis.Token{ Term: []byte("خورده"), }, }, }, // active imperfective pluperfect indicative { input: []byte("مي خورده بود"), output: analysis.TokenStream{ &analysis.Token{ Term: []byte("خورده"), }, }, }, // active preterite subjunctive { input: []byte("خورده باشد"), output: analysis.TokenStream{ &analysis.Token{ Term: []byte("خورده"), }, }, }, // active imperfective preterite subjunctive { input: []byte("مي خورده باشد"), output: analysis.TokenStream{ &analysis.Token{ Term: []byte("خورده"), }, }, }, // active pluperfect subjunctive { input: []byte("خورده بوده باشد"), output: analysis.TokenStream{ &analysis.Token{ Term: []byte("خورده"), }, }, }, // active imperfective pluperfect subjunctive { input: []byte("مي خورده بوده باشد"), output: analysis.TokenStream{ &analysis.Token{ Term: []byte("خورده"), }, }, }, // passive present indicative { input: []byte("خورده مي شود"), output: analysis.TokenStream{ &analysis.Token{ Term: []byte("خورده"), }, }, }, // passive preterite indicative { input: []byte("خورده شد"), output: analysis.TokenStream{ &analysis.Token{ Term: []byte("خورده"), }, }, }, // passive imperfective preterite indicative { input: []byte("خورده مي شد"), output: analysis.TokenStream{ &analysis.Token{ Term: []byte("خورده"), }, }, }, // passive perfect indicative { input: []byte("خورده شده است"), output: analysis.TokenStream{ &analysis.Token{ Term: []byte("خورده"), }, }, }, // passive imperfective perfect indicative { input: []byte("خورده مي شده است"), output: analysis.TokenStream{ &analysis.Token{ Term: []byte("خورده"), }, }, }, // passive pluperfect indicative { input: []byte("خورده شده بود"), output: analysis.TokenStream{ &analysis.Token{ Term: []byte("خورده"), }, }, }, // passive imperfective pluperfect indicative { input: []byte("خورده مي شده بود"), output: analysis.TokenStream{ &analysis.Token{ Term: []byte("خورده"), }, }, }, // passive future indicative { input: []byte("خورده خواهد شد"), output: analysis.TokenStream{ &analysis.Token{ Term: []byte("خورده"), }, }, }, // passive present progressive indicative { input: []byte("دارد خورده مي شود"), output: analysis.TokenStream{ &analysis.Token{ Term: []byte("خورده"), }, }, }, // passive preterite 
progressive indicative { input: []byte("داشت خورده مي شد"), output: analysis.TokenStream{ &analysis.Token{ Term: []byte("خورده"), }, }, }, // passive present subjunctive { input: []byte("خورده شود"), output: analysis.TokenStream{ &analysis.Token{ Term: []byte("خورده"), }, }, }, // passive preterite subjunctive { input: []byte("خورده شده باشد"), output: analysis.TokenStream{ &analysis.Token{ Term: []byte("خورده"), }, }, }, // passive imperfective preterite subjunctive { input: []byte("خورده مي شده باشد"), output: analysis.TokenStream{ &analysis.Token{ Term: []byte("خورده"), }, }, }, // passive pluperfect subjunctive { input: []byte("خورده شده بوده باشد"), output: analysis.TokenStream{ &analysis.Token{ Term: []byte("خورده"), }, }, }, // passive imperfective pluperfect subjunctive { input: []byte("خورده مي شده بوده باشد"), output: analysis.TokenStream{ &analysis.Token{ Term: []byte("خورده"), }, }, }, // active present subjunctive { input: []byte("بخورد"), output: analysis.TokenStream{ &analysis.Token{ Term: []byte("بخورد"), }, }, }, } cache := registry.NewCache() analyzer, err := cache.AnalyzerNamed(AnalyzerName) if err != nil { t.Fatal(err) } for _, test := range tests { actual := analyzer.Analyze(test.input) if len(actual) != len(test.output) { t.Fatalf("expected length: %d, got %d", len(test.output), len(actual)) } for i, tok := range actual { if !reflect.DeepEqual(tok.Term, test.output[i].Term) { t.Errorf("expected term %s (% x) got %s (% x)", test.output[i].Term, test.output[i].Term, tok.Term, tok.Term) } } } }
func TestExceptionsTokenizer(t *testing.T) { tests := []struct { config map[string]interface{} input []byte patterns []string result analysis.TokenStream }{ { input: []byte("test http://blevesearch.com/ words"), config: map[string]interface{}{ "type": "exception", "tokenizer": "unicode", "exceptions": []interface{}{ `[hH][tT][tT][pP][sS]?://(\S)*`, `[fF][iI][lL][eE]://(\S)*`, `[fF][tT][pP]://(\S)*`, }, }, result: analysis.TokenStream{ &analysis.Token{ Term: []byte("test"), Position: 1, Start: 0, End: 4, }, &analysis.Token{ Term: []byte("http://blevesearch.com/"), Position: 2, Start: 5, End: 28, }, &analysis.Token{ Term: []byte("words"), Position: 3, Start: 29, End: 34, }, }, }, { input: []byte("what ftp://blevesearch.com/ songs"), config: map[string]interface{}{ "type": "exception", "tokenizer": "unicode", "exceptions": []interface{}{ `[hH][tT][tT][pP][sS]?://(\S)*`, `[fF][iI][lL][eE]://(\S)*`, `[fF][tT][pP]://(\S)*`, }, }, result: analysis.TokenStream{ &analysis.Token{ Term: []byte("what"), Position: 1, Start: 0, End: 4, }, &analysis.Token{ Term: []byte("ftp://blevesearch.com/"), Position: 2, Start: 5, End: 27, }, &analysis.Token{ Term: []byte("songs"), Position: 3, Start: 28, End: 33, }, }, }, { input: []byte("please email [email protected] the URL https://blevesearch.com/"), config: map[string]interface{}{ "type": "exception", "tokenizer": "unicode", "exceptions": []interface{}{ `[hH][tT][tT][pP][sS]?://(\S)*`, `[fF][iI][lL][eE]://(\S)*`, `[fF][tT][pP]://(\S)*`, `\S+@\S+`, }, }, result: analysis.TokenStream{ &analysis.Token{ Term: []byte("please"), Position: 1, Start: 0, End: 6, }, &analysis.Token{ Term: []byte("email"), Position: 2, Start: 7, End: 12, }, &analysis.Token{ Term: []byte("*****@*****.**"), Position: 3, Start: 13, End: 32, }, &analysis.Token{ Term: []byte("the"), Position: 4, Start: 33, End: 36, }, &analysis.Token{ Term: []byte("URL"), Position: 5, Start: 37, End: 40, }, &analysis.Token{ Term: []byte("https://blevesearch.com/"), Position: 6, Start: 41, End: 65, }, }, }, } // remaining := unicode.NewUnicodeTokenizer() for _, test := range tests { // build the requested exception tokenizer cache := registry.NewCache() tokenizer, err := cache.DefineTokenizer("custom", test.config) if err != nil { t.Fatal(err) } // pattern := strings.Join(test.patterns, "|") // r, err := regexp.Compile(pattern) // if err != nil { // t.Fatal(err) // } // tokenizer := NewExceptionsTokenizer(r, remaining) actual := tokenizer.Tokenize(test.input) if !reflect.DeepEqual(actual, test.result) { t.Errorf("expected %v, got %v", test.result, actual) } } }
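// The table above configures the "exception" tokenizer through the registry:
// each entry in "exceptions" is a regular expression whose matches are emitted
// as single tokens, while the remaining text is handed to the wrapped "unicode"
// tokenizer. A minimal sketch of the same configuration outside a test table;
// the "url_aware" name and the single URL pattern are illustrative assumptions.
func buildURLAwareTokenizer(cache *registry.Cache) (analysis.Tokenizer, error) {
	return cache.DefineTokenizer("url_aware", map[string]interface{}{
		"type":      "exception",
		"tokenizer": "unicode",
		"exceptions": []interface{}{
			`[hH][tT][tT][pP][sS]?://(\S)*`,
		},
	})
}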
// UnmarshalJSON deserializes a JSON representation of the IndexMapping func (im *IndexMapping) UnmarshalJSON(data []byte) error { var tmp struct { TypeMapping map[string]*DocumentMapping `json:"types"` DefaultMapping *DocumentMapping `json:"default_mapping"` TypeField string `json:"type_field"` DefaultType string `json:"default_type"` DefaultAnalyzer string `json:"default_analyzer"` DefaultDateTimeParser string `json:"default_datetime_parser"` DefaultField string `json:"default_field"` ByteArrayConverter string `json:"byte_array_converter"` CustomAnalysis *customAnalysis `json:"analysis"` } err := json.Unmarshal(data, &tmp) if err != nil { return err } im.cache = registry.NewCache() im.CustomAnalysis = newCustomAnalysis() if tmp.CustomAnalysis != nil { if tmp.CustomAnalysis.CharFilters != nil { im.CustomAnalysis.CharFilters = tmp.CustomAnalysis.CharFilters } if tmp.CustomAnalysis.Tokenizers != nil { im.CustomAnalysis.Tokenizers = tmp.CustomAnalysis.Tokenizers } if tmp.CustomAnalysis.TokenMaps != nil { im.CustomAnalysis.TokenMaps = tmp.CustomAnalysis.TokenMaps } if tmp.CustomAnalysis.TokenFilters != nil { im.CustomAnalysis.TokenFilters = tmp.CustomAnalysis.TokenFilters } if tmp.CustomAnalysis.Analyzers != nil { im.CustomAnalysis.Analyzers = tmp.CustomAnalysis.Analyzers } if tmp.CustomAnalysis.DateTimeParsers != nil { im.CustomAnalysis.DateTimeParsers = tmp.CustomAnalysis.DateTimeParsers } } im.TypeField = defaultTypeField if tmp.TypeField != "" { im.TypeField = tmp.TypeField } im.DefaultType = defaultType if tmp.DefaultType != "" { im.DefaultType = tmp.DefaultType } im.DefaultAnalyzer = defaultAnalyzer if tmp.DefaultAnalyzer != "" { im.DefaultAnalyzer = tmp.DefaultAnalyzer } im.DefaultDateTimeParser = defaultDateTimeParser if tmp.DefaultDateTimeParser != "" { im.DefaultDateTimeParser = tmp.DefaultDateTimeParser } im.DefaultField = defaultField if tmp.DefaultField != "" { im.DefaultField = tmp.DefaultField } im.ByteArrayConverter = defaultByteArrayConverter if tmp.ByteArrayConverter != "" { im.ByteArrayConverter = tmp.ByteArrayConverter } im.DefaultMapping = NewDocumentMapping() if tmp.DefaultMapping != nil { im.DefaultMapping = tmp.DefaultMapping } im.TypeMapping = make(map[string]*DocumentMapping, len(tmp.TypeMapping)) for typeName, typeDocMapping := range tmp.TypeMapping { im.TypeMapping[typeName] = typeDocMapping } err = im.CustomAnalysis.registerAll(im) if err != nil { return err } return nil }
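// UnmarshalJSON above accepts the JSON keys declared in the struct tags
// ("types", "default_mapping", "type_field", "default_analyzer", ...) and falls
// back to the package defaults for anything omitted. A minimal sketch of
// loading a mapping from JSON; the literal document is illustrative and assumes
// the "standard" analyzer name used elsewhere in this code:
func exampleMappingFromJSON() (*IndexMapping, error) {
	data := []byte(`{"type_field": "kind", "default_analyzer": "standard"}`)
	im := NewIndexMapping()
	if err := im.UnmarshalJSON(data); err != nil {
		return nil, err
	}
	return im, nil
}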
func TestEnglishAnalyzer(t *testing.T) { tests := []struct { input []byte output analysis.TokenStream }{ // stemming { input: []byte("books"), output: analysis.TokenStream{ &analysis.Token{ Term: []byte("book"), Position: 1, Start: 0, End: 5, }, }, }, { input: []byte("book"), output: analysis.TokenStream{ &analysis.Token{ Term: []byte("book"), Position: 1, Start: 0, End: 4, }, }, }, // stop word removal { input: []byte("the"), output: analysis.TokenStream{}, }, // possessive removal { input: []byte("steven's"), output: analysis.TokenStream{ &analysis.Token{ Term: []byte("steven"), Position: 1, Start: 0, End: 8, }, }, }, { input: []byte("steven\u2019s"), output: analysis.TokenStream{ &analysis.Token{ Term: []byte("steven"), Position: 1, Start: 0, End: 10, }, }, }, { input: []byte("steven\uFF07s"), output: analysis.TokenStream{ &analysis.Token{ Term: []byte("steven"), Position: 1, Start: 0, End: 10, }, }, }, } cache := registry.NewCache() analyzer, err := cache.AnalyzerNamed(AnalyzerName) if err != nil { t.Fatal(err) } for _, test := range tests { actual := analyzer.Analyze(test.input) if !reflect.DeepEqual(actual, test.output) { t.Errorf("expected %v, got %v", test.output, actual) } } }
func TestFrenchAnalyzer(t *testing.T) { tests := []struct { input []byte output analysis.TokenStream }{ { input: []byte(""), output: analysis.TokenStream{}, }, { input: []byte("chien chat cheval"), output: analysis.TokenStream{ &analysis.Token{ Term: []byte("chien"), }, &analysis.Token{ Term: []byte("chat"), }, &analysis.Token{ Term: []byte("cheval"), }, }, }, { input: []byte("chien CHAT CHEVAL"), output: analysis.TokenStream{ &analysis.Token{ Term: []byte("chien"), }, &analysis.Token{ Term: []byte("chat"), }, &analysis.Token{ Term: []byte("cheval"), }, }, }, { input: []byte(" chien ,? + = - CHAT /: > CHEVAL"), output: analysis.TokenStream{ &analysis.Token{ Term: []byte("chien"), }, &analysis.Token{ Term: []byte("chat"), }, &analysis.Token{ Term: []byte("cheval"), }, }, }, { input: []byte("chien++"), output: analysis.TokenStream{ &analysis.Token{ Term: []byte("chien"), }, }, }, { input: []byte("mot \"entreguillemet\""), output: analysis.TokenStream{ &analysis.Token{ Term: []byte("mot"), }, &analysis.Token{ Term: []byte("entreguilemet"), }, }, }, { input: []byte("Jean-François"), output: analysis.TokenStream{ &analysis.Token{ Term: []byte("jean"), }, &analysis.Token{ Term: []byte("francoi"), }, }, }, // stop words { input: []byte("le la chien les aux chat du des à cheval"), output: analysis.TokenStream{ &analysis.Token{ Term: []byte("chien"), }, &analysis.Token{ Term: []byte("chat"), }, &analysis.Token{ Term: []byte("cheval"), }, }, }, // nouns and adjectives { input: []byte("lances chismes habitable chiste éléments captifs"), output: analysis.TokenStream{ &analysis.Token{ Term: []byte("lanc"), }, &analysis.Token{ Term: []byte("chism"), }, &analysis.Token{ Term: []byte("habitabl"), }, &analysis.Token{ Term: []byte("chist"), }, &analysis.Token{ Term: []byte("element"), }, &analysis.Token{ Term: []byte("captif"), }, }, }, // verbs { input: []byte("finissions souffrirent rugissante"), output: analysis.TokenStream{ &analysis.Token{ Term: []byte("finision"), }, &analysis.Token{ Term: []byte("soufrirent"), }, &analysis.Token{ Term: []byte("rugisant"), }, }, }, { input: []byte("C3PO aujourd'hui oeuf ïâöûàä anticonstitutionnellement Java++ "), output: analysis.TokenStream{ &analysis.Token{ Term: []byte("c3po"), }, &analysis.Token{ Term: []byte("aujourd'hui"), }, &analysis.Token{ Term: []byte("oeuf"), }, &analysis.Token{ Term: []byte("ïaöuaä"), }, &analysis.Token{ Term: []byte("anticonstitutionel"), }, &analysis.Token{ Term: []byte("java"), }, }, }, { input: []byte("propriétaire"), output: analysis.TokenStream{ &analysis.Token{ Term: []byte("proprietair"), }, }, }, } cache := registry.NewCache() analyzer, err := cache.AnalyzerNamed(AnalyzerName) if err != nil { t.Fatal(err) } for _, test := range tests { actual := analyzer.Analyze(test.input) if len(actual) != len(test.output) { t.Fatalf("expected length: %d, got %d", len(test.output), len(actual)) } for i, tok := range actual { if !reflect.DeepEqual(tok.Term, test.output[i].Term) { t.Errorf("expected term %s (% x) got %s (% x)", test.output[i].Term, test.output[i].Term, tok.Term, tok.Term) } } } }
func TestWeb(t *testing.T) { tests := []struct { input []byte output analysis.TokenStream }{ { []byte("Hello [email protected]"), analysis.TokenStream{ { Start: 0, End: 5, Term: []byte("Hello"), Position: 1, Type: analysis.AlphaNumeric, }, { Start: 6, End: 26, Term: []byte("*****@*****.**"), Position: 2, Type: analysis.AlphaNumeric, }, }, }, { []byte("That http://blevesearch.com"), analysis.TokenStream{ { Start: 0, End: 4, Term: []byte("That"), Position: 1, Type: analysis.AlphaNumeric, }, { Start: 5, End: 27, Term: []byte("http://blevesearch.com"), Position: 2, Type: analysis.AlphaNumeric, }, }, }, { []byte("Hey @blevesearch"), analysis.TokenStream{ { Start: 0, End: 3, Term: []byte("Hey"), Position: 1, Type: analysis.AlphaNumeric, }, { Start: 4, End: 16, Term: []byte("@blevesearch"), Position: 2, Type: analysis.AlphaNumeric, }, }, }, { []byte("This #bleve"), analysis.TokenStream{ { Start: 0, End: 4, Term: []byte("This"), Position: 1, Type: analysis.AlphaNumeric, }, { Start: 5, End: 11, Term: []byte("#bleve"), Position: 2, Type: analysis.AlphaNumeric, }, }, }, { []byte("What about @blevesearch?"), analysis.TokenStream{ { Start: 0, End: 4, Term: []byte("What"), Position: 1, Type: analysis.AlphaNumeric, }, { Start: 5, End: 10, Term: []byte("about"), Position: 2, Type: analysis.AlphaNumeric, }, { Start: 11, End: 23, Term: []byte("@blevesearch"), Position: 3, Type: analysis.AlphaNumeric, }, }, }, } cache := registry.NewCache() tokenizer, err := cache.TokenizerNamed(Name) if err != nil { t.Fatal(err) } for _, test := range tests { actual := tokenizer.Tokenize(test.input) if !reflect.DeepEqual(actual, test.output) { t.Errorf("Expected %v, got %v for %s", test.output, actual, string(test.input)) } } }
func TestArabicAnalyzer(t *testing.T) { tests := []struct { input []byte output analysis.TokenStream }{ { input: []byte("كبير"), output: analysis.TokenStream{ &analysis.Token{ Term: []byte("كبير"), Position: 1, Start: 0, End: 8, }, }, }, // feminine marker { input: []byte("كبيرة"), output: analysis.TokenStream{ &analysis.Token{ Term: []byte("كبير"), Position: 1, Start: 0, End: 10, }, }, }, { input: []byte("مشروب"), output: analysis.TokenStream{ &analysis.Token{ Term: []byte("مشروب"), Position: 1, Start: 0, End: 10, }, }, }, // plural -at { input: []byte("مشروبات"), output: analysis.TokenStream{ &analysis.Token{ Term: []byte("مشروب"), Position: 1, Start: 0, End: 14, }, }, }, // plural -in { input: []byte("أمريكيين"), output: analysis.TokenStream{ &analysis.Token{ Term: []byte("امريك"), Position: 1, Start: 0, End: 16, }, }, }, // singular with bare alif { input: []byte("امريكي"), output: analysis.TokenStream{ &analysis.Token{ Term: []byte("امريك"), Position: 1, Start: 0, End: 12, }, }, }, { input: []byte("كتاب"), output: analysis.TokenStream{ &analysis.Token{ Term: []byte("كتاب"), Position: 1, Start: 0, End: 8, }, }, }, // definite article { input: []byte("الكتاب"), output: analysis.TokenStream{ &analysis.Token{ Term: []byte("كتاب"), Position: 1, Start: 0, End: 12, }, }, }, { input: []byte("ما ملكت أيمانكم"), output: analysis.TokenStream{ &analysis.Token{ Term: []byte("ملكت"), Position: 2, Start: 5, End: 13, }, &analysis.Token{ Term: []byte("ايمانكم"), Position: 3, Start: 14, End: 28, }, }, }, // stopwords { input: []byte("الذين ملكت أيمانكم"), output: analysis.TokenStream{ &analysis.Token{ Term: []byte("ملكت"), Position: 2, Start: 11, End: 19, }, &analysis.Token{ Term: []byte("ايمانكم"), Position: 3, Start: 20, End: 34, }, }, }, // presentation form normalization { input: []byte("ﺍﻟﺴﻼﻢ"), output: analysis.TokenStream{ &analysis.Token{ Term: []byte("سلام"), Position: 1, Start: 0, End: 15, }, }, }, } cache := registry.NewCache() analyzer, err := cache.AnalyzerNamed(AnalyzerName) if err != nil { t.Fatal(err) } for _, test := range tests { actual := analyzer.Analyze(test.input) if !reflect.DeepEqual(actual, test.output) { t.Errorf("expected %v, got %v", test.output, actual) t.Errorf("expected % x, got % x", test.output[0].Term, actual[0].Term) } } }
func TestItalianAnalyzer(t *testing.T) { tests := []struct { input []byte output analysis.TokenStream }{ // stemming { input: []byte("abbandonata"), output: analysis.TokenStream{ &analysis.Token{ Term: []byte("abbandonat"), }, }, }, { input: []byte("abbandonati"), output: analysis.TokenStream{ &analysis.Token{ Term: []byte("abbandonat"), }, }, }, // stop word { input: []byte("dallo"), output: analysis.TokenStream{}, }, // contractions { input: []byte("dell'Italia"), output: analysis.TokenStream{ &analysis.Token{ Term: []byte("ital"), }, }, }, { input: []byte("l'Italiano"), output: analysis.TokenStream{ &analysis.Token{ Term: []byte("italian"), }, }, }, // test for bug #218 { input: []byte("Nell'anfora"), output: analysis.TokenStream{ &analysis.Token{ Term: []byte("anfor"), }, }, }, } cache := registry.NewCache() analyzer, err := cache.AnalyzerNamed(AnalyzerName) if err != nil { t.Fatal(err) } for _, test := range tests { actual := analyzer.Analyze(test.input) if len(actual) != len(test.output) { t.Fatalf("expected length: %d, got %d", len(test.output), len(actual)) } for i, tok := range actual { if !reflect.DeepEqual(tok.Term, test.output[i].Term) { t.Errorf("expected term %s (% x) got %s (% x)", test.output[i].Term, test.output[i].Term, tok.Term, tok.Term) } } } }
func TestPortugueseLightStemmer(t *testing.T) { tests := []struct { input analysis.TokenStream output analysis.TokenStream }{ { input: analysis.TokenStream{ &analysis.Token{ Term: []byte("doutores"), }, }, output: analysis.TokenStream{ &analysis.Token{ Term: []byte("doutor"), }, }, }, { input: analysis.TokenStream{ &analysis.Token{ Term: []byte("doutor"), }, }, output: analysis.TokenStream{ &analysis.Token{ Term: []byte("doutor"), }, }, }, { input: analysis.TokenStream{ &analysis.Token{ Term: []byte("homens"), }, }, output: analysis.TokenStream{ &analysis.Token{ Term: []byte("homem"), }, }, }, { input: analysis.TokenStream{ &analysis.Token{ Term: []byte("homem"), }, }, output: analysis.TokenStream{ &analysis.Token{ Term: []byte("homem"), }, }, }, { input: analysis.TokenStream{ &analysis.Token{ Term: []byte("papéis"), }, }, output: analysis.TokenStream{ &analysis.Token{ Term: []byte("papel"), }, }, }, { input: analysis.TokenStream{ &analysis.Token{ Term: []byte("papel"), }, }, output: analysis.TokenStream{ &analysis.Token{ Term: []byte("papel"), }, }, }, { input: analysis.TokenStream{ &analysis.Token{ Term: []byte("normais"), }, }, output: analysis.TokenStream{ &analysis.Token{ Term: []byte("normal"), }, }, }, { input: analysis.TokenStream{ &analysis.Token{ Term: []byte("normal"), }, }, output: analysis.TokenStream{ &analysis.Token{ Term: []byte("normal"), }, }, }, { input: analysis.TokenStream{ &analysis.Token{ Term: []byte("lencóis"), }, }, output: analysis.TokenStream{ &analysis.Token{ Term: []byte("lencol"), }, }, }, { input: analysis.TokenStream{ &analysis.Token{ Term: []byte("lencol"), }, }, output: analysis.TokenStream{ &analysis.Token{ Term: []byte("lencol"), }, }, }, { input: analysis.TokenStream{ &analysis.Token{ Term: []byte("barris"), }, }, output: analysis.TokenStream{ &analysis.Token{ Term: []byte("barril"), }, }, }, { input: analysis.TokenStream{ &analysis.Token{ Term: []byte("barril"), }, }, output: analysis.TokenStream{ &analysis.Token{ Term: []byte("barril"), }, }, }, { input: analysis.TokenStream{ &analysis.Token{ Term: []byte("botões"), }, }, output: analysis.TokenStream{ &analysis.Token{ Term: []byte("bota"), }, }, }, { input: analysis.TokenStream{ &analysis.Token{ Term: []byte("botão"), }, }, output: analysis.TokenStream{ &analysis.Token{ Term: []byte("bota"), }, }, }, // longer { input: analysis.TokenStream{ &analysis.Token{ Term: []byte("o"), }, &analysis.Token{ Term: []byte("debate"), }, &analysis.Token{ Term: []byte("político"), }, &analysis.Token{ Term: []byte("pelo"), }, &analysis.Token{ Term: []byte("menos"), }, &analysis.Token{ Term: []byte("o"), }, &analysis.Token{ Term: []byte("que"), }, &analysis.Token{ Term: []byte("vem"), }, &analysis.Token{ Term: []byte("a"), }, &analysis.Token{ Term: []byte("público"), }, &analysis.Token{ Term: []byte("parece"), }, &analysis.Token{ Term: []byte("de"), }, &analysis.Token{ Term: []byte("modo"), }, &analysis.Token{ Term: []byte("nada"), }, &analysis.Token{ Term: []byte("surpreendente"), }, &analysis.Token{ Term: []byte("restrito"), }, &analysis.Token{ Term: []byte("a"), }, &analysis.Token{ Term: []byte("temas"), }, &analysis.Token{ Term: []byte("menores"), }, &analysis.Token{ Term: []byte("mas"), }, &analysis.Token{ Term: []byte("há"), }, &analysis.Token{ Term: []byte("evidentemente"), }, &analysis.Token{ Term: []byte("grandes"), }, &analysis.Token{ Term: []byte("questões"), }, &analysis.Token{ Term: []byte("em"), }, &analysis.Token{ Term: []byte("jogo"), }, &analysis.Token{ Term: []byte("nas"), }, &analysis.Token{ Term: 
[]byte("eleições"), }, &analysis.Token{ Term: []byte("que"), }, &analysis.Token{ Term: []byte("se"), }, &analysis.Token{ Term: []byte("aproximam"), }, }, output: analysis.TokenStream{ &analysis.Token{ Term: []byte("o"), }, &analysis.Token{ Term: []byte("debat"), }, &analysis.Token{ Term: []byte("politic"), }, &analysis.Token{ Term: []byte("pelo"), }, &analysis.Token{ Term: []byte("meno"), }, &analysis.Token{ Term: []byte("o"), }, &analysis.Token{ Term: []byte("que"), }, &analysis.Token{ Term: []byte("vem"), }, &analysis.Token{ Term: []byte("a"), }, &analysis.Token{ Term: []byte("public"), }, &analysis.Token{ Term: []byte("parec"), }, &analysis.Token{ Term: []byte("de"), }, &analysis.Token{ Term: []byte("modo"), }, &analysis.Token{ Term: []byte("nada"), }, &analysis.Token{ Term: []byte("surpreendent"), }, &analysis.Token{ Term: []byte("restrit"), }, &analysis.Token{ Term: []byte("a"), }, &analysis.Token{ Term: []byte("tema"), }, &analysis.Token{ Term: []byte("menor"), }, &analysis.Token{ Term: []byte("mas"), }, &analysis.Token{ Term: []byte("há"), }, &analysis.Token{ Term: []byte("evident"), }, &analysis.Token{ Term: []byte("grand"), }, &analysis.Token{ Term: []byte("questa"), }, &analysis.Token{ Term: []byte("em"), }, &analysis.Token{ Term: []byte("jogo"), }, &analysis.Token{ Term: []byte("nas"), }, &analysis.Token{ Term: []byte("eleica"), }, &analysis.Token{ Term: []byte("que"), }, &analysis.Token{ Term: []byte("se"), }, &analysis.Token{ Term: []byte("aproximam"), }, }, }, } cache := registry.NewCache() filter, err := cache.TokenFilterNamed(LightStemmerName) if err != nil { t.Fatal(err) } for _, test := range tests { actual := filter.Filter(test.input) if !reflect.DeepEqual(actual, test.output) { t.Errorf("expected %s, got %s", test.output[0].Term, actual[0].Term) } } }
func TestEnglishPossessiveFilter(t *testing.T) { tests := []struct { input analysis.TokenStream output analysis.TokenStream }{ { input: analysis.TokenStream{ &analysis.Token{ Term: []byte("marty's"), }, &analysis.Token{ Term: []byte("MARTY'S"), }, &analysis.Token{ Term: []byte("marty’s"), }, &analysis.Token{ Term: []byte("MARTY’S"), }, &analysis.Token{ Term: []byte("marty's"), }, &analysis.Token{ Term: []byte("MARTY'S"), }, &analysis.Token{ Term: []byte("m"), }, }, output: analysis.TokenStream{ &analysis.Token{ Term: []byte("marty"), }, &analysis.Token{ Term: []byte("MARTY"), }, &analysis.Token{ Term: []byte("marty"), }, &analysis.Token{ Term: []byte("MARTY"), }, &analysis.Token{ Term: []byte("marty"), }, &analysis.Token{ Term: []byte("MARTY"), }, &analysis.Token{ Term: []byte("m"), }, }, }, } cache := registry.NewCache() stemmerFilter, err := cache.TokenFilterNamed(PossessiveName) if err != nil { t.Fatal(err) } for _, test := range tests { actual := stemmerFilter.Filter(test.input) if !reflect.DeepEqual(actual, test.output) { t.Errorf("expected %s, got %s", test.output, actual) } } }
func newConfiguration() *configuration {
	return &configuration{
		Cache:         registry.NewCache(),
		analysisQueue: index.NewAnalysisQueue(4),
	}
}
func TestFrenchLightStemmer(t *testing.T) { tests := []struct { input analysis.TokenStream output analysis.TokenStream }{ { input: analysis.TokenStream{ &analysis.Token{ Term: []byte("chevaux"), }, }, output: analysis.TokenStream{ &analysis.Token{ Term: []byte("cheval"), }, }, }, { input: analysis.TokenStream{ &analysis.Token{ Term: []byte("cheval"), }, }, output: analysis.TokenStream{ &analysis.Token{ Term: []byte("cheval"), }, }, }, { input: analysis.TokenStream{ &analysis.Token{ Term: []byte("hiboux"), }, }, output: analysis.TokenStream{ &analysis.Token{ Term: []byte("hibou"), }, }, }, { input: analysis.TokenStream{ &analysis.Token{ Term: []byte("hibou"), }, }, output: analysis.TokenStream{ &analysis.Token{ Term: []byte("hibou"), }, }, }, { input: analysis.TokenStream{ &analysis.Token{ Term: []byte("chantés"), }, }, output: analysis.TokenStream{ &analysis.Token{ Term: []byte("chant"), }, }, }, { input: analysis.TokenStream{ &analysis.Token{ Term: []byte("chanter"), }, }, output: analysis.TokenStream{ &analysis.Token{ Term: []byte("chant"), }, }, }, { input: analysis.TokenStream{ &analysis.Token{ Term: []byte("chante"), }, }, output: analysis.TokenStream{ &analysis.Token{ Term: []byte("chant"), }, }, }, { input: analysis.TokenStream{ &analysis.Token{ Term: []byte("chant"), }, }, output: analysis.TokenStream{ &analysis.Token{ Term: []byte("chant"), }, }, }, { input: analysis.TokenStream{ &analysis.Token{ Term: []byte("baronnes"), }, }, output: analysis.TokenStream{ &analysis.Token{ Term: []byte("baron"), }, }, }, { input: analysis.TokenStream{ &analysis.Token{ Term: []byte("barons"), }, }, output: analysis.TokenStream{ &analysis.Token{ Term: []byte("baron"), }, }, }, { input: analysis.TokenStream{ &analysis.Token{ Term: []byte("baron"), }, }, output: analysis.TokenStream{ &analysis.Token{ Term: []byte("baron"), }, }, }, { input: analysis.TokenStream{ &analysis.Token{ Term: []byte("peaux"), }, }, output: analysis.TokenStream{ &analysis.Token{ Term: []byte("peau"), }, }, }, { input: analysis.TokenStream{ &analysis.Token{ Term: []byte("peau"), }, }, output: analysis.TokenStream{ &analysis.Token{ Term: []byte("peau"), }, }, }, { input: analysis.TokenStream{ &analysis.Token{ Term: []byte("anneaux"), }, }, output: analysis.TokenStream{ &analysis.Token{ Term: []byte("aneau"), }, }, }, { input: analysis.TokenStream{ &analysis.Token{ Term: []byte("anneau"), }, }, output: analysis.TokenStream{ &analysis.Token{ Term: []byte("aneau"), }, }, }, { input: analysis.TokenStream{ &analysis.Token{ Term: []byte("neveux"), }, }, output: analysis.TokenStream{ &analysis.Token{ Term: []byte("neveu"), }, }, }, { input: analysis.TokenStream{ &analysis.Token{ Term: []byte("neveu"), }, }, output: analysis.TokenStream{ &analysis.Token{ Term: []byte("neveu"), }, }, }, { input: analysis.TokenStream{ &analysis.Token{ Term: []byte("affreux"), }, }, output: analysis.TokenStream{ &analysis.Token{ Term: []byte("afreu"), }, }, }, { input: analysis.TokenStream{ &analysis.Token{ Term: []byte("affreuse"), }, }, output: analysis.TokenStream{ &analysis.Token{ Term: []byte("afreu"), }, }, }, { input: analysis.TokenStream{ &analysis.Token{ Term: []byte("investissement"), }, }, output: analysis.TokenStream{ &analysis.Token{ Term: []byte("investi"), }, }, }, { input: analysis.TokenStream{ &analysis.Token{ Term: []byte("investir"), }, }, output: analysis.TokenStream{ &analysis.Token{ Term: []byte("investi"), }, }, }, { input: analysis.TokenStream{ &analysis.Token{ Term: []byte("assourdissant"), }, }, output: analysis.TokenStream{ 
&analysis.Token{ Term: []byte("asourdi"), }, }, }, { input: analysis.TokenStream{ &analysis.Token{ Term: []byte("assourdir"), }, }, output: analysis.TokenStream{ &analysis.Token{ Term: []byte("asourdi"), }, }, }, { input: analysis.TokenStream{ &analysis.Token{ Term: []byte("pratiquement"), }, }, output: analysis.TokenStream{ &analysis.Token{ Term: []byte("pratiqu"), }, }, }, { input: analysis.TokenStream{ &analysis.Token{ Term: []byte("pratique"), }, }, output: analysis.TokenStream{ &analysis.Token{ Term: []byte("pratiqu"), }, }, }, { input: analysis.TokenStream{ &analysis.Token{ Term: []byte("administrativement"), }, }, output: analysis.TokenStream{ &analysis.Token{ Term: []byte("administratif"), }, }, }, { input: analysis.TokenStream{ &analysis.Token{ Term: []byte("administratif"), }, }, output: analysis.TokenStream{ &analysis.Token{ Term: []byte("administratif"), }, }, }, { input: analysis.TokenStream{ &analysis.Token{ Term: []byte("justificatrice"), }, }, output: analysis.TokenStream{ &analysis.Token{ Term: []byte("justifi"), }, }, }, { input: analysis.TokenStream{ &analysis.Token{ Term: []byte("justificateur"), }, }, output: analysis.TokenStream{ &analysis.Token{ Term: []byte("justifi"), }, }, }, { input: analysis.TokenStream{ &analysis.Token{ Term: []byte("justifier"), }, }, output: analysis.TokenStream{ &analysis.Token{ Term: []byte("justifi"), }, }, }, { input: analysis.TokenStream{ &analysis.Token{ Term: []byte("educatrice"), }, }, output: analysis.TokenStream{ &analysis.Token{ Term: []byte("eduqu"), }, }, }, { input: analysis.TokenStream{ &analysis.Token{ Term: []byte("eduquer"), }, }, output: analysis.TokenStream{ &analysis.Token{ Term: []byte("eduqu"), }, }, }, { input: analysis.TokenStream{ &analysis.Token{ Term: []byte("communicateur"), }, }, output: analysis.TokenStream{ &analysis.Token{ Term: []byte("comuniqu"), }, }, }, { input: analysis.TokenStream{ &analysis.Token{ Term: []byte("communiquer"), }, }, output: analysis.TokenStream{ &analysis.Token{ Term: []byte("comuniqu"), }, }, }, { input: analysis.TokenStream{ &analysis.Token{ Term: []byte("accompagnatrice"), }, }, output: analysis.TokenStream{ &analysis.Token{ Term: []byte("acompagn"), }, }, }, { input: analysis.TokenStream{ &analysis.Token{ Term: []byte("accompagnateur"), }, }, output: analysis.TokenStream{ &analysis.Token{ Term: []byte("acompagn"), }, }, }, { input: analysis.TokenStream{ &analysis.Token{ Term: []byte("administrateur"), }, }, output: analysis.TokenStream{ &analysis.Token{ Term: []byte("administr"), }, }, }, { input: analysis.TokenStream{ &analysis.Token{ Term: []byte("administrer"), }, }, output: analysis.TokenStream{ &analysis.Token{ Term: []byte("administr"), }, }, }, { input: analysis.TokenStream{ &analysis.Token{ Term: []byte("productrice"), }, }, output: analysis.TokenStream{ &analysis.Token{ Term: []byte("product"), }, }, }, { input: analysis.TokenStream{ &analysis.Token{ Term: []byte("producteur"), }, }, output: analysis.TokenStream{ &analysis.Token{ Term: []byte("product"), }, }, }, { input: analysis.TokenStream{ &analysis.Token{ Term: []byte("acheteuse"), }, }, output: analysis.TokenStream{ &analysis.Token{ Term: []byte("achet"), }, }, }, { input: analysis.TokenStream{ &analysis.Token{ Term: []byte("acheteur"), }, }, output: analysis.TokenStream{ &analysis.Token{ Term: []byte("achet"), }, }, }, { input: analysis.TokenStream{ &analysis.Token{ Term: []byte("planteur"), }, }, output: analysis.TokenStream{ &analysis.Token{ Term: []byte("plant"), }, }, }, { input: analysis.TokenStream{ 
&analysis.Token{ Term: []byte("plante"), }, }, output: analysis.TokenStream{ &analysis.Token{ Term: []byte("plant"), }, }, }, { input: analysis.TokenStream{ &analysis.Token{ Term: []byte("poreuse"), }, }, output: analysis.TokenStream{ &analysis.Token{ Term: []byte("poreu"), }, }, }, { input: analysis.TokenStream{ &analysis.Token{ Term: []byte("poreux"), }, }, output: analysis.TokenStream{ &analysis.Token{ Term: []byte("poreu"), }, }, }, { input: analysis.TokenStream{ &analysis.Token{ Term: []byte("plieuse"), }, }, output: analysis.TokenStream{ &analysis.Token{ Term: []byte("plieu"), }, }, }, { input: analysis.TokenStream{ &analysis.Token{ Term: []byte("bijoutière"), }, }, output: analysis.TokenStream{ &analysis.Token{ Term: []byte("bijouti"), }, }, }, { input: analysis.TokenStream{ &analysis.Token{ Term: []byte("bijoutier"), }, }, output: analysis.TokenStream{ &analysis.Token{ Term: []byte("bijouti"), }, }, }, { input: analysis.TokenStream{ &analysis.Token{ Term: []byte("caissière"), }, }, output: analysis.TokenStream{ &analysis.Token{ Term: []byte("caisi"), }, }, }, { input: analysis.TokenStream{ &analysis.Token{ Term: []byte("caissier"), }, }, output: analysis.TokenStream{ &analysis.Token{ Term: []byte("caisi"), }, }, }, { input: analysis.TokenStream{ &analysis.Token{ Term: []byte("abrasive"), }, }, output: analysis.TokenStream{ &analysis.Token{ Term: []byte("abrasif"), }, }, }, { input: analysis.TokenStream{ &analysis.Token{ Term: []byte("abrasif"), }, }, output: analysis.TokenStream{ &analysis.Token{ Term: []byte("abrasif"), }, }, }, { input: analysis.TokenStream{ &analysis.Token{ Term: []byte("folle"), }, }, output: analysis.TokenStream{ &analysis.Token{ Term: []byte("fou"), }, }, }, { input: analysis.TokenStream{ &analysis.Token{ Term: []byte("fou"), }, }, output: analysis.TokenStream{ &analysis.Token{ Term: []byte("fou"), }, }, }, { input: analysis.TokenStream{ &analysis.Token{ Term: []byte("personnelle"), }, }, output: analysis.TokenStream{ &analysis.Token{ Term: []byte("person"), }, }, }, { input: analysis.TokenStream{ &analysis.Token{ Term: []byte("personne"), }, }, output: analysis.TokenStream{ &analysis.Token{ Term: []byte("person"), }, }, }, // algo bug: too short length // { // input: analysis.TokenStream{ // &analysis.Token{ // Term: []byte("personnel"), // }, // }, // output: analysis.TokenStream{ // &analysis.Token{ // Term: []byte("person"), // }, // }, // }, { input: analysis.TokenStream{ &analysis.Token{ Term: []byte("complète"), }, }, output: analysis.TokenStream{ &analysis.Token{ Term: []byte("complet"), }, }, }, { input: analysis.TokenStream{ &analysis.Token{ Term: []byte("complet"), }, }, output: analysis.TokenStream{ &analysis.Token{ Term: []byte("complet"), }, }, }, { input: analysis.TokenStream{ &analysis.Token{ Term: []byte("aromatique"), }, }, output: analysis.TokenStream{ &analysis.Token{ Term: []byte("aromat"), }, }, }, { input: analysis.TokenStream{ &analysis.Token{ Term: []byte("faiblesse"), }, }, output: analysis.TokenStream{ &analysis.Token{ Term: []byte("faibl"), }, }, }, { input: analysis.TokenStream{ &analysis.Token{ Term: []byte("faible"), }, }, output: analysis.TokenStream{ &analysis.Token{ Term: []byte("faibl"), }, }, }, { input: analysis.TokenStream{ &analysis.Token{ Term: []byte("patinage"), }, }, output: analysis.TokenStream{ &analysis.Token{ Term: []byte("patin"), }, }, }, { input: analysis.TokenStream{ &analysis.Token{ Term: []byte("patin"), }, }, output: analysis.TokenStream{ &analysis.Token{ Term: []byte("patin"), }, }, }, { input: 
analysis.TokenStream{ &analysis.Token{ Term: []byte("sonorisation"), }, }, output: analysis.TokenStream{ &analysis.Token{ Term: []byte("sono"), }, }, }, { input: analysis.TokenStream{ &analysis.Token{ Term: []byte("ritualisation"), }, }, output: analysis.TokenStream{ &analysis.Token{ Term: []byte("rituel"), }, }, }, { input: analysis.TokenStream{ &analysis.Token{ Term: []byte("rituel"), }, }, output: analysis.TokenStream{ &analysis.Token{ Term: []byte("rituel"), }, }, }, // algo bug: masked by rules above // { // input: analysis.TokenStream{ // &analysis.Token{ // Term: []byte("colonisateur"), // }, // }, // output: analysis.TokenStream{ // &analysis.Token{ // Term: []byte("colon"), // }, // }, // }, { input: analysis.TokenStream{ &analysis.Token{ Term: []byte("nomination"), }, }, output: analysis.TokenStream{ &analysis.Token{ Term: []byte("nomin"), }, }, }, { input: analysis.TokenStream{ &analysis.Token{ Term: []byte("disposition"), }, }, output: analysis.TokenStream{ &analysis.Token{ Term: []byte("dispos"), }, }, }, { input: analysis.TokenStream{ &analysis.Token{ Term: []byte("dispose"), }, }, output: analysis.TokenStream{ &analysis.Token{ Term: []byte("dispos"), }, }, }, // SOLR-3463 : abusive compression of repeated characters in numbers // Trailing repeated char elision : { input: analysis.TokenStream{ &analysis.Token{ Term: []byte("1234555"), }, }, output: analysis.TokenStream{ &analysis.Token{ Term: []byte("1234555"), }, }, }, // Repeated char within numbers with more than 4 characters : { input: analysis.TokenStream{ &analysis.Token{ Term: []byte("12333345"), }, }, output: analysis.TokenStream{ &analysis.Token{ Term: []byte("12333345"), }, }, }, // Short numbers weren't affected already: { input: analysis.TokenStream{ &analysis.Token{ Term: []byte("1234"), }, }, output: analysis.TokenStream{ &analysis.Token{ Term: []byte("1234"), }, }, }, // Ensure behaviour is preserved for words! // Trailing repeated char elision : { input: analysis.TokenStream{ &analysis.Token{ Term: []byte("abcdeff"), }, }, output: analysis.TokenStream{ &analysis.Token{ Term: []byte("abcdef"), }, }, }, // Repeated char within words with more than 4 characters : { input: analysis.TokenStream{ &analysis.Token{ Term: []byte("abcccddeef"), }, }, output: analysis.TokenStream{ &analysis.Token{ Term: []byte("abcdef"), }, }, }, { input: analysis.TokenStream{ &analysis.Token{ Term: []byte("créées"), }, }, output: analysis.TokenStream{ &analysis.Token{ Term: []byte("cre"), }, }, }, // Combined letter and digit repetition // 10:00pm { input: analysis.TokenStream{ &analysis.Token{ Term: []byte("22hh00"), }, }, output: analysis.TokenStream{ &analysis.Token{ Term: []byte("22h00"), }, }, }, // bug #214 { input: analysis.TokenStream{ &analysis.Token{ Term: []byte("propriétaire"), }, }, output: analysis.TokenStream{ &analysis.Token{ Term: []byte("proprietair"), }, }, }, } cache := registry.NewCache() filter, err := cache.TokenFilterNamed(LightStemmerName) if err != nil { t.Fatal(err) } for _, test := range tests { actual := filter.Filter(test.input) if !reflect.DeepEqual(actual, test.output) { t.Errorf("expected %s, got %s", test.output[0].Term, actual[0].Term) } } }
func TestStopWordsFilter(t *testing.T) { inputTokenStream := analysis.TokenStream{ &analysis.Token{ Term: []byte("i"), Start: 0, End: 1, Position: 1, }, &analysis.Token{ Term: []byte("like"), Start: 2, End: 6, Position: 2, }, &analysis.Token{ Term: []byte("to"), Start: 7, End: 9, Position: 3, }, &analysis.Token{ Term: []byte("play"), Start: 10, End: 14, Position: 4, }, &analysis.Token{ Term: []byte("softball"), Start: 15, End: 23, Position: 5, }, } expectedTokenStream := analysis.TokenStream{ &analysis.Token{ Term: []byte("i"), Start: 0, End: 1, Position: 1, }, &analysis.Token{ Term: []byte("like"), Start: 2, End: 6, Position: 2, }, &analysis.Token{ Term: []byte("to"), Start: 7, End: 9, Position: 3, }, &analysis.Token{ Term: []byte("play"), Start: 10, End: 14, Position: 4, }, &analysis.Token{ Term: []byte("softball"), Start: 15, End: 23, Position: 5, }, &analysis.Token{ Term: []byte("soft"), Start: 15, End: 19, Position: 5, }, &analysis.Token{ Term: []byte("ball"), Start: 19, End: 23, Position: 5, }, } cache := registry.NewCache() dictListConfig := map[string]interface{}{ "type": token_map.Name, "tokens": []interface{}{"factor", "soft", "ball", "team"}, } _, err := cache.DefineTokenMap("dict_test", dictListConfig) if err != nil { t.Fatal(err) } dictConfig := map[string]interface{}{ "type": "dict_compound", "dict_token_map": "dict_test", } dictFilter, err := cache.DefineTokenFilter("dict_test", dictConfig) if err != nil { t.Fatal(err) } outputTokenStream := dictFilter.Filter(inputTokenStream) if !reflect.DeepEqual(outputTokenStream, expectedTokenStream) { t.Errorf("expected %#v got %#v", expectedTokenStream, outputTokenStream) } }