func CommonBenchmarkIndexBatch(b *testing.B, create KVStoreCreate, destroy KVStoreDestroy, analysisWorkers, batchSize int) {
	cache := registry.NewCache()
	analyzer, err := cache.AnalyzerNamed("standard")
	if err != nil {
		b.Fatal(err)
	}

	b.ResetTimer()
	b.StopTimer()
	for i := 0; i < b.N; i++ {
		s, err := create()
		if err != nil {
			b.Fatal(err)
		}
		analysisQueue := index.NewAnalysisQueue(analysisWorkers)
		idx := NewUpsideDownCouch(s, analysisQueue)
		err = idx.Open()
		if err != nil {
			b.Fatal(err)
		}

		b.StartTimer()
		batch := index.NewBatch()
		for j := 0; j < 1000; j++ {
			if j%batchSize == 0 {
				if len(batch.IndexOps) > 0 {
					err := idx.Batch(batch)
					if err != nil {
						b.Fatal(err)
					}
				}
				batch = index.NewBatch()
			}
			indexDocument := document.NewDocument("").
				AddField(document.NewTextFieldWithAnalyzer("body", []uint64{}, []byte(benchmarkDocBodies[j%10]), analyzer))
			indexDocument.ID = strconv.Itoa(i) + "-" + strconv.Itoa(j)
			batch.Update(indexDocument)
		}
		// close last batch
		if len(batch.IndexOps) > 0 {
			err := idx.Batch(batch)
			if err != nil {
				b.Fatal(err)
			}
		}
		b.StopTimer()
		err = idx.Close()
		if err != nil {
			b.Fatal(err)
		}
		err = destroy()
		if err != nil {
			b.Fatal(err)
		}
		analysisQueue.Close()
	}
}
func BenchmarkBatch(b *testing.B) {
	cache := registry.NewCache()
	analyzer, err := cache.AnalyzerNamed(standard_analyzer.Name)
	if err != nil {
		b.Fatal(err)
	}

	analysisQueue := index.NewAnalysisQueue(1)
	idx, err := NewUpsideDownCouch(null.Name, nil, analysisQueue)
	if err != nil {
		b.Fatal(err)
	}
	err = idx.Open()
	if err != nil {
		b.Fatal(err)
	}

	batch := index.NewBatch()
	for i := 0; i < 100; i++ {
		d := document.NewDocument(strconv.Itoa(i))
		f := document.NewTextFieldWithAnalyzer("desc", nil, bleveWikiArticle1K, analyzer)
		d.AddField(f)
		batch.Update(d)
	}

	b.ResetTimer()
	for i := 0; i < b.N; i++ {
		err = idx.Batch(batch)
		if err != nil {
			b.Fatal(err)
		}
	}
}
func TestElisionFilter(t *testing.T) {
	tests := []struct {
		input  analysis.TokenStream
		output analysis.TokenStream
	}{
		{
			input: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("ar" + string(Apostrophe) + "word"),
				},
			},
			output: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("word"),
				},
			},
		},
		{
			input: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("ar" + string(RightSingleQuotationMark) + "word"),
				},
			},
			output: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("word"),
				},
			},
		},
	}

	cache := registry.NewCache()

	articleListConfig := map[string]interface{}{
		"type":   token_map.Name,
		"tokens": []interface{}{"ar"},
	}
	_, err := cache.DefineTokenMap("articles_test", articleListConfig)
	if err != nil {
		t.Fatal(err)
	}

	elisionConfig := map[string]interface{}{
		"type":               "elision",
		"articles_token_map": "articles_test",
	}
	elisionFilter, err := cache.DefineTokenFilter("elision_test", elisionConfig)
	if err != nil {
		t.Fatal(err)
	}

	for _, test := range tests {
		actual := elisionFilter.Filter(test.input)
		if !reflect.DeepEqual(actual, test.output) {
			t.Errorf("expected %s, got %s", test.output[0].Term, actual[0].Term)
		}
	}
}
func TestItalianElision(t *testing.T) {
	tests := []struct {
		input  analysis.TokenStream
		output analysis.TokenStream
	}{
		{
			input: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("dell'Italia"),
				},
			},
			output: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("Italia"),
				},
			},
		},
	}

	cache := registry.NewCache()
	elisionFilter, err := cache.TokenFilterNamed(ElisionName)
	if err != nil {
		t.Fatal(err)
	}

	for _, test := range tests {
		actual := elisionFilter.Filter(test.input)
		if !reflect.DeepEqual(actual, test.output) {
			t.Errorf("expected %s, got %s", test.output[0].Term, actual[0].Term)
		}
	}
}
func BenchmarkAnalyze(b *testing.B) {
	cache := registry.NewCache()
	analyzer, err := cache.AnalyzerNamed(standard_analyzer.Name)
	if err != nil {
		b.Fatal(err)
	}

	analysisQueue := index.NewAnalysisQueue(1)
	idx, err := NewFirestorm(null.Name, nil, analysisQueue)
	if err != nil {
		b.Fatal(err)
	}

	d := document.NewDocument("1")
	f := document.NewTextFieldWithAnalyzer("desc", nil, bleveWikiArticle1K, analyzer)
	d.AddField(f)

	b.ResetTimer()
	for i := 0; i < b.N; i++ {
		rv := idx.Analyze(d)
		if len(rv.Rows) < 92 || len(rv.Rows) > 93 {
			b.Fatalf("expected 92-93 rows, got %d", len(rv.Rows))
		}
	}
}
func TestStopWordsFilterLongestMatch(t *testing.T) {
	inputTokenStream := analysis.TokenStream{
		&analysis.Token{
			Term:     []byte("softestball"),
			Start:    0,
			End:      11,
			Position: 1,
		},
	}

	expectedTokenStream := analysis.TokenStream{
		&analysis.Token{
			Term:     []byte("softestball"),
			Start:    0,
			End:      11,
			Position: 1,
		},
		&analysis.Token{
			Term:     []byte("softest"),
			Start:    0,
			End:      7,
			Position: 1,
		},
		&analysis.Token{
			Term:     []byte("ball"),
			Start:    7,
			End:      11,
			Position: 1,
		},
	}

	cache := registry.NewCache()

	dictListConfig := map[string]interface{}{
		"type":   token_map.Name,
		"tokens": []interface{}{"soft", "softest", "ball"},
	}
	_, err := cache.DefineTokenMap("dict_test", dictListConfig)
	if err != nil {
		t.Fatal(err)
	}

	dictConfig := map[string]interface{}{
		"type":               "dict_compound",
		"dict_token_map":     "dict_test",
		"only_longest_match": true,
	}
	dictFilter, err := cache.DefineTokenFilter("dict_test", dictConfig)
	if err != nil {
		t.Fatal(err)
	}

	outputTokenStream := dictFilter.Filter(inputTokenStream)
	if !reflect.DeepEqual(outputTokenStream, expectedTokenStream) {
		t.Errorf("expected %#v got %#v", expectedTokenStream, outputTokenStream)
	}
}
func BenchmarkCJKAnalyzer(b *testing.B) {
	cache := registry.NewCache()
	analyzer, err := cache.AnalyzerNamed(AnalyzerName)
	if err != nil {
		b.Fatal(err)
	}

	for i := 0; i < b.N; i++ {
		analyzer.Analyze(bleveWikiArticleJapanese)
	}
}
func TestSoraniAnalyzer(t *testing.T) {
	tests := []struct {
		input  []byte
		output analysis.TokenStream
	}{
		// stop word removal
		{
			input: []byte("ئەم پیاوە"),
			output: analysis.TokenStream{
				&analysis.Token{
					Term:     []byte("پیاو"),
					Position: 2,
					Start:    7,
					End:      17,
				},
			},
		},
		{
			input: []byte("پیاوە"),
			output: analysis.TokenStream{
				&analysis.Token{
					Term:     []byte("پیاو"),
					Position: 1,
					Start:    0,
					End:      10,
				},
			},
		},
		{
			input: []byte("پیاو"),
			output: analysis.TokenStream{
				&analysis.Token{
					Term:     []byte("پیاو"),
					Position: 1,
					Start:    0,
					End:      8,
				},
			},
		},
	}

	cache := registry.NewCache()
	analyzer, err := cache.AnalyzerNamed(AnalyzerName)
	if err != nil {
		t.Fatal(err)
	}

	for _, test := range tests {
		actual := analyzer.Analyze(test.input)
		if !reflect.DeepEqual(actual, test.output) {
			t.Errorf("expected %v, got %v", test.output, actual)
		}
	}
}
func TestStopWordsFilter(t *testing.T) {
	inputTokenStream := analysis.TokenStream{
		&analysis.Token{
			Term: []byte("a"),
		},
		&analysis.Token{
			Term: []byte("walk"),
		},
		&analysis.Token{
			Term: []byte("in"),
		},
		&analysis.Token{
			Term: []byte("the"),
		},
		&analysis.Token{
			Term: []byte("park"),
		},
	}

	expectedTokenStream := analysis.TokenStream{
		&analysis.Token{
			Term: []byte("walk"),
		},
		&analysis.Token{
			Term: []byte("park"),
		},
	}

	cache := registry.NewCache()

	stopListConfig := map[string]interface{}{
		"type":   token_map.Name,
		"tokens": []interface{}{"a", "in", "the"},
	}
	_, err := cache.DefineTokenMap("stop_test", stopListConfig)
	if err != nil {
		t.Fatal(err)
	}

	stopConfig := map[string]interface{}{
		"type":           "stop_tokens",
		"stop_token_map": "stop_test",
	}
	stopFilter, err := cache.DefineTokenFilter("stop_test", stopConfig)
	if err != nil {
		t.Fatal(err)
	}

	outputTokenStream := stopFilter.Filter(inputTokenStream)
	if !reflect.DeepEqual(outputTokenStream, expectedTokenStream) {
		t.Errorf("expected %#v got %#v", expectedTokenStream, outputTokenStream)
	}
}
func TestEnglishStemmer(t *testing.T) {
	tests := []struct {
		input  analysis.TokenStream
		output analysis.TokenStream
	}{
		{
			input: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("walking"),
				},
				&analysis.Token{
					Term: []byte("talked"),
				},
				&analysis.Token{
					Term: []byte("business"),
				},
				&analysis.Token{
					Term:    []byte("protected"),
					KeyWord: true,
				},
			},
			output: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("walk"),
				},
				&analysis.Token{
					Term: []byte("talk"),
				},
				&analysis.Token{
					Term: []byte("busi"),
				},
				&analysis.Token{
					Term:    []byte("protected"),
					KeyWord: true,
				},
			},
		},
	}

	cache := registry.NewCache()
	stemmerFilter, err := cache.TokenFilterNamed(StemmerName)
	if err != nil {
		t.Fatal(err)
	}

	for _, test := range tests {
		actual := stemmerFilter.Filter(test.input)
		if !reflect.DeepEqual(actual, test.output) {
			t.Errorf("expected %s, got %s", test.output, actual)
		}
	}
}
func TestThaiAnalyzerWithoutOffsets(t *testing.T) {
	tests := []struct {
		input  []byte
		output analysis.TokenStream
	}{
		// stop words
		{
			input: []byte("บริษัทชื่อ XY&Z - คุยกับ [email protected]"),
			output: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("บริษัท"),
				},
				&analysis.Token{
					Term: []byte("ชื่อ"),
				},
				&analysis.Token{
					Term: []byte("xy"),
				},
				&analysis.Token{
					Term: []byte("z"),
				},
				&analysis.Token{
					Term: []byte("คุย"),
				},
				&analysis.Token{
					Term: []byte("xyz"),
				},
				&analysis.Token{
					Term: []byte("demo.com"),
				},
			},
		},
	}

	cache := registry.NewCache()
	analyzer, err := cache.AnalyzerNamed(AnalyzerName)
	if err != nil {
		t.Fatal(err)
	}

	for _, test := range tests {
		actual := analyzer.Analyze(test.input)
		if len(actual) != len(test.output) {
			t.Errorf("expected length: %d, got %d", len(test.output), len(actual))
		}
		for i, tok := range actual {
			if !reflect.DeepEqual(tok.Term, test.output[i].Term) {
				t.Errorf("expected term %s (% x) got %s (% x)", test.output[i].Term, test.output[i].Term, tok.Term, tok.Term)
			}
		}
	}
}
// NewIndexMapping creates a new IndexMapping that will use all the default indexing rules
func NewIndexMapping() *IndexMapping {
	return &IndexMapping{
		TypeMapping:           make(map[string]*DocumentMapping),
		DefaultMapping:        NewDocumentMapping(),
		TypeField:             defaultTypeField,
		DefaultType:           defaultType,
		DefaultAnalyzer:       defaultAnalyzer,
		DefaultDateTimeParser: defaultDateTimeParser,
		DefaultField:          defaultField,
		ByteArrayConverter:    defaultByteArrayConverter,
		CustomAnalysis:        newCustomAnalysis(),
		cache:                 registry.NewCache(),
	}
}
func TestJaAnalyzer(t *testing.T) {
	tests := []struct {
		input  []byte
		output analysis.TokenStream
	}{
		{
			input: []byte("こんにちは世界"),
			output: analysis.TokenStream{
				&analysis.Token{
					Term:     []byte("こんにちは"),
					Type:     analysis.Ideographic,
					Position: 1,
					Start:    0,
					End:      15,
				},
				&analysis.Token{
					Term:     []byte("世界"),
					Type:     analysis.Ideographic,
					Position: 2,
					Start:    15,
					End:      21,
				},
			},
		},
		{
			input: []byte("カタカナ"),
			output: analysis.TokenStream{
				&analysis.Token{
					Term:     []byte("カタカナ"),
					Type:     analysis.Ideographic,
					Position: 1,
					Start:    0,
					End:      12,
				},
			},
		},
	}

	cache := registry.NewCache()
	for _, test := range tests {
		analyzer, err := cache.AnalyzerNamed(AnalyzerName)
		if err != nil {
			t.Fatal(err)
		}
		actual := analyzer.Analyze(test.input)
		if !reflect.DeepEqual(actual, test.output) {
			t.Errorf("expected %v, got %v", test.output, actual)
		}
	}
}
// NewIndexMapping creates a new IndexMapping that will use all the default indexing rules
func NewIndexMapping() *IndexMapping {
	return &IndexMapping{
		TypeMapping:           make(map[string]*DocumentMapping),
		DefaultMapping:        NewDocumentMapping(),
		TypeField:             defaultTypeField,
		DefaultType:           defaultType,
		DefaultAnalyzer:       defaultAnalyzer,
		DefaultDateTimeParser: defaultDateTimeParser,
		DefaultField:          defaultField,
		IndexDynamic:          IndexDynamic,
		StoreDynamic:          StoreDynamic,
		CustomAnalysis:        newCustomAnalysis(),
		cache:                 registry.NewCache(),
	}
}
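// The constructor above only fills in package-level defaults; callers are
// expected to override individual fields before indexing. The sketch below is
// a minimal, illustrative example of that pattern -- the helper name and the
// concrete values ("type", "en", "article") are assumptions, not anything the
// mapping package requires.
func exampleIndexMapping() *IndexMapping {
	im := NewIndexMapping()
	im.TypeField = "type"                            // read the document type from this source field
	im.DefaultAnalyzer = "en"                        // assumes the English analyzer is registered
	im.TypeMapping["article"] = NewDocumentMapping() // per-type mapping for "article" documents
	return im
}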
func TestPortugueseAnalyzer(t *testing.T) {
	tests := []struct {
		input  []byte
		output analysis.TokenStream
	}{
		// stemming
		// fails due to stemming discrepancies
		// got quilométr instead of quilometric
		// {
		// 	input: []byte("quilométricas"),
		// 	output: analysis.TokenStream{
		// 		&analysis.Token{
		// 			Term: []byte("quilometric"),
		// 		},
		// 	},
		// },
		// {
		// 	input: []byte("quilométricos"),
		// 	output: analysis.TokenStream{
		// 		&analysis.Token{
		// 			Term: []byte("quilometric"),
		// 		},
		// 	},
		// },
		// stop word
		{
			input:  []byte("não"),
			output: analysis.TokenStream{},
		},
	}

	cache := registry.NewCache()
	analyzer, err := cache.AnalyzerNamed(AnalyzerName)
	if err != nil {
		t.Fatal(err)
	}

	for _, test := range tests {
		actual := analyzer.Analyze(test.input)
		if len(actual) != len(test.output) {
			t.Fatalf("expected length: %d, got %d", len(test.output), len(actual))
		}
		for i, tok := range actual {
			if !reflect.DeepEqual(tok.Term, test.output[i].Term) {
				t.Errorf("expected term %s (% x) got %s (% x)", test.output[i].Term, test.output[i].Term, tok.Term, tok.Term)
			}
		}
	}
}
func TestItalianAnalyzer(t *testing.T) {
	tests := []struct {
		input  []byte
		output analysis.TokenStream
	}{
		// stemming
		// fails, stemming discrepancies
		// abbandon instead of abbandonat
		// {
		// 	input: []byte("abbandonata"),
		// 	output: analysis.TokenStream{
		// 		&analysis.Token{
		// 			Term: []byte("abbandonat"),
		// 		},
		// 	},
		// },
		// {
		// 	input: []byte("abbandonati"),
		// 	output: analysis.TokenStream{
		// 		&analysis.Token{
		// 			Term: []byte("abbandonat"),
		// 		},
		// 	},
		// },
		// stop word
		{
			input:  []byte("dallo"),
			output: analysis.TokenStream{},
		},
	}

	cache := registry.NewCache()
	analyzer, err := cache.AnalyzerNamed(AnalyzerName)
	if err != nil {
		t.Fatal(err)
	}

	for _, test := range tests {
		actual := analyzer.Analyze(test.input)
		if len(actual) != len(test.output) {
			t.Fatalf("expected length: %d, got %d", len(test.output), len(actual))
		}
		for i, tok := range actual {
			if !reflect.DeepEqual(tok.Term, test.output[i].Term) {
				t.Errorf("expected term %s (% x) got %s (% x)", test.output[i].Term, test.output[i].Term, tok.Term, tok.Term)
			}
		}
	}
}
func BenchmarkAnalysis(b *testing.B) {
	for i := 0; i < b.N; i++ {
		cache := registry.NewCache()
		analyzer, err := cache.AnalyzerNamed(standard_analyzer.Name)
		if err != nil {
			b.Fatal(err)
		}

		ts := analyzer.Analyze(bleveWikiArticle)
		freqs := analysis.TokenFrequency(ts, nil, true)
		if len(freqs) != 511 {
			b.Errorf("expected %d freqs, got %d", 511, len(freqs))
		}
	}
}
func TestDanishAnalyzer(t *testing.T) {
	tests := []struct {
		input  []byte
		output analysis.TokenStream
	}{
		// stemming
		{
			input: []byte("undersøg"),
			output: analysis.TokenStream{
				&analysis.Token{
					Term:     []byte("undersøg"),
					Position: 1,
					Start:    0,
					End:      9,
				},
			},
		},
		{
			input: []byte("undersøgelse"),
			output: analysis.TokenStream{
				&analysis.Token{
					Term:     []byte("undersøg"),
					Position: 1,
					Start:    0,
					End:      13,
				},
			},
		},
		// stop word
		{
			input:  []byte("på"),
			output: analysis.TokenStream{},
		},
	}

	cache := registry.NewCache()
	analyzer, err := cache.AnalyzerNamed(AnalyzerName)
	if err != nil {
		t.Fatal(err)
	}

	for _, test := range tests {
		actual := analyzer.Analyze(test.input)
		if !reflect.DeepEqual(actual, test.output) {
			t.Errorf("expected %v, got %v", test.output, actual)
		}
	}
}
func TestHungarianAnalyzer(t *testing.T) {
	tests := []struct {
		input  []byte
		output analysis.TokenStream
	}{
		// stemming
		{
			input: []byte("babakocsi"),
			output: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("babakocs"),
				},
			},
		},
		{
			input: []byte("babakocsijáért"),
			output: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("babakocs"),
				},
			},
		},
		// stop word
		{
			input:  []byte("által"),
			output: analysis.TokenStream{},
		},
	}

	cache := registry.NewCache()
	analyzer, err := cache.AnalyzerNamed(AnalyzerName)
	if err != nil {
		t.Fatal(err)
	}

	for _, test := range tests {
		actual := analyzer.Analyze(test.input)
		if len(actual) != len(test.output) {
			t.Fatalf("expected length: %d, got %d", len(test.output), len(actual))
		}
		for i, tok := range actual {
			if !reflect.DeepEqual(tok.Term, test.output[i].Term) {
				t.Errorf("expected term %s (% x) got %s (% x)", test.output[i].Term, test.output[i].Term, tok.Term, tok.Term)
			}
		}
	}
}
func BenchmarkStopWordsFilter(b *testing.B) {
	inputTokenStream := analysis.TokenStream{
		&analysis.Token{
			Term: []byte("a"),
		},
		&analysis.Token{
			Term: []byte("walk"),
		},
		&analysis.Token{
			Term: []byte("in"),
		},
		&analysis.Token{
			Term: []byte("the"),
		},
		&analysis.Token{
			Term: []byte("park"),
		},
	}

	cache := registry.NewCache()

	stopListConfig := map[string]interface{}{
		"type":   token_map.Name,
		"tokens": []interface{}{"a", "in", "the"},
	}
	_, err := cache.DefineTokenMap("stop_test", stopListConfig)
	if err != nil {
		b.Fatal(err)
	}

	stopConfig := map[string]interface{}{
		"type":           "stop_tokens",
		"stop_token_map": "stop_test",
	}
	stopFilter, err := cache.DefineTokenFilter("stop_test", stopConfig)
	if err != nil {
		b.Fatal(err)
	}

	b.ResetTimer()
	for i := 0; i < b.N; i++ {
		stopFilter.Filter(inputTokenStream)
	}
}
func CommonBenchmarkIndex(b *testing.B, create KVStoreCreate, destroy KVStoreDestroy, analysisWorkers int) {
	cache := registry.NewCache()
	analyzer, err := cache.AnalyzerNamed("standard")
	if err != nil {
		b.Fatal(err)
	}

	indexDocument := document.NewDocument("").
		AddField(document.NewTextFieldWithAnalyzer("body", []uint64{}, []byte(benchmarkDocBodies[0]), analyzer))

	b.ResetTimer()
	b.StopTimer()
	for i := 0; i < b.N; i++ {
		s, err := create()
		if err != nil {
			b.Fatal(err)
		}
		analysisQueue := index.NewAnalysisQueue(analysisWorkers)
		idx := NewUpsideDownCouch(s, analysisQueue)
		err = idx.Open()
		if err != nil {
			b.Fatal(err)
		}
		indexDocument.ID = strconv.Itoa(i)

		// just time the indexing portion
		b.StartTimer()
		err = idx.Update(indexDocument)
		if err != nil {
			b.Fatal(err)
		}
		b.StopTimer()

		err = idx.Close()
		if err != nil {
			b.Fatal(err)
		}
		err = destroy()
		if err != nil {
			b.Fatal(err)
		}
		analysisQueue.Close()
	}
}
func TestThaiAnalyzer(t *testing.T) {
	tests := []struct {
		input  []byte
		output analysis.TokenStream
	}{
		// stop words
		{
			input: []byte("การที่ได้ต้องแสดงว่างานดี"),
			output: analysis.TokenStream{
				&analysis.Token{
					Term:     []byte("แสดง"),
					Position: 5,
					Start:    39,
					End:      51,
				},
				&analysis.Token{
					Term:     []byte("งาน"),
					Position: 7,
					Start:    60,
					End:      69,
				},
				&analysis.Token{
					Term:     []byte("ดี"),
					Position: 8,
					Start:    69,
					End:      75,
				},
			},
		},
	}

	cache := registry.NewCache()
	analyzer, err := cache.AnalyzerNamed(AnalyzerName)
	if err != nil {
		t.Fatal(err)
	}

	for _, test := range tests {
		actual := analyzer.Analyze(test.input)
		if !reflect.DeepEqual(actual, test.output) {
			t.Errorf("expected %v, got %v", test.output, actual)
		}
	}
}
func TestHindiAnalyzer(t *testing.T) {
	tests := []struct {
		input  []byte
		output analysis.TokenStream
	}{
		// two ways to write 'hindi' itself
		{
			input: []byte("हिन्दी"),
			output: analysis.TokenStream{
				&analysis.Token{
					Term:     []byte("हिंद"),
					Position: 1,
					Start:    0,
					End:      18,
				},
			},
		},
		{
			input: []byte("हिंदी"),
			output: analysis.TokenStream{
				&analysis.Token{
					Term:     []byte("हिंद"),
					Position: 1,
					Start:    0,
					End:      15,
				},
			},
		},
	}

	cache := registry.NewCache()
	analyzer, err := cache.AnalyzerNamed(AnalyzerName)
	if err != nil {
		t.Fatal(err)
	}

	for _, test := range tests {
		actual := analyzer.Analyze(test.input)
		if !reflect.DeepEqual(actual, test.output) {
			t.Errorf("expected %v, got %v", test.output, actual)
		}
	}
}
func CommonBenchmarkIndex(b *testing.B, s store.KVStore, analysisWorkers int) {
	analysisQueue := NewAnalysisQueue(analysisWorkers)
	idx := NewUpsideDownCouch(s, analysisQueue)

	cache := registry.NewCache()
	analyzer, err := cache.AnalyzerNamed("standard")
	if err != nil {
		b.Fatal(err)
	}

	indexDocument := document.NewDocument("").
		AddField(document.NewTextFieldWithAnalyzer("body", []uint64{}, []byte(benchmarkDocBodies[0]), analyzer))

	b.ResetTimer()
	for i := 0; i < b.N; i++ {
		indexDocument.ID = strconv.Itoa(i)
		err := idx.Update(indexDocument)
		if err != nil {
			b.Fatal(err)
		}
	}
}
func CommonBenchmarkIndexBatch(b *testing.B, s store.KVStore, analysisWorkers, batchSize int) {
	analysisQueue := NewAnalysisQueue(analysisWorkers)
	idx := NewUpsideDownCouch(s, analysisQueue)

	cache := registry.NewCache()
	analyzer, err := cache.AnalyzerNamed("standard")
	if err != nil {
		b.Fatal(err)
	}

	b.ResetTimer()
	for i := 0; i < b.N; i++ {
		var batch index.Batch
		for j := 0; j < 1000; j++ {
			if j%batchSize == 0 {
				if len(batch) > 0 {
					err := idx.Batch(batch)
					if err != nil {
						b.Fatal(err)
					}
				}
				batch = make(index.Batch)
			}
			indexDocument := document.NewDocument("").
				AddField(document.NewTextFieldWithAnalyzer("body", []uint64{}, []byte(benchmarkDocBodies[j%10]), analyzer))
			indexDocument.ID = strconv.Itoa(i) + "-" + strconv.Itoa(j)
			batch[indexDocument.ID] = indexDocument
		}
		// close last batch
		if len(batch) > 0 {
			err := idx.Batch(batch)
			if err != nil {
				b.Fatal(err)
			}
		}
	}
}
func TestAnalysisBug328(t *testing.T) {
	cache := registry.NewCache()
	analyzer, err := cache.AnalyzerNamed(standard_analyzer.Name)
	if err != nil {
		t.Fatal(err)
	}

	analysisQueue := index.NewAnalysisQueue(1)
	idx, err := NewFirestorm(gtreap.Name, nil, analysisQueue)
	if err != nil {
		t.Fatal(err)
	}

	d := document.NewDocument("1")
	f := document.NewTextFieldCustom("title", nil, []byte("bleve"), document.IndexField|document.IncludeTermVectors, analyzer)
	d.AddField(f)
	f = document.NewTextFieldCustom("body", nil, []byte("bleve"), document.IndexField|document.IncludeTermVectors, analyzer)
	d.AddField(f)
	cf := document.NewCompositeFieldWithIndexingOptions("_all", true, []string{}, []string{}, document.IndexField|document.IncludeTermVectors)
	d.AddField(cf)

	rv := idx.Analyze(d)
	fieldIndexes := make(map[uint16]string)
	for _, row := range rv.Rows {
		if row, ok := row.(*FieldRow); ok {
			fieldIndexes[row.index] = row.Name()
		}
		if row, ok := row.(*TermFreqRow); ok && string(row.term) == "bleve" {
			for _, vec := range row.Vectors() {
				if vec.GetField() != uint32(row.field) {
					if fieldIndexes[row.field] != "_all" {
						t.Errorf("row named %s field %d - vector field %d", fieldIndexes[row.field], row.field, vec.GetField())
					}
				}
			}
		}
	}
}
func BenchmarkEnglishPossessiveFilter(b *testing.B) {
	input := analysis.TokenStream{
		&analysis.Token{
			Term: []byte("marty's"),
		},
		&analysis.Token{
			Term: []byte("MARTY'S"),
		},
		&analysis.Token{
			Term: []byte("marty’s"),
		},
		&analysis.Token{
			Term: []byte("MARTY’S"),
		},
		&analysis.Token{
			Term: []byte("marty's"),
		},
		&analysis.Token{
			Term: []byte("MARTY'S"),
		},
		&analysis.Token{
			Term: []byte("m"),
		},
	}

	cache := registry.NewCache()
	stemmerFilter, err := cache.TokenFilterNamed(PossessiveName)
	if err != nil {
		b.Fatal(err)
	}

	b.ResetTimer()
	for i := 0; i < b.N; i++ {
		stemmerFilter.Filter(input)
	}
}
// UnmarshalJSON offers custom unmarshaling with optional strict validation
func (im *IndexMapping) UnmarshalJSON(data []byte) error {
	var tmp map[string]json.RawMessage
	err := json.Unmarshal(data, &tmp)
	if err != nil {
		return err
	}

	// set defaults for fields which might have been omitted
	im.cache = registry.NewCache()
	im.CustomAnalysis = newCustomAnalysis()
	im.TypeField = defaultTypeField
	im.DefaultType = defaultType
	im.DefaultAnalyzer = defaultAnalyzer
	im.DefaultDateTimeParser = defaultDateTimeParser
	im.DefaultField = defaultField
	im.ByteArrayConverter = defaultByteArrayConverter
	im.DefaultMapping = NewDocumentMapping()
	im.TypeMapping = make(map[string]*DocumentMapping)
	im.StoreDynamic = StoreDynamic
	im.IndexDynamic = IndexDynamic

	var invalidKeys []string
	for k, v := range tmp {
		switch k {
		case "analysis":
			err := json.Unmarshal(v, &im.CustomAnalysis)
			if err != nil {
				return err
			}
		case "type_field":
			err := json.Unmarshal(v, &im.TypeField)
			if err != nil {
				return err
			}
		case "default_type":
			err := json.Unmarshal(v, &im.DefaultType)
			if err != nil {
				return err
			}
		case "default_analyzer":
			err := json.Unmarshal(v, &im.DefaultAnalyzer)
			if err != nil {
				return err
			}
		case "default_datetime_parser":
			err := json.Unmarshal(v, &im.DefaultDateTimeParser)
			if err != nil {
				return err
			}
		case "default_field":
			err := json.Unmarshal(v, &im.DefaultField)
			if err != nil {
				return err
			}
		case "byte_array_converter":
			err := json.Unmarshal(v, &im.ByteArrayConverter)
			if err != nil {
				return err
			}
		case "default_mapping":
			err := json.Unmarshal(v, &im.DefaultMapping)
			if err != nil {
				return err
			}
		case "types":
			err := json.Unmarshal(v, &im.TypeMapping)
			if err != nil {
				return err
			}
		case "store_dynamic":
			err := json.Unmarshal(v, &im.StoreDynamic)
			if err != nil {
				return err
			}
		case "index_dynamic":
			err := json.Unmarshal(v, &im.IndexDynamic)
			if err != nil {
				return err
			}
		default:
			invalidKeys = append(invalidKeys, k)
		}
	}

	if MappingJSONStrict && len(invalidKeys) > 0 {
		return fmt.Errorf("index mapping contains invalid keys: %v", invalidKeys)
	}

	err = im.CustomAnalysis.registerAll(im)
	if err != nil {
		return err
	}

	return nil
}
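// Because UnmarshalJSON resets every field to its default before applying the
// decoded keys, a mapping can be serialized and decoded back even when the
// JSON omits fields. A minimal sketch of that round trip follows; the helper
// name and error handling are illustrative assumptions, not part of the
// package API.
func roundTripIndexMapping(im *IndexMapping) (*IndexMapping, error) {
	// serialize the mapping using the standard library encoder
	data, err := json.Marshal(im)
	if err != nil {
		return nil, err
	}
	// decode into a fresh value; this invokes (*IndexMapping).UnmarshalJSON above
	var decoded IndexMapping
	if err := json.Unmarshal(data, &decoded); err != nil {
		return nil, err
	}
	return &decoded, nil
}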
func TestArabicAnalyzer(t *testing.T) {
	tests := []struct {
		input  []byte
		output analysis.TokenStream
	}{
		{
			input: []byte("كبير"),
			output: analysis.TokenStream{
				&analysis.Token{
					Term:     []byte("كبير"),
					Position: 1,
					Start:    0,
					End:      8,
				},
			},
		},
		// feminine marker
		{
			input: []byte("كبيرة"),
			output: analysis.TokenStream{
				&analysis.Token{
					Term:     []byte("كبير"),
					Position: 1,
					Start:    0,
					End:      10,
				},
			},
		},
		{
			input: []byte("مشروب"),
			output: analysis.TokenStream{
				&analysis.Token{
					Term:     []byte("مشروب"),
					Position: 1,
					Start:    0,
					End:      10,
				},
			},
		},
		// plural -at
		{
			input: []byte("مشروبات"),
			output: analysis.TokenStream{
				&analysis.Token{
					Term:     []byte("مشروب"),
					Position: 1,
					Start:    0,
					End:      14,
				},
			},
		},
		// plural -in
		{
			input: []byte("أمريكيين"),
			output: analysis.TokenStream{
				&analysis.Token{
					Term:     []byte("امريك"),
					Position: 1,
					Start:    0,
					End:      16,
				},
			},
		},
		// singular with bare alif
		{
			input: []byte("امريكي"),
			output: analysis.TokenStream{
				&analysis.Token{
					Term:     []byte("امريك"),
					Position: 1,
					Start:    0,
					End:      12,
				},
			},
		},
		{
			input: []byte("كتاب"),
			output: analysis.TokenStream{
				&analysis.Token{
					Term:     []byte("كتاب"),
					Position: 1,
					Start:    0,
					End:      8,
				},
			},
		},
		// definite article
		{
			input: []byte("الكتاب"),
			output: analysis.TokenStream{
				&analysis.Token{
					Term:     []byte("كتاب"),
					Position: 1,
					Start:    0,
					End:      12,
				},
			},
		},
		{
			input: []byte("ما ملكت أيمانكم"),
			output: analysis.TokenStream{
				&analysis.Token{
					Term:     []byte("ملكت"),
					Position: 2,
					Start:    5,
					End:      13,
				},
				&analysis.Token{
					Term:     []byte("ايمانكم"),
					Position: 3,
					Start:    14,
					End:      28,
				},
			},
		},
		// stopwords
		{
			input: []byte("الذين ملكت أيمانكم"),
			output: analysis.TokenStream{
				&analysis.Token{
					Term:     []byte("ملكت"),
					Position: 2,
					Start:    11,
					End:      19,
				},
				&analysis.Token{
					Term:     []byte("ايمانكم"),
					Position: 3,
					Start:    20,
					End:      34,
				},
			},
		},
		// presentation form normalization
		{
			input: []byte("ﺍﻟﺴﻼﻢ"),
			output: analysis.TokenStream{
				&analysis.Token{
					Term:     []byte("سلام"),
					Position: 1,
					Start:    0,
					End:      15,
				},
			},
		},
	}

	cache := registry.NewCache()
	analyzer, err := cache.AnalyzerNamed(AnalyzerName)
	if err != nil {
		t.Fatal(err)
	}

	for _, test := range tests {
		actual := analyzer.Analyze(test.input)
		if !reflect.DeepEqual(actual, test.output) {
			t.Errorf("expected %v, got %v", test.output, actual)
			t.Errorf("expected % x, got % x", test.output[0].Term, actual[0].Term)
		}
	}
}
func TestFrenchAnalyzer(t *testing.T) {
	tests := []struct {
		input  []byte
		output analysis.TokenStream
	}{
		{
			input:  []byte(""),
			output: analysis.TokenStream{},
		},
		{
			input: []byte("chien chat cheval"),
			output: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("chien"),
				},
				&analysis.Token{
					Term: []byte("chat"),
				},
				&analysis.Token{
					Term: []byte("cheval"),
				},
			},
		},
		{
			input: []byte("chien CHAT CHEVAL"),
			output: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("chien"),
				},
				&analysis.Token{
					Term: []byte("chat"),
				},
				&analysis.Token{
					Term: []byte("cheval"),
				},
			},
		},
		{
			input: []byte(" chien ,? + = - CHAT /: > CHEVAL"),
			output: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("chien"),
				},
				&analysis.Token{
					Term: []byte("chat"),
				},
				&analysis.Token{
					Term: []byte("cheval"),
				},
			},
		},
		{
			input: []byte("chien++"),
			output: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("chien"),
				},
			},
		},
		{
			input: []byte("mot \"entreguillemet\""),
			output: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("mot"),
				},
				&analysis.Token{
					Term: []byte("entreguilemet"),
				},
			},
		},
		{
			input: []byte("Jean-François"),
			output: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("jean"),
				},
				&analysis.Token{
					Term: []byte("francoi"),
				},
			},
		},
		// stop words
		{
			input: []byte("le la chien les aux chat du des à cheval"),
			output: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("chien"),
				},
				&analysis.Token{
					Term: []byte("chat"),
				},
				&analysis.Token{
					Term: []byte("cheval"),
				},
			},
		},
		// nouns and adjectives
		{
			input: []byte("lances chismes habitable chiste éléments captifs"),
			output: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("lanc"),
				},
				&analysis.Token{
					Term: []byte("chism"),
				},
				&analysis.Token{
					Term: []byte("habitabl"),
				},
				&analysis.Token{
					Term: []byte("chist"),
				},
				&analysis.Token{
					Term: []byte("element"),
				},
				&analysis.Token{
					Term: []byte("captif"),
				},
			},
		},
		// verbs
		{
			input: []byte("finissions souffrirent rugissante"),
			output: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("finision"),
				},
				&analysis.Token{
					Term: []byte("soufrirent"),
				},
				&analysis.Token{
					Term: []byte("rugisant"),
				},
			},
		},
		{
			input: []byte("C3PO aujourd'hui oeuf ïâöûàä anticonstitutionnellement Java++ "),
			output: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("c3po"),
				},
				&analysis.Token{
					Term: []byte("aujourd'hui"),
				},
				&analysis.Token{
					Term: []byte("oeuf"),
				},
				&analysis.Token{
					Term: []byte("ïaöuaä"),
				},
				&analysis.Token{
					Term: []byte("anticonstitutionel"),
				},
				&analysis.Token{
					Term: []byte("java"),
				},
			},
		},
	}

	cache := registry.NewCache()
	analyzer, err := cache.AnalyzerNamed(AnalyzerName)
	if err != nil {
		t.Fatal(err)
	}

	for _, test := range tests {
		actual := analyzer.Analyze(test.input)
		if len(actual) != len(test.output) {
			t.Fatalf("expected length: %d, got %d", len(test.output), len(actual))
		}
		for i, tok := range actual {
			if !reflect.DeepEqual(tok.Term, test.output[i].Term) {
				t.Errorf("expected term %s (% x) got %s (% x)", test.output[i].Term, test.output[i].Term, tok.Term, tok.Term)
			}
		}
	}
}