// AnalyzerConstructor builds an analysis.Analyzer from a generic config map:
// an optional "char_filters" list of names, a required "tokenizer" name, and
// an optional "token_filters" list of names, each resolved through the
// registry cache.
func AnalyzerConstructor(config map[string]interface{}, cache *registry.Cache) (*analysis.Analyzer, error) {
	// resolve any configured char filters by name
	var charFilters []analysis.CharFilter
	charFilterNames, ok := config["char_filters"].([]interface{})
	if ok {
		charFilters = make([]analysis.CharFilter, len(charFilterNames))
		for i, charFilterName := range charFilterNames {
			charFilterNameString, ok := charFilterName.(string)
			if ok {
				charFilter, err := cache.CharFilterNamed(charFilterNameString)
				if err != nil {
					return nil, err
				}
				charFilters[i] = charFilter
			} else {
				return nil, fmt.Errorf("char filter name must be a string")
			}
		}
	}

	// the tokenizer is required
	tokenizerName, ok := config["tokenizer"].(string)
	if !ok {
		return nil, fmt.Errorf("must specify tokenizer")
	}
	tokenizer, err := cache.TokenizerNamed(tokenizerName)
	if err != nil {
		return nil, err
	}

	// resolve any configured token filters by name
	var tokenFilters []analysis.TokenFilter
	tokenFilterNames, ok := config["token_filters"].([]interface{})
	if ok {
		tokenFilters = make([]analysis.TokenFilter, len(tokenFilterNames))
		for i, tokenFilterName := range tokenFilterNames {
			tokenFilterNameString, ok := tokenFilterName.(string)
			if ok {
				tokenFilter, err := cache.TokenFilterNamed(tokenFilterNameString)
				if err != nil {
					return nil, err
				}
				tokenFilters[i] = tokenFilter
			} else {
				return nil, fmt.Errorf("token filter name must be a string")
			}
		}
	}

	rv := analysis.Analyzer{
		Tokenizer: tokenizer,
	}
	if charFilters != nil {
		rv.CharFilters = charFilters
	}
	if tokenFilters != nil {
		rv.TokenFilters = tokenFilters
	}
	return &rv, nil
}
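// A minimal sketch of the configuration shape the constructor above expects:
// "char_filters" and "token_filters" are optional lists of registered names,
// and "tokenizer" is required. The specific names used here ("html",
// "unicode", "to_lower") are illustrative assumptions, not guaranteed
// registry entries.
func exampleCustomAnalyzerConfig() map[string]interface{} {
	return map[string]interface{}{
		"char_filters": []interface{}{
			"html",
		},
		"tokenizer": "unicode",
		"token_filters": []interface{}{
			"to_lower",
		},
	}
}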
// AnalyzerConstructor builds an analysis.Analyzer from a generic config map.
// This variant accepts the "char_filters" and "token_filters" entries either
// as a []string or as a []interface{} of names, delegating the lookups to the
// getCharFilters / getTokenFilters helpers.
func AnalyzerConstructor(config map[string]interface{}, cache *registry.Cache) (*analysis.Analyzer, error) {
	var err error

	// resolve any configured char filters by name
	var charFilters []analysis.CharFilter
	charFiltersNames, ok := config["char_filters"].([]string)
	if ok {
		charFilters, err = getCharFilters(charFiltersNames, cache)
		if err != nil {
			return nil, err
		}
	} else {
		charFiltersNamesInterfaceSlice, ok := config["char_filters"].([]interface{})
		if ok {
			charFiltersNames, err := convertInterfaceSliceToStringSlice(charFiltersNamesInterfaceSlice, "char filter")
			if err != nil {
				return nil, err
			}
			charFilters, err = getCharFilters(charFiltersNames, cache)
			if err != nil {
				return nil, err
			}
		}
	}

	// the tokenizer is required
	tokenizerName, ok := config["tokenizer"].(string)
	if !ok {
		return nil, fmt.Errorf("must specify tokenizer")
	}
	tokenizer, err := cache.TokenizerNamed(tokenizerName)
	if err != nil {
		return nil, err
	}

	// resolve any configured token filters by name
	var tokenFilters []analysis.TokenFilter
	tokenFiltersNames, ok := config["token_filters"].([]string)
	if ok {
		tokenFilters, err = getTokenFilters(tokenFiltersNames, cache)
		if err != nil {
			return nil, err
		}
	} else {
		tokenFiltersNamesInterfaceSlice, ok := config["token_filters"].([]interface{})
		if ok {
			tokenFiltersNames, err := convertInterfaceSliceToStringSlice(tokenFiltersNamesInterfaceSlice, "token filter")
			if err != nil {
				return nil, err
			}
			tokenFilters, err = getTokenFilters(tokenFiltersNames, cache)
			if err != nil {
				return nil, err
			}
		}
	}

	rv := analysis.Analyzer{
		Tokenizer: tokenizer,
	}
	if charFilters != nil {
		rv.CharFilters = charFilters
	}
	if tokenFilters != nil {
		rv.TokenFilters = tokenFilters
	}
	return &rv, nil
}
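// The refactored constructor above relies on three helpers that are not shown
// in this excerpt. The sketches below are one plausible shape for them,
// assuming they only wrap the registry lookups and the []interface{} to
// []string conversion; the actual implementations may differ.
func getCharFilters(charFilterNames []string, cache *registry.Cache) ([]analysis.CharFilter, error) {
	charFilters := make([]analysis.CharFilter, len(charFilterNames))
	for i, charFilterName := range charFilterNames {
		charFilter, err := cache.CharFilterNamed(charFilterName)
		if err != nil {
			return nil, err
		}
		charFilters[i] = charFilter
	}
	return charFilters, nil
}

func getTokenFilters(tokenFilterNames []string, cache *registry.Cache) ([]analysis.TokenFilter, error) {
	tokenFilters := make([]analysis.TokenFilter, len(tokenFilterNames))
	for i, tokenFilterName := range tokenFilterNames {
		tokenFilter, err := cache.TokenFilterNamed(tokenFilterName)
		if err != nil {
			return nil, err
		}
		tokenFilters[i] = tokenFilter
	}
	return tokenFilters, nil
}

func convertInterfaceSliceToStringSlice(interfaceSlice []interface{}, objType string) ([]string, error) {
	stringSlice := make([]string, len(interfaceSlice))
	for i, obj := range interfaceSlice {
		str, ok := obj.(string)
		if !ok {
			return nil, fmt.Errorf("%s name must be a string", objType)
		}
		stringSlice[i] = str
	}
	return stringSlice, nil
}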
func TestSoraniStemmerFilter(t *testing.T) {
	// in order to match the lucene tests
	// we will test with an analyzer, not just the stemmer
	analyzer := analysis.Analyzer{
		Tokenizer: single_token.NewSingleTokenTokenizer(),
		TokenFilters: []analysis.TokenFilter{
			NewSoraniNormalizeFilter(),
			NewSoraniStemmerFilter(),
		},
	}

	tests := []struct {
		input  []byte
		output analysis.TokenStream
	}{
		{
			// -ek
			input: []byte("پیاوێک"),
			output: analysis.TokenStream{
				&analysis.Token{Term: []byte("پیاو"), Position: 1, Start: 0, End: 12},
			},
		},
		{
			// -yek
			input: []byte("دەرگایەک"),
			output: analysis.TokenStream{
				&analysis.Token{Term: []byte("دەرگا"), Position: 1, Start: 0, End: 16},
			},
		},
		{
			// -aka
			input: []byte("پیاوەكە"),
			output: analysis.TokenStream{
				&analysis.Token{Term: []byte("پیاو"), Position: 1, Start: 0, End: 14},
			},
		},
		{
			// -ka
			input: []byte("دەرگاكە"),
			output: analysis.TokenStream{
				&analysis.Token{Term: []byte("دەرگا"), Position: 1, Start: 0, End: 14},
			},
		},
		{
			// -a
			input: []byte("کتاویە"),
			output: analysis.TokenStream{
				&analysis.Token{Term: []byte("کتاوی"), Position: 1, Start: 0, End: 12},
			},
		},
		{
			// -ya
			input: []byte("دەرگایە"),
			output: analysis.TokenStream{
				&analysis.Token{Term: []byte("دەرگا"), Position: 1, Start: 0, End: 14},
			},
		},
		{
			// -An
			input: []byte("پیاوان"),
			output: analysis.TokenStream{
				&analysis.Token{Term: []byte("پیاو"), Position: 1, Start: 0, End: 12},
			},
		},
		{
			// -yAn
			input: []byte("دەرگایان"),
			output: analysis.TokenStream{
				&analysis.Token{Term: []byte("دەرگا"), Position: 1, Start: 0, End: 16},
			},
		},
		{
			// -akAn
			input: []byte("پیاوەکان"),
			output: analysis.TokenStream{
				&analysis.Token{Term: []byte("پیاو"), Position: 1, Start: 0, End: 16},
			},
		},
		{
			// -kAn
			input: []byte("دەرگاکان"),
			output: analysis.TokenStream{
				&analysis.Token{Term: []byte("دەرگا"), Position: 1, Start: 0, End: 16},
			},
		},
		{
			// -Ana
			input: []byte("پیاوانە"),
			output: analysis.TokenStream{
				&analysis.Token{Term: []byte("پیاو"), Position: 1, Start: 0, End: 14},
			},
		},
		{
			// -yAna
			input: []byte("دەرگایانە"),
			output: analysis.TokenStream{
				&analysis.Token{Term: []byte("دەرگا"), Position: 1, Start: 0, End: 18},
			},
		},
		{
			// Ezafe singular
			input: []byte("هۆتیلی"),
			output: analysis.TokenStream{
				&analysis.Token{Term: []byte("هۆتیل"), Position: 1, Start: 0, End: 12},
			},
		},
		{
			// Ezafe indefinite
			input: []byte("هۆتیلێکی"),
			output: analysis.TokenStream{
				&analysis.Token{Term: []byte("هۆتیل"), Position: 1, Start: 0, End: 16},
			},
		},
		{
			// Ezafe plural
			input: []byte("هۆتیلانی"),
			output: analysis.TokenStream{
				&analysis.Token{Term: []byte("هۆتیل"), Position: 1, Start: 0, End: 16},
			},
		},
		{
			// -awa
			input: []byte("دوورەوە"),
			output: analysis.TokenStream{
				&analysis.Token{Term: []byte("دوور"), Position: 1, Start: 0, End: 14},
			},
		},
		{
			// -dA
			input: []byte("نیوەشەودا"),
			output: analysis.TokenStream{
				&analysis.Token{Term: []byte("نیوەشەو"), Position: 1, Start: 0, End: 18},
			},
		},
		{
			// -A
			input: []byte("سۆرانا"),
			output: analysis.TokenStream{
				&analysis.Token{Term: []byte("سۆران"), Position: 1, Start: 0, End: 12},
			},
		},
		{
			// -mAn
			input: []byte("پارەمان"),
			output: analysis.TokenStream{
				&analysis.Token{Term: []byte("پارە"), Position: 1, Start: 0, End: 14},
			},
		},
		{
			// -tAn
			input: []byte("پارەتان"),
			output: analysis.TokenStream{
				&analysis.Token{Term: []byte("پارە"), Position: 1, Start: 0, End: 14},
			},
		},
		{
			// -yAn
			input: []byte("پارەیان"),
			output: analysis.TokenStream{
				&analysis.Token{Term: []byte("پارە"), Position: 1, Start: 0, End: 14},
			},
		},
		{
			// empty
			input: []byte(""),
			output: analysis.TokenStream{
				&analysis.Token{Term: []byte(""), Position: 1, Start: 0, End: 0},
			},
		},
	}

	for _, test := range tests {
		actual := analyzer.Analyze(test.input)
		if !reflect.DeepEqual(actual, test.output) {
			t.Errorf("for input %s(% x)", test.input, test.input)
			t.Errorf("\texpected:")
			for _, token := range test.output {
				t.Errorf("\t\t%v %s(% x)", token, token.Term, token.Term)
			}
			t.Errorf("\tactual:")
			for _, token := range actual {
				t.Errorf("\t\t%v %s(% x)", token, token.Term, token.Term)
			}
		}
	}
}
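// The pipeline exercised by the test above (single-token tokenizer followed
// by Sorani normalization and stemming) could also be assembled through the
// custom analyzer constructor from the first listing. This is a sketch under
// assumptions: the registry names "single", "normalize_ckb" and "stemmer_ckb"
// are placeholders for whatever names the tokenizer and filters are actually
// registered under, and cache is a previously constructed *registry.Cache.
func buildSoraniStemmingAnalyzer(cache *registry.Cache) (*analysis.Analyzer, error) {
	config := map[string]interface{}{
		"tokenizer": "single",
		"token_filters": []interface{}{
			"normalize_ckb",
			"stemmer_ckb",
		},
	}
	return AnalyzerConstructor(config, cache)
}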