Example #1
0
// AnalyzerConstructor builds an *analysis.Analyzer from a generic config map.
// The "tokenizer" key is required and must be a string naming a registered
// tokenizer. "char_filters" and "token_filters" are optional []interface{}
// lists of filter names, each resolved through the registry cache; a non-string
// entry is an error.
func AnalyzerConstructor(config map[string]interface{}, cache *registry.Cache) (*analysis.Analyzer, error) {

	var charFilters []analysis.CharFilter
	if names, found := config["char_filters"].([]interface{}); found {
		charFilters = make([]analysis.CharFilter, len(names))
		for i, name := range names {
			// Guard clause: reject any non-string entry up front.
			nameStr, isString := name.(string)
			if !isString {
				return nil, fmt.Errorf("char filter name must be a string")
			}
			cf, err := cache.CharFilterNamed(nameStr)
			if err != nil {
				return nil, err
			}
			charFilters[i] = cf
		}
	}

	tokenizerName, ok := config["tokenizer"].(string)
	if !ok {
		return nil, fmt.Errorf("must specify tokenizer")
	}
	tokenizer, err := cache.TokenizerNamed(tokenizerName)
	if err != nil {
		return nil, err
	}

	var tokenFilters []analysis.TokenFilter
	if names, found := config["token_filters"].([]interface{}); found {
		tokenFilters = make([]analysis.TokenFilter, len(names))
		for i, name := range names {
			// Same validation as char filters: names must be strings.
			nameStr, isString := name.(string)
			if !isString {
				return nil, fmt.Errorf("token filter name must be a string")
			}
			tf, err := cache.TokenFilterNamed(nameStr)
			if err != nil {
				return nil, err
			}
			tokenFilters[i] = tf
		}
	}

	rv := analysis.Analyzer{
		Tokenizer: tokenizer,
	}
	// Only attach filter slices that were actually configured, so an absent
	// key leaves the analyzer fields nil rather than empty.
	if charFilters != nil {
		rv.CharFilters = charFilters
	}
	if tokenFilters != nil {
		rv.TokenFilters = tokenFilters
	}
	return &rv, nil
}
Example #2
0
// AnalyzerConstructor builds an *analysis.Analyzer from a generic config map.
// Filter name lists may be provided either as []string directly or as
// []interface{} holding strings (the shape JSON decoding produces); both are
// resolved through the registry cache. The "tokenizer" key is required.
func AnalyzerConstructor(config map[string]interface{}, cache *registry.Cache) (*analysis.Analyzer, error) {

	var err error

	// A type switch replaces the original's cascaded type assertions: a value
	// is at most one of []string / []interface{}, so exactly one case fires.
	var charFilters []analysis.CharFilter
	switch names := config["char_filters"].(type) {
	case []string:
		charFilters, err = getCharFilters(names, cache)
		if err != nil {
			return nil, err
		}
	case []interface{}:
		asStrings, convErr := convertInterfaceSliceToStringSlice(names, "char filter")
		if convErr != nil {
			return nil, convErr
		}
		charFilters, err = getCharFilters(asStrings, cache)
		if err != nil {
			return nil, err
		}
	}

	tokenizerName, ok := config["tokenizer"].(string)
	if !ok {
		return nil, fmt.Errorf("must specify tokenizer")
	}
	tokenizer, err := cache.TokenizerNamed(tokenizerName)
	if err != nil {
		return nil, err
	}

	var tokenFilters []analysis.TokenFilter
	switch names := config["token_filters"].(type) {
	case []string:
		tokenFilters, err = getTokenFilters(names, cache)
		if err != nil {
			return nil, err
		}
	case []interface{}:
		asStrings, convErr := convertInterfaceSliceToStringSlice(names, "token filter")
		if convErr != nil {
			return nil, convErr
		}
		tokenFilters, err = getTokenFilters(asStrings, cache)
		if err != nil {
			return nil, err
		}
	}

	rv := analysis.Analyzer{
		Tokenizer: tokenizer,
	}
	// Only attach filter slices that were actually configured, so an absent
	// or unrecognized key leaves the analyzer fields nil.
	if charFilters != nil {
		rv.CharFilters = charFilters
	}
	if tokenFilters != nil {
		rv.TokenFilters = tokenFilters
	}
	return &rv, nil
}
// TestSoraniStemmerFilter checks the Sorani (Kurdish) stemmer against the
// suffix-stripping cases from the corresponding Lucene test suite. Each case
// is a single word; the comment on the case names the suffix being stripped.
func TestSoraniStemmerFilter(t *testing.T) {

	// in order to match the lucene tests
	// we will test with an analyzer, not just the stemmer
	// (normalization runs first, then stemming, mirroring Lucene's pipeline)
	analyzer := analysis.Analyzer{
		Tokenizer: single_token.NewSingleTokenTokenizer(),
		TokenFilters: []analysis.TokenFilter{
			NewSoraniNormalizeFilter(),
			NewSoraniStemmerFilter(),
		},
	}

	// Expected Start/End are byte offsets into the original input.
	// NOTE(review): End appears to equal the input's UTF-8 byte length
	// (each Sorani character encodes as 2 bytes) — confirm against tokenizer.
	tests := []struct {
		input  []byte
		output analysis.TokenStream
	}{
		{ // -ek
			input: []byte("پیاوێک"),
			output: analysis.TokenStream{
				&analysis.Token{
					Term:     []byte("پیاو"),
					Position: 1,
					Start:    0,
					End:      12,
				},
			},
		},
		{ // -yek
			input: []byte("دەرگایەک"),
			output: analysis.TokenStream{
				&analysis.Token{
					Term:     []byte("دەرگا"),
					Position: 1,
					Start:    0,
					End:      16,
				},
			},
		},
		{ // -aka
			input: []byte("پیاوەكە"),
			output: analysis.TokenStream{
				&analysis.Token{
					Term:     []byte("پیاو"),
					Position: 1,
					Start:    0,
					End:      14,
				},
			},
		},
		{ // -ka
			input: []byte("دەرگاكە"),
			output: analysis.TokenStream{
				&analysis.Token{
					Term:     []byte("دەرگا"),
					Position: 1,
					Start:    0,
					End:      14,
				},
			},
		},
		{ // -a
			input: []byte("کتاویە"),
			output: analysis.TokenStream{
				&analysis.Token{
					Term:     []byte("کتاوی"),
					Position: 1,
					Start:    0,
					End:      12,
				},
			},
		},
		{ // -ya
			input: []byte("دەرگایە"),
			output: analysis.TokenStream{
				&analysis.Token{
					Term:     []byte("دەرگا"),
					Position: 1,
					Start:    0,
					End:      14,
				},
			},
		},
		{ // -An
			input: []byte("پیاوان"),
			output: analysis.TokenStream{
				&analysis.Token{
					Term:     []byte("پیاو"),
					Position: 1,
					Start:    0,
					End:      12,
				},
			},
		},
		{ // -yAn
			input: []byte("دەرگایان"),
			output: analysis.TokenStream{
				&analysis.Token{
					Term:     []byte("دەرگا"),
					Position: 1,
					Start:    0,
					End:      16,
				},
			},
		},
		{ // -akAn
			input: []byte("پیاوەکان"),
			output: analysis.TokenStream{
				&analysis.Token{
					Term:     []byte("پیاو"),
					Position: 1,
					Start:    0,
					End:      16,
				},
			},
		},
		{ // -kAn
			input: []byte("دەرگاکان"),
			output: analysis.TokenStream{
				&analysis.Token{
					Term:     []byte("دەرگا"),
					Position: 1,
					Start:    0,
					End:      16,
				},
			},
		},
		{ // -Ana
			input: []byte("پیاوانە"),
			output: analysis.TokenStream{
				&analysis.Token{
					Term:     []byte("پیاو"),
					Position: 1,
					Start:    0,
					End:      14,
				},
			},
		},
		{ // -yAna
			input: []byte("دەرگایانە"),
			output: analysis.TokenStream{
				&analysis.Token{
					Term:     []byte("دەرگا"),
					Position: 1,
					Start:    0,
					End:      18,
				},
			},
		},
		{ // Ezafe singular
			input: []byte("هۆتیلی"),
			output: analysis.TokenStream{
				&analysis.Token{
					Term:     []byte("هۆتیل"),
					Position: 1,
					Start:    0,
					End:      12,
				},
			},
		},
		{ // Ezafe indefinite
			input: []byte("هۆتیلێکی"),
			output: analysis.TokenStream{
				&analysis.Token{
					Term:     []byte("هۆتیل"),
					Position: 1,
					Start:    0,
					End:      16,
				},
			},
		},
		{ // Ezafe plural
			input: []byte("هۆتیلانی"),
			output: analysis.TokenStream{
				&analysis.Token{
					Term:     []byte("هۆتیل"),
					Position: 1,
					Start:    0,
					End:      16,
				},
			},
		},
		{ // -awa
			input: []byte("دوورەوە"),
			output: analysis.TokenStream{
				&analysis.Token{
					Term:     []byte("دوور"),
					Position: 1,
					Start:    0,
					End:      14,
				},
			},
		},
		{ // -dA
			input: []byte("نیوەشەودا"),
			output: analysis.TokenStream{
				&analysis.Token{
					Term:     []byte("نیوەشەو"),
					Position: 1,
					Start:    0,
					End:      18,
				},
			},
		},
		{ // -A
			input: []byte("سۆرانا"),
			output: analysis.TokenStream{
				&analysis.Token{
					Term:     []byte("سۆران"),
					Position: 1,
					Start:    0,
					End:      12,
				},
			},
		},
		{ // -mAn
			input: []byte("پارەمان"),
			output: analysis.TokenStream{
				&analysis.Token{
					Term:     []byte("پارە"),
					Position: 1,
					Start:    0,
					End:      14,
				},
			},
		},
		{ // -tAn
			input: []byte("پارەتان"),
			output: analysis.TokenStream{
				&analysis.Token{
					Term:     []byte("پارە"),
					Position: 1,
					Start:    0,
					End:      14,
				},
			},
		},
		{ // -yAn
			input: []byte("پارەیان"),
			output: analysis.TokenStream{
				&analysis.Token{
					Term:     []byte("پارە"),
					Position: 1,
					Start:    0,
					End:      14,
				},
			},
		},
		{ // empty
			input: []byte(""),
			output: analysis.TokenStream{
				&analysis.Token{
					Term:     []byte(""),
					Position: 1,
					Start:    0,
					End:      0,
				},
			},
		},
	}

	// Require an exact match of the full token stream (term bytes, position,
	// and offsets); on mismatch dump both streams with hex for debugging RTL text.
	for _, test := range tests {
		actual := analyzer.Analyze(test.input)
		if !reflect.DeepEqual(actual, test.output) {
			t.Errorf("for input %s(% x)", test.input, test.input)
			t.Errorf("\texpected:")
			for _, token := range test.output {
				t.Errorf("\t\t%v %s(% x)", token, token.Term, token.Term)
			}
			t.Errorf("\tactual:")
			for _, token := range actual {
				t.Errorf("\t\t%v %s(% x)", token, token.Term, token.Term)
			}
		}
	}
}