Example #1
func BenchmarkBatch(b *testing.B) {

	cache := registry.NewCache()
	analyzer, err := cache.AnalyzerNamed(standard_analyzer.Name)
	if err != nil {
		b.Fatal(err)
	}

	analysisQueue := index.NewAnalysisQueue(1)
	idx, err := NewUpsideDownCouch(null.Name, nil, analysisQueue)
	if err != nil {
		b.Fatal(err)
	}
	err = idx.Open()
	if err != nil {
		b.Fatal(err)
	}

	batch := index.NewBatch()
	for i := 0; i < 100; i++ {
		d := document.NewDocument(strconv.Itoa(i))
		f := document.NewTextFieldWithAnalyzer("desc", nil, bleveWikiArticle1K, analyzer)
		d.AddField(f)
		batch.Update(d)
	}

	b.ResetTimer()

	for i := 0; i < b.N; i++ {
		err = idx.Batch(batch)
		if err != nil {
			b.Fatal(err)
		}
	}
}
Example #2
func TestFrenchElision(t *testing.T) {
	tests := []struct {
		input  analysis.TokenStream
		output analysis.TokenStream
	}{
		{
			input: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("l'avion"),
				},
			},
			output: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("avion"),
				},
			},
		},
	}

	cache := registry.NewCache()
	elisionFilter, err := cache.TokenFilterNamed(ElisionName)
	if err != nil {
		t.Fatal(err)
	}
	for _, test := range tests {
		actual := elisionFilter.Filter(test.input)
		if !reflect.DeepEqual(actual, test.output) {
			t.Errorf("expected %s, got %s", test.output[0].Term, actual[0].Term)
		}
	}
}
Example #3
func BenchmarkAnalyze(b *testing.B) {

	cache := registry.NewCache()
	analyzer, err := cache.AnalyzerNamed(standard_analyzer.Name)
	if err != nil {
		b.Fatal(err)
	}

	analysisQueue := index.NewAnalysisQueue(1)
	idx, err := NewFirestorm(null.Name, nil, analysisQueue)
	if err != nil {
		b.Fatal(err)
	}

	d := document.NewDocument("1")
	f := document.NewTextFieldWithAnalyzer("desc", nil, bleveWikiArticle1K, analyzer)
	d.AddField(f)

	b.ResetTimer()

	for i := 0; i < b.N; i++ {
		rv := idx.Analyze(d)
		if len(rv.Rows) < 92 || len(rv.Rows) > 93 {
			b.Fatalf("expected 512-13 rows, got %d", len(rv.Rows))
		}
	}
}
Example #4
func TestElisionFilter(t *testing.T) {

	tests := []struct {
		input  analysis.TokenStream
		output analysis.TokenStream
	}{
		{
			input: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("ar" + string(Apostrophe) + "word"),
				},
			},
			output: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("word"),
				},
			},
		},
		{
			input: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("ar" + string(RightSingleQuotationMark) + "word"),
				},
			},
			output: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("word"),
				},
			},
		},
	}

	cache := registry.NewCache()

	articleListConfig := map[string]interface{}{
		"type":   token_map.Name,
		"tokens": []interface{}{"ar"},
	}
	_, err := cache.DefineTokenMap("articles_test", articleListConfig)
	if err != nil {
		t.Fatal(err)
	}

	elisionConfig := map[string]interface{}{
		"type":               "elision",
		"articles_token_map": "articles_test",
	}
	elisionFilter, err := cache.DefineTokenFilter("elision_test", elisionConfig)
	if err != nil {
		t.Fatal(err)
	}

	for _, test := range tests {

		actual := elisionFilter.Filter(test.input)
		if !reflect.DeepEqual(actual, test.output) {
			t.Errorf("expected %s, got %s", test.output[0].Term, actual[0].Term)
		}
	}
}
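Example #5
// CommonBenchmarkIndexBatch indexes 1,000 generated documents per benchmark
// iteration in batches of batchSize against a fresh index on the named KV
// store, timing only the batched indexing work (not index setup or teardown).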
func CommonBenchmarkIndexBatch(b *testing.B, storeName string, storeConfig map[string]interface{}, destroy KVStoreDestroy, analysisWorkers, batchSize int) {

	cache := registry.NewCache()
	analyzer, err := cache.AnalyzerNamed("standard")
	if err != nil {
		b.Fatal(err)
	}

	b.ResetTimer()
	b.StopTimer()
	for i := 0; i < b.N; i++ {

		analysisQueue := index.NewAnalysisQueue(analysisWorkers)
		idx, err := NewUpsideDownCouch(storeName, storeConfig, analysisQueue)
		if err != nil {
			b.Fatal(err)
		}

		err = idx.Open()
		if err != nil {
			b.Fatal(err)
		}

		b.StartTimer()
		batch := index.NewBatch()
		for j := 0; j < 1000; j++ {
			if j%batchSize == 0 {
				if len(batch.IndexOps) > 0 {
					err := idx.Batch(batch)
					if err != nil {
						b.Fatal(err)
					}
				}
				batch = index.NewBatch()
			}
			indexDocument := document.NewDocument("").
				AddField(document.NewTextFieldWithAnalyzer("body", []uint64{}, []byte(benchmarkDocBodies[j%10]), analyzer))
			indexDocument.ID = strconv.Itoa(i) + "-" + strconv.Itoa(j)
			batch.Update(indexDocument)
		}
		// close last batch
		if len(batch.IndexOps) > 0 {
			err := idx.Batch(batch)
			if err != nil {
				b.Fatal(err)
			}
		}
		b.StopTimer()
		err = idx.Close()
		if err != nil {
			b.Fatal(err)
		}
		err = destroy()
		if err != nil {
			b.Fatal(err)
		}
		analysisQueue.Close()
	}
}
Example #6
func TestStopWordsFilterLongestMatch(t *testing.T) {

	inputTokenStream := analysis.TokenStream{
		&analysis.Token{
			Term:     []byte("softestball"),
			Start:    0,
			End:      11,
			Position: 1,
		},
	}

	expectedTokenStream := analysis.TokenStream{
		&analysis.Token{
			Term:     []byte("softestball"),
			Start:    0,
			End:      11,
			Position: 1,
		},
		&analysis.Token{
			Term:     []byte("softest"),
			Start:    0,
			End:      7,
			Position: 1,
		},
		&analysis.Token{
			Term:     []byte("ball"),
			Start:    7,
			End:      11,
			Position: 1,
		},
	}

	cache := registry.NewCache()
	dictListConfig := map[string]interface{}{
		"type":   token_map.Name,
		"tokens": []interface{}{"soft", "softest", "ball"},
	}
	_, err := cache.DefineTokenMap("dict_test", dictListConfig)
	if err != nil {
		t.Fatal(err)
	}

	dictConfig := map[string]interface{}{
		"type":               "dict_compound",
		"dict_token_map":     "dict_test",
		"only_longest_match": true,
	}
	dictFilter, err := cache.DefineTokenFilter("dict_test", dictConfig)
	if err != nil {
		t.Fatal(err)
	}

	outputTokenStream := dictFilter.Filter(inputTokenStream)
	if !reflect.DeepEqual(outputTokenStream, expectedTokenStream) {
		t.Errorf("expected %#v got %#v", expectedTokenStream, outputTokenStream)
	}
}
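Example #7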
func TestStopWordsFilter(t *testing.T) {

	inputTokenStream := analysis.TokenStream{
		&analysis.Token{
			Term: []byte("a"),
		},
		&analysis.Token{
			Term: []byte("walk"),
		},
		&analysis.Token{
			Term: []byte("in"),
		},
		&analysis.Token{
			Term: []byte("the"),
		},
		&analysis.Token{
			Term: []byte("park"),
		},
	}

	expectedTokenStream := analysis.TokenStream{
		&analysis.Token{
			Term: []byte("walk"),
		},
		&analysis.Token{
			Term: []byte("park"),
		},
	}

	cache := registry.NewCache()
	stopListConfig := map[string]interface{}{
		"type":   token_map.Name,
		"tokens": []interface{}{"a", "in", "the"},
	}
	_, err := cache.DefineTokenMap("stop_test", stopListConfig)
	if err != nil {
		t.Fatal(err)
	}

	stopConfig := map[string]interface{}{
		"type":           "stop_tokens",
		"stop_token_map": "stop_test",
	}
	stopFilter, err := cache.DefineTokenFilter("stop_test", stopConfig)
	if err != nil {
		t.Fatal(err)
	}

	outputTokenStream := stopFilter.Filter(inputTokenStream)
	if !reflect.DeepEqual(outputTokenStream, expectedTokenStream) {
		t.Errorf("expected %#v got %#v", expectedTokenStream, outputTokenStream)
	}
}
Example #8
func TestSoraniAnalyzer(t *testing.T) {
	tests := []struct {
		input  []byte
		output analysis.TokenStream
	}{
		// stop word removal
		{
			input: []byte("ئەم پیاوە"),
			output: analysis.TokenStream{
				&analysis.Token{
					Term:     []byte("پیاو"),
					Position: 2,
					Start:    7,
					End:      17,
				},
			},
		},
		{
			input: []byte("پیاوە"),
			output: analysis.TokenStream{
				&analysis.Token{
					Term:     []byte("پیاو"),
					Position: 1,
					Start:    0,
					End:      10,
				},
			},
		},
		{
			input: []byte("پیاو"),
			output: analysis.TokenStream{
				&analysis.Token{
					Term:     []byte("پیاو"),
					Position: 1,
					Start:    0,
					End:      8,
				},
			},
		},
	}

	cache := registry.NewCache()
	analyzer, err := cache.AnalyzerNamed(AnalyzerName)
	if err != nil {
		t.Fatal(err)
	}
	for _, test := range tests {
		actual := analyzer.Analyze(test.input)
		if !reflect.DeepEqual(actual, test.output) {
			t.Errorf("expected %v, got %v", test.output, actual)
		}
	}
}
Example #9
// NewIndexMapping creates a new IndexMapping that will use all the default indexing rules
func NewIndexMapping() *IndexMapping {
	return &IndexMapping{
		TypeMapping:           make(map[string]*DocumentMapping),
		DefaultMapping:        NewDocumentMapping(),
		TypeField:             defaultTypeField,
		DefaultType:           defaultType,
		DefaultAnalyzer:       defaultAnalyzer,
		DefaultDateTimeParser: defaultDateTimeParser,
		DefaultField:          defaultField,
		ByteArrayConverter:    defaultByteArrayConverter,
		CustomAnalysis:        newCustomAnalysis(),
		cache:                 registry.NewCache(),
	}
}
Example #10
func TestJaAnalyzer(t *testing.T) {
	tests := []struct {
		input  []byte
		output analysis.TokenStream
	}{
		{
			input: []byte("こんにちは世界"),
			output: analysis.TokenStream{
				&analysis.Token{
					Term:     []byte("こんにちは"),
					Type:     analysis.Ideographic,
					Position: 1,
					Start:    0,
					End:      15,
				},
				&analysis.Token{
					Term:     []byte("世界"),
					Type:     analysis.Ideographic,
					Position: 2,
					Start:    15,
					End:      21,
				},
			},
		},
		{
			input: []byte("カタカナ"),
			output: analysis.TokenStream{
				&analysis.Token{
					Term:     []byte("カタカナ"),
					Type:     analysis.Ideographic,
					Position: 1,
					Start:    0,
					End:      12,
				},
			},
		},
	}

	cache := registry.NewCache()
	for _, test := range tests {
		analyzer, err := cache.AnalyzerNamed(AnalyzerName)
		if err != nil {
			t.Fatal(err)
		}
		actual := analyzer.Analyze(test.input)
		if !reflect.DeepEqual(actual, test.output) {
			t.Errorf("expected %v, got %v", test.output, actual)
		}
	}
}
Example #11
func BenchmarkAnalysis(b *testing.B) {
	for i := 0; i < b.N; i++ {

		cache := registry.NewCache()
		analyzer, err := cache.AnalyzerNamed(standard_analyzer.Name)
		if err != nil {
			b.Fatal(err)
		}

		ts := analyzer.Analyze(bleveWikiArticle)
		freqs := analysis.TokenFrequency(ts, nil, true)
		if len(freqs) != 511 {
			b.Errorf("expected %d freqs, got %d", 511, len(freqs))
		}
	}
}
Example #12
func TestPortugueseAnalyzer(t *testing.T) {
	tests := []struct {
		input  []byte
		output analysis.TokenStream
	}{
		// stemming
		{
			input: []byte("quilométricas"),
			output: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("quilometric"),
				},
			},
		},
		{
			input: []byte("quilométricos"),
			output: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("quilometric"),
				},
			},
		},
		// stop word
		{
			input:  []byte("não"),
			output: analysis.TokenStream{},
		},
	}

	cache := registry.NewCache()
	analyzer, err := cache.AnalyzerNamed(AnalyzerName)
	if err != nil {
		t.Fatal(err)
	}
	for _, test := range tests {
		actual := analyzer.Analyze(test.input)
		if len(actual) != len(test.output) {
			t.Fatalf("expected length: %d, got %d", len(test.output), len(actual))
		}
		for i, tok := range actual {
			if !reflect.DeepEqual(tok.Term, test.output[i].Term) {
				t.Errorf("expected term %s (% x) got %s (% x)", test.output[i].Term, test.output[i].Term, tok.Term, tok.Term)
			}
		}
	}
}
Example #13
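// CommonBenchmarkIndex indexes a single document per benchmark iteration
// against a fresh index on the named KV store, timing only the idx.Update call.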
func CommonBenchmarkIndex(b *testing.B, storeName string, storeConfig map[string]interface{}, destroy KVStoreDestroy, analysisWorkers int) {

	cache := registry.NewCache()
	analyzer, err := cache.AnalyzerNamed("standard")
	if err != nil {
		b.Fatal(err)
	}

	indexDocument := document.NewDocument("").
		AddField(document.NewTextFieldWithAnalyzer("body", []uint64{}, []byte(benchmarkDocBodies[0]), analyzer))

	b.ResetTimer()
	b.StopTimer()
	for i := 0; i < b.N; i++ {
		analysisQueue := index.NewAnalysisQueue(analysisWorkers)
		idx, err := NewUpsideDownCouch(storeName, storeConfig, analysisQueue)
		if err != nil {
			b.Fatal(err)
		}

		err = idx.Open()
		if err != nil {
			b.Fatal(err)
		}
		indexDocument.ID = strconv.Itoa(i)
		// just time the indexing portion
		b.StartTimer()
		err = idx.Update(indexDocument)
		if err != nil {
			b.Fatal(err)
		}
		b.StopTimer()
		err = idx.Close()
		if err != nil {
			b.Fatal(err)
		}
		err = destroy()
		if err != nil {
			b.Fatal(err)
		}
		analysisQueue.Close()
	}
}
Example #14
func TestItalianLightStemmer(t *testing.T) {
	tests := []struct {
		input  analysis.TokenStream
		output analysis.TokenStream
	}{
		{
			input: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("ragazzo"),
				},
			},
			output: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("ragazz"),
				},
			},
		},
		{
			input: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("ragazzi"),
				},
			},
			output: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("ragazz"),
				},
			},
		},
	}

	cache := registry.NewCache()
	filter, err := cache.TokenFilterNamed(LightStemmerName)
	if err != nil {
		t.Fatal(err)
	}
	for _, test := range tests {
		actual := filter.Filter(test.input)
		if !reflect.DeepEqual(actual, test.output) {
			t.Errorf("expected %s, got %s", test.output[0].Term, actual[0].Term)
		}
	}
}
Example #15
func TestHindiAnalyzer(t *testing.T) {
	tests := []struct {
		input  []byte
		output analysis.TokenStream
	}{
		// two ways to write 'hindi' itself
		{
			input: []byte("हिन्दी"),
			output: analysis.TokenStream{
				&analysis.Token{
					Term:     []byte("हिंद"),
					Position: 1,
					Start:    0,
					End:      18,
				},
			},
		},
		{
			input: []byte("हिंदी"),
			output: analysis.TokenStream{
				&analysis.Token{
					Term:     []byte("हिंद"),
					Position: 1,
					Start:    0,
					End:      15,
				},
			},
		},
	}

	cache := registry.NewCache()
	analyzer, err := cache.AnalyzerNamed(AnalyzerName)
	if err != nil {
		t.Fatal(err)
	}
	for _, test := range tests {
		actual := analyzer.Analyze(test.input)
		if !reflect.DeepEqual(actual, test.output) {
			t.Errorf("expected %v, got %v", test.output, actual)
		}
	}
}
Example #16
func TestCJKAnalyzer(t *testing.T) {
	tests := []struct {
		input  []byte
		output analysis.TokenStream
	}{
		{
			input: []byte("こんにちは世界"),
			output: analysis.TokenStream{
				&analysis.Token{
					Term:     []byte("こん"),
					Type:     analysis.Double,
					Position: 1,
					Start:    0,
					End:      6,
				},
				&analysis.Token{
					Term:     []byte("んに"),
					Type:     analysis.Double,
					Position: 2,
					Start:    3,
					End:      9,
				},
				&analysis.Token{
					Term:     []byte("にち"),
					Type:     analysis.Double,
					Position: 3,
					Start:    6,
					End:      12,
				},
				&analysis.Token{
					Term:     []byte("ちは"),
					Type:     analysis.Double,
					Position: 4,
					Start:    9,
					End:      15,
				},
				&analysis.Token{
					Term:     []byte("は世"),
					Type:     analysis.Double,
					Position: 5,
					Start:    12,
					End:      18,
				},
				&analysis.Token{
					Term:     []byte("世界"),
					Type:     analysis.Double,
					Position: 6,
					Start:    15,
					End:      21,
				},
			},
		},
		{
			input: []byte("一二三四五六七八九十"),
			output: analysis.TokenStream{
				&analysis.Token{
					Term:     []byte("一二"),
					Type:     analysis.Double,
					Position: 1,
					Start:    0,
					End:      6,
				},
				&analysis.Token{
					Term:     []byte("二三"),
					Type:     analysis.Double,
					Position: 2,
					Start:    3,
					End:      9,
				},
				&analysis.Token{
					Term:     []byte("三四"),
					Type:     analysis.Double,
					Position: 3,
					Start:    6,
					End:      12,
				},
				&analysis.Token{
					Term:     []byte("四五"),
					Type:     analysis.Double,
					Position: 4,
					Start:    9,
					End:      15,
				},
				&analysis.Token{
					Term:     []byte("五六"),
					Type:     analysis.Double,
					Position: 5,
					Start:    12,
					End:      18,
				},
				&analysis.Token{
					Term:     []byte("六七"),
					Type:     analysis.Double,
					Position: 6,
					Start:    15,
					End:      21,
				},
				&analysis.Token{
					Term:     []byte("七八"),
					Type:     analysis.Double,
					Position: 7,
					Start:    18,
					End:      24,
				},
				&analysis.Token{
					Term:     []byte("八九"),
					Type:     analysis.Double,
					Position: 8,
					Start:    21,
					End:      27,
				},
				&analysis.Token{
					Term:     []byte("九十"),
					Type:     analysis.Double,
					Position: 9,
					Start:    24,
					End:      30,
				},
			},
		},
		{
			input: []byte("一 二三四 五六七八九 十"),
			output: analysis.TokenStream{
				&analysis.Token{
					Term:     []byte("一"),
					Type:     analysis.Single,
					Position: 1,
					Start:    0,
					End:      3,
				},
				&analysis.Token{
					Term:     []byte("二三"),
					Type:     analysis.Double,
					Position: 2,
					Start:    4,
					End:      10,
				},
				&analysis.Token{
					Term:     []byte("三四"),
					Type:     analysis.Double,
					Position: 3,
					Start:    7,
					End:      13,
				},
				&analysis.Token{
					Term:     []byte("五六"),
					Type:     analysis.Double,
					Position: 5,
					Start:    14,
					End:      20,
				},
				&analysis.Token{
					Term:     []byte("六七"),
					Type:     analysis.Double,
					Position: 6,
					Start:    17,
					End:      23,
				},
				&analysis.Token{
					Term:     []byte("七八"),
					Type:     analysis.Double,
					Position: 7,
					Start:    20,
					End:      26,
				},
				&analysis.Token{
					Term:     []byte("八九"),
					Type:     analysis.Double,
					Position: 8,
					Start:    23,
					End:      29,
				},
				&analysis.Token{
					Term:     []byte("十"),
					Type:     analysis.Single,
					Position: 10,
					Start:    30,
					End:      33,
				},
			},
		},
		{
			input: []byte("abc defgh ijklmn opqrstu vwxy z"),
			output: analysis.TokenStream{
				&analysis.Token{
					Term:     []byte("abc"),
					Type:     analysis.AlphaNumeric,
					Position: 1,
					Start:    0,
					End:      3,
				},
				&analysis.Token{
					Term:     []byte("defgh"),
					Type:     analysis.AlphaNumeric,
					Position: 2,
					Start:    4,
					End:      9,
				},
				&analysis.Token{
					Term:     []byte("ijklmn"),
					Type:     analysis.AlphaNumeric,
					Position: 3,
					Start:    10,
					End:      16,
				},
				&analysis.Token{
					Term:     []byte("opqrstu"),
					Type:     analysis.AlphaNumeric,
					Position: 4,
					Start:    17,
					End:      24,
				},
				&analysis.Token{
					Term:     []byte("vwxy"),
					Type:     analysis.AlphaNumeric,
					Position: 5,
					Start:    25,
					End:      29,
				},
				&analysis.Token{
					Term:     []byte("z"),
					Type:     analysis.AlphaNumeric,
					Position: 6,
					Start:    30,
					End:      31,
				},
			},
		},
		{
			input: []byte("あい"),
			output: analysis.TokenStream{
				&analysis.Token{
					Term:     []byte("あい"),
					Type:     analysis.Double,
					Position: 1,
					Start:    0,
					End:      6,
				},
			},
		},
		{
			input: []byte("あい   "),
			output: analysis.TokenStream{
				&analysis.Token{
					Term:     []byte("あい"),
					Type:     analysis.Double,
					Position: 1,
					Start:    0,
					End:      6,
				},
			},
		},
		{
			input: []byte("test"),
			output: analysis.TokenStream{
				&analysis.Token{
					Term:     []byte("test"),
					Type:     analysis.AlphaNumeric,
					Position: 1,
					Start:    0,
					End:      4,
				},
			},
		},
		{
			input: []byte("test   "),
			output: analysis.TokenStream{
				&analysis.Token{
					Term:     []byte("test"),
					Type:     analysis.AlphaNumeric,
					Position: 1,
					Start:    0,
					End:      4,
				},
			},
		},
		{
			input: []byte("あいtest"),
			output: analysis.TokenStream{
				&analysis.Token{
					Term:     []byte("あい"),
					Type:     analysis.Double,
					Position: 1,
					Start:    0,
					End:      6,
				},
				&analysis.Token{
					Term:     []byte("test"),
					Type:     analysis.AlphaNumeric,
					Position: 3,
					Start:    6,
					End:      10,
				},
			},
		},
		{
			input: []byte("testあい    "),
			output: analysis.TokenStream{
				&analysis.Token{
					Term:     []byte("test"),
					Type:     analysis.AlphaNumeric,
					Position: 1,
					Start:    0,
					End:      4,
				},
				&analysis.Token{
					Term:     []byte("あい"),
					Type:     analysis.Double,
					Position: 2,
					Start:    4,
					End:      10,
				},
			},
		},
		{
			input: []byte("あいうえおabcかきくけこ"),
			output: analysis.TokenStream{
				&analysis.Token{
					Term:     []byte("あい"),
					Type:     analysis.Double,
					Position: 1,
					Start:    0,
					End:      6,
				},
				&analysis.Token{
					Term:     []byte("いう"),
					Type:     analysis.Double,
					Position: 2,
					Start:    3,
					End:      9,
				},
				&analysis.Token{
					Term:     []byte("うえ"),
					Type:     analysis.Double,
					Position: 3,
					Start:    6,
					End:      12,
				},
				&analysis.Token{
					Term:     []byte("えお"),
					Type:     analysis.Double,
					Position: 4,
					Start:    9,
					End:      15,
				},
				&analysis.Token{
					Term:     []byte("abc"),
					Type:     analysis.AlphaNumeric,
					Position: 6,
					Start:    15,
					End:      18,
				},
				&analysis.Token{
					Term:     []byte("かき"),
					Type:     analysis.Double,
					Position: 7,
					Start:    18,
					End:      24,
				},
				&analysis.Token{
					Term:     []byte("きく"),
					Type:     analysis.Double,
					Position: 8,
					Start:    21,
					End:      27,
				},
				&analysis.Token{
					Term:     []byte("くけ"),
					Type:     analysis.Double,
					Position: 9,
					Start:    24,
					End:      30,
				},
				&analysis.Token{
					Term:     []byte("けこ"),
					Type:     analysis.Double,
					Position: 10,
					Start:    27,
					End:      33,
				},
			},
		},
		{
			input: []byte("あいうえおabんcかきくけ こ"),
			output: analysis.TokenStream{
				&analysis.Token{
					Term:     []byte("あい"),
					Type:     analysis.Double,
					Position: 1,
					Start:    0,
					End:      6,
				},
				&analysis.Token{
					Term:     []byte("いう"),
					Type:     analysis.Double,
					Position: 2,
					Start:    3,
					End:      9,
				},
				&analysis.Token{
					Term:     []byte("うえ"),
					Type:     analysis.Double,
					Position: 3,
					Start:    6,
					End:      12,
				},
				&analysis.Token{
					Term:     []byte("えお"),
					Type:     analysis.Double,
					Position: 4,
					Start:    9,
					End:      15,
				},
				&analysis.Token{
					Term:     []byte("ab"),
					Type:     analysis.AlphaNumeric,
					Position: 6,
					Start:    15,
					End:      17,
				},
				&analysis.Token{
					Term:     []byte("ん"),
					Type:     analysis.Single,
					Position: 7,
					Start:    17,
					End:      20,
				},
				&analysis.Token{
					Term:     []byte("c"),
					Type:     analysis.AlphaNumeric,
					Position: 8,
					Start:    20,
					End:      21,
				},
				&analysis.Token{
					Term:     []byte("かき"),
					Type:     analysis.Double,
					Position: 9,
					Start:    21,
					End:      27,
				},
				&analysis.Token{
					Term:     []byte("きく"),
					Type:     analysis.Double,
					Position: 10,
					Start:    24,
					End:      30,
				},
				&analysis.Token{
					Term:     []byte("くけ"),
					Type:     analysis.Double,
					Position: 11,
					Start:    27,
					End:      33,
				},
				&analysis.Token{
					Term:     []byte("こ"),
					Type:     analysis.Single,
					Position: 13,
					Start:    34,
					End:      37,
				},
			},
		},
		{
			input: []byte("一 روبرت موير"),
			output: analysis.TokenStream{
				&analysis.Token{
					Term:     []byte("一"),
					Type:     analysis.Single,
					Position: 1,
					Start:    0,
					End:      3,
				},
				&analysis.Token{
					Term:     []byte("روبرت"),
					Type:     analysis.AlphaNumeric,
					Position: 2,
					Start:    4,
					End:      14,
				},
				&analysis.Token{
					Term:     []byte("موير"),
					Type:     analysis.AlphaNumeric,
					Position: 3,
					Start:    15,
					End:      23,
				},
			},
		},
		{
			input: []byte("一 رُوبرت موير"),
			output: analysis.TokenStream{
				&analysis.Token{
					Term:     []byte("一"),
					Type:     analysis.Single,
					Position: 1,
					Start:    0,
					End:      3,
				},
				&analysis.Token{
					Term:     []byte("رُوبرت"),
					Type:     analysis.AlphaNumeric,
					Position: 2,
					Start:    4,
					End:      16,
				},
				&analysis.Token{
					Term:     []byte("موير"),
					Type:     analysis.AlphaNumeric,
					Position: 3,
					Start:    17,
					End:      25,
				},
			},
		},
		{
			input: []byte("𩬅艱鍟䇹愯瀛"),
			output: analysis.TokenStream{
				&analysis.Token{
					Term:     []byte("𩬅艱"),
					Type:     analysis.Double,
					Position: 1,
					Start:    0,
					End:      7,
				},
				&analysis.Token{
					Term:     []byte("艱鍟"),
					Type:     analysis.Double,
					Position: 2,
					Start:    4,
					End:      10,
				},
				&analysis.Token{
					Term:     []byte("鍟䇹"),
					Type:     analysis.Double,
					Position: 3,
					Start:    7,
					End:      13,
				},
				&analysis.Token{
					Term:     []byte("䇹愯"),
					Type:     analysis.Double,
					Position: 4,
					Start:    10,
					End:      16,
				},
				&analysis.Token{
					Term:     []byte("愯瀛"),
					Type:     analysis.Double,
					Position: 5,
					Start:    13,
					End:      19,
				},
			},
		},
		{
			input: []byte("一"),
			output: analysis.TokenStream{
				&analysis.Token{
					Term:     []byte("一"),
					Type:     analysis.Single,
					Position: 1,
					Start:    0,
					End:      3,
				},
			},
		},
		{
			input: []byte("一丁丂"),
			output: analysis.TokenStream{
				&analysis.Token{
					Term:     []byte("一丁"),
					Type:     analysis.Double,
					Position: 1,
					Start:    0,
					End:      6,
				},
				&analysis.Token{
					Term:     []byte("丁丂"),
					Type:     analysis.Double,
					Position: 2,
					Start:    3,
					End:      9,
				},
			},
		},
	}

	cache := registry.NewCache()
	for _, test := range tests {
		analyzer, err := cache.AnalyzerNamed(AnalyzerName)
		if err != nil {
			t.Fatal(err)
		}
		actual := analyzer.Analyze(test.input)
		if !reflect.DeepEqual(actual, test.output) {
			t.Errorf("expected %v, got %v", test.output, actual)
		}
	}
}
Example #17
func TestPersianAnalyzerOthers(t *testing.T) {
	tests := []struct {
		input  []byte
		output analysis.TokenStream
	}{
		// nouns
		{
			input: []byte("برگ ها"),
			output: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("برگ"),
				},
			},
		},
		{
			input: []byte("برگ‌ها"),
			output: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("برگ"),
				},
			},
		},
		// non persian
		{
			input: []byte("English test."),
			output: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("english"),
				},
				&analysis.Token{
					Term: []byte("test"),
				},
			},
		},
		// others
		{
			input: []byte("خورده مي شده بوده باشد"),
			output: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("خورده"),
				},
			},
		},
		{
			input: []byte("برگ‌ها"),
			output: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("برگ"),
				},
			},
		},
	}

	cache := registry.NewCache()
	analyzer, err := cache.AnalyzerNamed(AnalyzerName)
	if err != nil {
		t.Fatal(err)
	}
	for _, test := range tests {
		actual := analyzer.Analyze(test.input)
		if len(actual) != len(test.output) {
			t.Fatalf("expected length: %d, got %d", len(test.output), len(actual))
		}
		for i, tok := range actual {
			if !reflect.DeepEqual(tok.Term, test.output[i].Term) {
				t.Errorf("expected term %s (% x) got %s (% x)", test.output[i].Term, test.output[i].Term, tok.Term, tok.Term)
			}
		}
	}
}
Example #18
func TestPersianAnalyzerVerbsDefective(t *testing.T) {
	tests := []struct {
		input  []byte
		output analysis.TokenStream
	}{
		// active present indicative
		{
			input: []byte("مي خورد"),
			output: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("خورد"),
				},
			},
		},
		// active preterite indicative
		{
			input: []byte("خورد"),
			output: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("خورد"),
				},
			},
		},
		// active imperfective preterite indicative
		{
			input: []byte("مي خورد"),
			output: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("خورد"),
				},
			},
		},
		// active future indicative
		{
			input: []byte("خواهد خورد"),
			output: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("خورد"),
				},
			},
		},
		// active present progressive indicative
		{
			input: []byte("دارد مي خورد"),
			output: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("خورد"),
				},
			},
		},
		// active preterite progressive indicative
		{
			input: []byte("داشت مي خورد"),
			output: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("خورد"),
				},
			},
		},
		// active perfect indicative
		{
			input: []byte("خورده است"),
			output: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("خورده"),
				},
			},
		},
		// active imperfective perfect indicative
		{
			input: []byte("مي خورده است"),
			output: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("خورده"),
				},
			},
		},
		// active pluperfect indicative
		{
			input: []byte("خورده بود"),
			output: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("خورده"),
				},
			},
		},
		// active imperfective pluperfect indicative
		{
			input: []byte("مي خورده بود"),
			output: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("خورده"),
				},
			},
		},
		// active preterite subjunctive
		{
			input: []byte("خورده باشد"),
			output: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("خورده"),
				},
			},
		},
		// active imperfective preterite subjunctive
		{
			input: []byte("مي خورده باشد"),
			output: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("خورده"),
				},
			},
		},
		// active pluperfect subjunctive
		{
			input: []byte("خورده بوده باشد"),
			output: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("خورده"),
				},
			},
		},
		// active imperfective pluperfect subjunctive
		{
			input: []byte("مي خورده بوده باشد"),
			output: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("خورده"),
				},
			},
		},
		// passive present indicative
		{
			input: []byte("خورده مي شود"),
			output: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("خورده"),
				},
			},
		},
		// passive preterite indicative
		{
			input: []byte("خورده شد"),
			output: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("خورده"),
				},
			},
		},
		// passive imperfective preterite indicative
		{
			input: []byte("خورده مي شد"),
			output: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("خورده"),
				},
			},
		},
		// passive perfect indicative
		{
			input: []byte("خورده شده است"),
			output: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("خورده"),
				},
			},
		},
		// passive imperfective perfect indicative
		{
			input: []byte("خورده مي شده است"),
			output: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("خورده"),
				},
			},
		},
		// passive pluperfect indicative
		{
			input: []byte("خورده شده بود"),
			output: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("خورده"),
				},
			},
		},
		// passive imperfective pluperfect indicative
		{
			input: []byte("خورده مي شده بود"),
			output: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("خورده"),
				},
			},
		},
		// passive future indicative
		{
			input: []byte("خورده خواهد شد"),
			output: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("خورده"),
				},
			},
		},
		// passive present progressive indicative
		{
			input: []byte("دارد خورده مي شود"),
			output: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("خورده"),
				},
			},
		},
		// passive preterite progressive indicative
		{
			input: []byte("داشت خورده مي شد"),
			output: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("خورده"),
				},
			},
		},
		// passive present subjunctive
		{
			input: []byte("خورده شود"),
			output: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("خورده"),
				},
			},
		},
		// passive preterite subjunctive
		{
			input: []byte("خورده شده باشد"),
			output: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("خورده"),
				},
			},
		},
		// passive imperfective preterite subjunctive
		{
			input: []byte("خورده مي شده باشد"),
			output: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("خورده"),
				},
			},
		},
		// passive pluperfect subjunctive
		{
			input: []byte("خورده شده بوده باشد"),
			output: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("خورده"),
				},
			},
		},
		// passive imperfective pluperfect subjunctive
		{
			input: []byte("خورده مي شده بوده باشد"),
			output: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("خورده"),
				},
			},
		},
		// active present subjunctive
		{
			input: []byte("بخورد"),
			output: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("بخورد"),
				},
			},
		},
	}

	cache := registry.NewCache()
	analyzer, err := cache.AnalyzerNamed(AnalyzerName)
	if err != nil {
		t.Fatal(err)
	}
	for _, test := range tests {
		actual := analyzer.Analyze(test.input)
		if len(actual) != len(test.output) {
			t.Fatalf("expected length: %d, got %d", len(test.output), len(actual))
		}
		for i, tok := range actual {
			if !reflect.DeepEqual(tok.Term, test.output[i].Term) {
				t.Errorf("expected term %s (% x) got %s (% x)", test.output[i].Term, test.output[i].Term, tok.Term, tok.Term)
			}
		}
	}
}
Example #19
func TestExceptionsTokenizer(t *testing.T) {
	tests := []struct {
		config   map[string]interface{}
		input    []byte
		patterns []string
		result   analysis.TokenStream
	}{
		{
			input: []byte("test http://blevesearch.com/ words"),
			config: map[string]interface{}{
				"type":      "exception",
				"tokenizer": "unicode",
				"exceptions": []interface{}{
					`[hH][tT][tT][pP][sS]?://(\S)*`,
					`[fF][iI][lL][eE]://(\S)*`,
					`[fF][tT][pP]://(\S)*`,
				},
			},
			result: analysis.TokenStream{
				&analysis.Token{
					Term:     []byte("test"),
					Position: 1,
					Start:    0,
					End:      4,
				},
				&analysis.Token{
					Term:     []byte("http://blevesearch.com/"),
					Position: 2,
					Start:    5,
					End:      28,
				},
				&analysis.Token{
					Term:     []byte("words"),
					Position: 3,
					Start:    29,
					End:      34,
				},
			},
		},
		{
			input: []byte("what ftp://blevesearch.com/ songs"),
			config: map[string]interface{}{
				"type":      "exception",
				"tokenizer": "unicode",
				"exceptions": []interface{}{
					`[hH][tT][tT][pP][sS]?://(\S)*`,
					`[fF][iI][lL][eE]://(\S)*`,
					`[fF][tT][pP]://(\S)*`,
				},
			},
			result: analysis.TokenStream{
				&analysis.Token{
					Term:     []byte("what"),
					Position: 1,
					Start:    0,
					End:      4,
				},
				&analysis.Token{
					Term:     []byte("ftp://blevesearch.com/"),
					Position: 2,
					Start:    5,
					End:      27,
				},
				&analysis.Token{
					Term:     []byte("songs"),
					Position: 3,
					Start:    28,
					End:      33,
				},
			},
		},
		{
			input: []byte("please email [email protected] the URL https://blevesearch.com/"),
			config: map[string]interface{}{
				"type":      "exception",
				"tokenizer": "unicode",
				"exceptions": []interface{}{
					`[hH][tT][tT][pP][sS]?://(\S)*`,
					`[fF][iI][lL][eE]://(\S)*`,
					`[fF][tT][pP]://(\S)*`,
					`\S+@\S+`,
				},
			},
			result: analysis.TokenStream{
				&analysis.Token{
					Term:     []byte("please"),
					Position: 1,
					Start:    0,
					End:      6,
				},
				&analysis.Token{
					Term:     []byte("email"),
					Position: 2,
					Start:    7,
					End:      12,
				},
				&analysis.Token{
					Term:     []byte("*****@*****.**"),
					Position: 3,
					Start:    13,
					End:      32,
				},
				&analysis.Token{
					Term:     []byte("the"),
					Position: 4,
					Start:    33,
					End:      36,
				},
				&analysis.Token{
					Term:     []byte("URL"),
					Position: 5,
					Start:    37,
					End:      40,
				},
				&analysis.Token{
					Term:     []byte("https://blevesearch.com/"),
					Position: 6,
					Start:    41,
					End:      65,
				},
			},
		},
	}

	// remaining := unicode.NewUnicodeTokenizer()
	for _, test := range tests {

		// build the requested exception tokenizer
		cache := registry.NewCache()
		tokenizer, err := cache.DefineTokenizer("custom", test.config)
		if err != nil {
			t.Fatal(err)
		}

		// pattern := strings.Join(test.patterns, "|")
		// r, err := regexp.Compile(pattern)
		// if err != nil {
		// 	t.Fatal(err)
		// }
		// tokenizer := NewExceptionsTokenizer(r, remaining)
		actual := tokenizer.Tokenize(test.input)
		if !reflect.DeepEqual(actual, test.result) {
			t.Errorf("expected %v, got %v", test.result, actual)
		}
	}
}
Example #20
// UnmarshalJSON deserializes a JSON representation of the IndexMapping
func (im *IndexMapping) UnmarshalJSON(data []byte) error {
	var tmp struct {
		TypeMapping           map[string]*DocumentMapping `json:"types"`
		DefaultMapping        *DocumentMapping            `json:"default_mapping"`
		TypeField             string                      `json:"type_field"`
		DefaultType           string                      `json:"default_type"`
		DefaultAnalyzer       string                      `json:"default_analyzer"`
		DefaultDateTimeParser string                      `json:"default_datetime_parser"`
		DefaultField          string                      `json:"default_field"`
		ByteArrayConverter    string                      `json:"byte_array_converter"`
		CustomAnalysis        *customAnalysis             `json:"analysis"`
	}
	err := json.Unmarshal(data, &tmp)
	if err != nil {
		return err
	}

	im.cache = registry.NewCache()

	im.CustomAnalysis = newCustomAnalysis()
	if tmp.CustomAnalysis != nil {
		if tmp.CustomAnalysis.CharFilters != nil {
			im.CustomAnalysis.CharFilters = tmp.CustomAnalysis.CharFilters
		}
		if tmp.CustomAnalysis.Tokenizers != nil {
			im.CustomAnalysis.Tokenizers = tmp.CustomAnalysis.Tokenizers
		}
		if tmp.CustomAnalysis.TokenMaps != nil {
			im.CustomAnalysis.TokenMaps = tmp.CustomAnalysis.TokenMaps
		}
		if tmp.CustomAnalysis.TokenFilters != nil {
			im.CustomAnalysis.TokenFilters = tmp.CustomAnalysis.TokenFilters
		}
		if tmp.CustomAnalysis.Analyzers != nil {
			im.CustomAnalysis.Analyzers = tmp.CustomAnalysis.Analyzers
		}
		if tmp.CustomAnalysis.DateTimeParsers != nil {
			im.CustomAnalysis.DateTimeParsers = tmp.CustomAnalysis.DateTimeParsers
		}
	}

	im.TypeField = defaultTypeField
	if tmp.TypeField != "" {
		im.TypeField = tmp.TypeField
	}

	im.DefaultType = defaultType
	if tmp.DefaultType != "" {
		im.DefaultType = tmp.DefaultType
	}

	im.DefaultAnalyzer = defaultAnalyzer
	if tmp.DefaultAnalyzer != "" {
		im.DefaultAnalyzer = tmp.DefaultAnalyzer
	}

	im.DefaultDateTimeParser = defaultDateTimeParser
	if tmp.DefaultDateTimeParser != "" {
		im.DefaultDateTimeParser = tmp.DefaultDateTimeParser
	}

	im.DefaultField = defaultField
	if tmp.DefaultField != "" {
		im.DefaultField = tmp.DefaultField
	}

	im.ByteArrayConverter = defaultByteArrayConverter
	if tmp.ByteArrayConverter != "" {
		im.ByteArrayConverter = tmp.ByteArrayConverter
	}

	im.DefaultMapping = NewDocumentMapping()
	if tmp.DefaultMapping != nil {
		im.DefaultMapping = tmp.DefaultMapping
	}

	im.TypeMapping = make(map[string]*DocumentMapping, len(tmp.TypeMapping))
	for typeName, typeDocMapping := range tmp.TypeMapping {
		im.TypeMapping[typeName] = typeDocMapping
	}

	err = im.CustomAnalysis.registerAll(im)
	if err != nil {
		return err
	}

	return nil
}
Example #21
func TestEnglishAnalyzer(t *testing.T) {
	tests := []struct {
		input  []byte
		output analysis.TokenStream
	}{
		// stemming
		{
			input: []byte("books"),
			output: analysis.TokenStream{
				&analysis.Token{
					Term:     []byte("book"),
					Position: 1,
					Start:    0,
					End:      5,
				},
			},
		},
		{
			input: []byte("book"),
			output: analysis.TokenStream{
				&analysis.Token{
					Term:     []byte("book"),
					Position: 1,
					Start:    0,
					End:      4,
				},
			},
		},
		// stop word removal
		{
			input:  []byte("the"),
			output: analysis.TokenStream{},
		},
		// possessive removal
		{
			input: []byte("steven's"),
			output: analysis.TokenStream{
				&analysis.Token{
					Term:     []byte("steven"),
					Position: 1,
					Start:    0,
					End:      8,
				},
			},
		},
		{
			input: []byte("steven\u2019s"),
			output: analysis.TokenStream{
				&analysis.Token{
					Term:     []byte("steven"),
					Position: 1,
					Start:    0,
					End:      10,
				},
			},
		},
		{
			input: []byte("steven\uFF07s"),
			output: analysis.TokenStream{
				&analysis.Token{
					Term:     []byte("steven"),
					Position: 1,
					Start:    0,
					End:      10,
				},
			},
		},
	}

	cache := registry.NewCache()
	analyzer, err := cache.AnalyzerNamed(AnalyzerName)
	if err != nil {
		t.Fatal(err)
	}
	for _, test := range tests {
		actual := analyzer.Analyze(test.input)
		if !reflect.DeepEqual(actual, test.output) {
			t.Errorf("expected %v, got %v", test.output, actual)
		}
	}
}
Example #22
func TestFrenchAnalyzer(t *testing.T) {
	tests := []struct {
		input  []byte
		output analysis.TokenStream
	}{
		{
			input:  []byte(""),
			output: analysis.TokenStream{},
		},
		{
			input: []byte("chien chat cheval"),
			output: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("chien"),
				},
				&analysis.Token{
					Term: []byte("chat"),
				},
				&analysis.Token{
					Term: []byte("cheval"),
				},
			},
		},
		{
			input: []byte("chien CHAT CHEVAL"),
			output: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("chien"),
				},
				&analysis.Token{
					Term: []byte("chat"),
				},
				&analysis.Token{
					Term: []byte("cheval"),
				},
			},
		},
		{
			input: []byte("  chien  ,? + = -  CHAT /: > CHEVAL"),
			output: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("chien"),
				},
				&analysis.Token{
					Term: []byte("chat"),
				},
				&analysis.Token{
					Term: []byte("cheval"),
				},
			},
		},
		{
			input: []byte("chien++"),
			output: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("chien"),
				},
			},
		},
		{
			input: []byte("mot \"entreguillemet\""),
			output: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("mot"),
				},
				&analysis.Token{
					Term: []byte("entreguilemet"),
				},
			},
		},
		{
			input: []byte("Jean-François"),
			output: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("jean"),
				},
				&analysis.Token{
					Term: []byte("francoi"),
				},
			},
		},
		// stop words
		{
			input: []byte("le la chien les aux chat du des à cheval"),
			output: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("chien"),
				},
				&analysis.Token{
					Term: []byte("chat"),
				},
				&analysis.Token{
					Term: []byte("cheval"),
				},
			},
		},
		// nouns and adjectives
		{
			input: []byte("lances chismes habitable chiste éléments captifs"),
			output: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("lanc"),
				},
				&analysis.Token{
					Term: []byte("chism"),
				},
				&analysis.Token{
					Term: []byte("habitabl"),
				},
				&analysis.Token{
					Term: []byte("chist"),
				},
				&analysis.Token{
					Term: []byte("element"),
				},
				&analysis.Token{
					Term: []byte("captif"),
				},
			},
		},
		// verbs
		{
			input: []byte("finissions souffrirent rugissante"),
			output: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("finision"),
				},
				&analysis.Token{
					Term: []byte("soufrirent"),
				},
				&analysis.Token{
					Term: []byte("rugisant"),
				},
			},
		},
		{
			input: []byte("C3PO aujourd'hui oeuf ïâöûàä anticonstitutionnellement Java++ "),
			output: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("c3po"),
				},
				&analysis.Token{
					Term: []byte("aujourd'hui"),
				},
				&analysis.Token{
					Term: []byte("oeuf"),
				},
				&analysis.Token{
					Term: []byte("ïaöuaä"),
				},
				&analysis.Token{
					Term: []byte("anticonstitutionel"),
				},
				&analysis.Token{
					Term: []byte("java"),
				},
			},
		},
		{
			input: []byte("propriétaire"),
			output: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("proprietair"),
				},
			},
		},
	}

	cache := registry.NewCache()
	analyzer, err := cache.AnalyzerNamed(AnalyzerName)
	if err != nil {
		t.Fatal(err)
	}
	for _, test := range tests {
		actual := analyzer.Analyze(test.input)
		if len(actual) != len(test.output) {
			t.Fatalf("expected length: %d, got %d", len(test.output), len(actual))
		}
		for i, tok := range actual {
			if !reflect.DeepEqual(tok.Term, test.output[i].Term) {
				t.Errorf("expected term %s (% x) got %s (% x)", test.output[i].Term, test.output[i].Term, tok.Term, tok.Term)
			}
		}
	}
}
Example #23
func TestWeb(t *testing.T) {

	tests := []struct {
		input  []byte
		output analysis.TokenStream
	}{
		{
			[]byte("Hello [email protected]"),
			analysis.TokenStream{
				{
					Start:    0,
					End:      5,
					Term:     []byte("Hello"),
					Position: 1,
					Type:     analysis.AlphaNumeric,
				},
				{
					Start:    6,
					End:      26,
					Term:     []byte("*****@*****.**"),
					Position: 2,
					Type:     analysis.AlphaNumeric,
				},
			},
		},
		{
			[]byte("That http://blevesearch.com"),
			analysis.TokenStream{
				{
					Start:    0,
					End:      4,
					Term:     []byte("That"),
					Position: 1,
					Type:     analysis.AlphaNumeric,
				},
				{
					Start:    5,
					End:      27,
					Term:     []byte("http://blevesearch.com"),
					Position: 2,
					Type:     analysis.AlphaNumeric,
				},
			},
		},
		{
			[]byte("Hey @blevesearch"),
			analysis.TokenStream{
				{
					Start:    0,
					End:      3,
					Term:     []byte("Hey"),
					Position: 1,
					Type:     analysis.AlphaNumeric,
				},
				{
					Start:    4,
					End:      16,
					Term:     []byte("@blevesearch"),
					Position: 2,
					Type:     analysis.AlphaNumeric,
				},
			},
		},
		{
			[]byte("This #bleve"),
			analysis.TokenStream{
				{
					Start:    0,
					End:      4,
					Term:     []byte("This"),
					Position: 1,
					Type:     analysis.AlphaNumeric,
				},
				{
					Start:    5,
					End:      11,
					Term:     []byte("#bleve"),
					Position: 2,
					Type:     analysis.AlphaNumeric,
				},
			},
		},
		{
			[]byte("What about @blevesearch?"),
			analysis.TokenStream{
				{
					Start:    0,
					End:      4,
					Term:     []byte("What"),
					Position: 1,
					Type:     analysis.AlphaNumeric,
				},
				{
					Start:    5,
					End:      10,
					Term:     []byte("about"),
					Position: 2,
					Type:     analysis.AlphaNumeric,
				},
				{
					Start:    11,
					End:      23,
					Term:     []byte("@blevesearch"),
					Position: 3,
					Type:     analysis.AlphaNumeric,
				},
			},
		},
	}

	cache := registry.NewCache()
	tokenizer, err := cache.TokenizerNamed(Name)
	if err != nil {
		t.Fatal(err)
	}

	for _, test := range tests {

		actual := tokenizer.Tokenize(test.input)
		if !reflect.DeepEqual(actual, test.output) {
			t.Errorf("Expected %v, got %v for %s", test.output, actual, string(test.input))
		}
	}
}
Example #24
func TestArabicAnalyzer(t *testing.T) {
	tests := []struct {
		input  []byte
		output analysis.TokenStream
	}{
		{
			input: []byte("كبير"),
			output: analysis.TokenStream{
				&analysis.Token{
					Term:     []byte("كبير"),
					Position: 1,
					Start:    0,
					End:      8,
				},
			},
		},
		// feminine marker
		{
			input: []byte("كبيرة"),
			output: analysis.TokenStream{
				&analysis.Token{
					Term:     []byte("كبير"),
					Position: 1,
					Start:    0,
					End:      10,
				},
			},
		},
		{
			input: []byte("مشروب"),
			output: analysis.TokenStream{
				&analysis.Token{
					Term:     []byte("مشروب"),
					Position: 1,
					Start:    0,
					End:      10,
				},
			},
		},
		// plural -at
		{
			input: []byte("مشروبات"),
			output: analysis.TokenStream{
				&analysis.Token{
					Term:     []byte("مشروب"),
					Position: 1,
					Start:    0,
					End:      14,
				},
			},
		},
		// plural -in
		{
			input: []byte("أمريكيين"),
			output: analysis.TokenStream{
				&analysis.Token{
					Term:     []byte("امريك"),
					Position: 1,
					Start:    0,
					End:      16,
				},
			},
		},
		// singular with bare alif
		{
			input: []byte("امريكي"),
			output: analysis.TokenStream{
				&analysis.Token{
					Term:     []byte("امريك"),
					Position: 1,
					Start:    0,
					End:      12,
				},
			},
		},
		{
			input: []byte("كتاب"),
			output: analysis.TokenStream{
				&analysis.Token{
					Term:     []byte("كتاب"),
					Position: 1,
					Start:    0,
					End:      8,
				},
			},
		},
		// definite article
		{
			input: []byte("الكتاب"),
			output: analysis.TokenStream{
				&analysis.Token{
					Term:     []byte("كتاب"),
					Position: 1,
					Start:    0,
					End:      12,
				},
			},
		},
		{
			input: []byte("ما ملكت أيمانكم"),
			output: analysis.TokenStream{
				&analysis.Token{
					Term:     []byte("ملكت"),
					Position: 2,
					Start:    5,
					End:      13,
				},
				&analysis.Token{
					Term:     []byte("ايمانكم"),
					Position: 3,
					Start:    14,
					End:      28,
				},
			},
		},
		// stopwords
		{
			input: []byte("الذين ملكت أيمانكم"),
			output: analysis.TokenStream{
				&analysis.Token{
					Term:     []byte("ملكت"),
					Position: 2,
					Start:    11,
					End:      19,
				},
				&analysis.Token{
					Term:     []byte("ايمانكم"),
					Position: 3,
					Start:    20,
					End:      34,
				},
			},
		},
		// presentation form normalization
		{
			input: []byte("ﺍﻟﺴﻼﻢ"),
			output: analysis.TokenStream{
				&analysis.Token{
					Term:     []byte("سلام"),
					Position: 1,
					Start:    0,
					End:      15,
				},
			},
		},
	}

	cache := registry.NewCache()
	analyzer, err := cache.AnalyzerNamed(AnalyzerName)
	if err != nil {
		t.Fatal(err)
	}
	for _, test := range tests {
		actual := analyzer.Analyze(test.input)
		if !reflect.DeepEqual(actual, test.output) {
			t.Errorf("expected %v, got %v", test.output, actual)
			t.Errorf("expected % x, got % x", test.output[0].Term, actual[0].Term)
		}
	}
}
Example #25
func TestItalianAnalyzer(t *testing.T) {
	tests := []struct {
		input  []byte
		output analysis.TokenStream
	}{
		// stemming
		{
			input: []byte("abbandonata"),
			output: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("abbandonat"),
				},
			},
		},
		{
			input: []byte("abbandonati"),
			output: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("abbandonat"),
				},
			},
		},
		// stop word
		{
			input:  []byte("dallo"),
			output: analysis.TokenStream{},
		},
		// contractions
		{
			input: []byte("dell'Italia"),
			output: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("ital"),
				},
			},
		},
		{
			input: []byte("l'Italiano"),
			output: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("italian"),
				},
			},
		},
		// test for bug #218
		{
			input: []byte("Nell'anfora"),
			output: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("anfor"),
				},
			},
		},
	}

	cache := registry.NewCache()
	analyzer, err := cache.AnalyzerNamed(AnalyzerName)
	if err != nil {
		t.Fatal(err)
	}
	for _, test := range tests {
		actual := analyzer.Analyze(test.input)
		if len(actual) != len(test.output) {
			t.Fatalf("expected length: %d, got %d", len(test.output), len(actual))
		}
		for i, tok := range actual {
			if !reflect.DeepEqual(tok.Term, test.output[i].Term) {
				t.Errorf("expected term %s (% x) got %s (% x)", test.output[i].Term, test.output[i].Term, tok.Term, tok.Term)
			}
		}
	}
}
Example #26
func TestPortugueseLightStemmer(t *testing.T) {
	tests := []struct {
		input  analysis.TokenStream
		output analysis.TokenStream
	}{
		{
			input: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("doutores"),
				},
			},
			output: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("doutor"),
				},
			},
		},
		{
			input: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("doutor"),
				},
			},
			output: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("doutor"),
				},
			},
		},
		{
			input: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("homens"),
				},
			},
			output: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("homem"),
				},
			},
		},
		{
			input: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("homem"),
				},
			},
			output: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("homem"),
				},
			},
		},
		{
			input: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("papéis"),
				},
			},
			output: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("papel"),
				},
			},
		},
		{
			input: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("papel"),
				},
			},
			output: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("papel"),
				},
			},
		},
		{
			input: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("normais"),
				},
			},
			output: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("normal"),
				},
			},
		},
		{
			input: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("normal"),
				},
			},
			output: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("normal"),
				},
			},
		},
		{
			input: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("lencóis"),
				},
			},
			output: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("lencol"),
				},
			},
		},
		{
			input: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("lencol"),
				},
			},
			output: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("lencol"),
				},
			},
		},
		{
			input: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("barris"),
				},
			},
			output: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("barril"),
				},
			},
		},
		{
			input: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("barril"),
				},
			},
			output: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("barril"),
				},
			},
		},
		{
			input: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("botões"),
				},
			},
			output: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("bota"),
				},
			},
		},
		{
			input: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("botão"),
				},
			},
			output: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("bota"),
				},
			},
		},
		// longer
		{
			input: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("o"),
				},
				&analysis.Token{
					Term: []byte("debate"),
				},
				&analysis.Token{
					Term: []byte("político"),
				},
				&analysis.Token{
					Term: []byte("pelo"),
				},
				&analysis.Token{
					Term: []byte("menos"),
				},
				&analysis.Token{
					Term: []byte("o"),
				},
				&analysis.Token{
					Term: []byte("que"),
				},
				&analysis.Token{
					Term: []byte("vem"),
				},
				&analysis.Token{
					Term: []byte("a"),
				},
				&analysis.Token{
					Term: []byte("público"),
				},
				&analysis.Token{
					Term: []byte("parece"),
				},
				&analysis.Token{
					Term: []byte("de"),
				},
				&analysis.Token{
					Term: []byte("modo"),
				},
				&analysis.Token{
					Term: []byte("nada"),
				},
				&analysis.Token{
					Term: []byte("surpreendente"),
				},
				&analysis.Token{
					Term: []byte("restrito"),
				},
				&analysis.Token{
					Term: []byte("a"),
				},
				&analysis.Token{
					Term: []byte("temas"),
				},
				&analysis.Token{
					Term: []byte("menores"),
				},
				&analysis.Token{
					Term: []byte("mas"),
				},
				&analysis.Token{
					Term: []byte("há"),
				},
				&analysis.Token{
					Term: []byte("evidentemente"),
				},
				&analysis.Token{
					Term: []byte("grandes"),
				},
				&analysis.Token{
					Term: []byte("questões"),
				},
				&analysis.Token{
					Term: []byte("em"),
				},
				&analysis.Token{
					Term: []byte("jogo"),
				},
				&analysis.Token{
					Term: []byte("nas"),
				},
				&analysis.Token{
					Term: []byte("eleições"),
				},
				&analysis.Token{
					Term: []byte("que"),
				},
				&analysis.Token{
					Term: []byte("se"),
				},
				&analysis.Token{
					Term: []byte("aproximam"),
				},
			},
			output: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("o"),
				},
				&analysis.Token{
					Term: []byte("debat"),
				},
				&analysis.Token{
					Term: []byte("politic"),
				},
				&analysis.Token{
					Term: []byte("pelo"),
				},
				&analysis.Token{
					Term: []byte("meno"),
				},
				&analysis.Token{
					Term: []byte("o"),
				},
				&analysis.Token{
					Term: []byte("que"),
				},
				&analysis.Token{
					Term: []byte("vem"),
				},
				&analysis.Token{
					Term: []byte("a"),
				},
				&analysis.Token{
					Term: []byte("public"),
				},
				&analysis.Token{
					Term: []byte("parec"),
				},
				&analysis.Token{
					Term: []byte("de"),
				},
				&analysis.Token{
					Term: []byte("modo"),
				},
				&analysis.Token{
					Term: []byte("nada"),
				},
				&analysis.Token{
					Term: []byte("surpreendent"),
				},
				&analysis.Token{
					Term: []byte("restrit"),
				},
				&analysis.Token{
					Term: []byte("a"),
				},
				&analysis.Token{
					Term: []byte("tema"),
				},
				&analysis.Token{
					Term: []byte("menor"),
				},
				&analysis.Token{
					Term: []byte("mas"),
				},
				&analysis.Token{
					Term: []byte("há"),
				},
				&analysis.Token{
					Term: []byte("evident"),
				},
				&analysis.Token{
					Term: []byte("grand"),
				},
				&analysis.Token{
					Term: []byte("questa"),
				},
				&analysis.Token{
					Term: []byte("em"),
				},
				&analysis.Token{
					Term: []byte("jogo"),
				},
				&analysis.Token{
					Term: []byte("nas"),
				},
				&analysis.Token{
					Term: []byte("eleica"),
				},
				&analysis.Token{
					Term: []byte("que"),
				},
				&analysis.Token{
					Term: []byte("se"),
				},
				&analysis.Token{
					Term: []byte("aproximam"),
				},
			},
		},
	}

	cache := registry.NewCache()
	filter, err := cache.TokenFilterNamed(LightStemmerName)
	if err != nil {
		t.Fatal(err)
	}
	for _, test := range tests {
		actual := filter.Filter(test.input)
		if !reflect.DeepEqual(actual, test.output) {
			t.Errorf("expected %s, got %s", test.output[0].Term, actual[0].Term)
		}
	}
}
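
// TestEnglishPossessiveFilter checks that a trailing possessive suffix is
// stripped for each apostrophe code point the filter recognizes, while a
// bare single-letter term is left untouched.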
func TestEnglishPossessiveFilter(t *testing.T) {
	tests := []struct {
		input  analysis.TokenStream
		output analysis.TokenStream
	}{
		{
			input: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("marty's"),
				},
				&analysis.Token{
					Term: []byte("MARTY'S"),
				},
				&analysis.Token{
					Term: []byte("marty’s"),
				},
				&analysis.Token{
					Term: []byte("MARTY’S"),
				},
				&analysis.Token{
					Term: []byte("marty's"),
				},
				&analysis.Token{
					Term: []byte("MARTY'S"),
				},
				&analysis.Token{
					Term: []byte("m"),
				},
			},
			output: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("marty"),
				},
				&analysis.Token{
					Term: []byte("MARTY"),
				},
				&analysis.Token{
					Term: []byte("marty"),
				},
				&analysis.Token{
					Term: []byte("MARTY"),
				},
				&analysis.Token{
					Term: []byte("marty"),
				},
				&analysis.Token{
					Term: []byte("MARTY"),
				},
				&analysis.Token{
					Term: []byte("m"),
				},
			},
		},
	}

	cache := registry.NewCache()
	possessiveFilter, err := cache.TokenFilterNamed(PossessiveName)
	if err != nil {
		t.Fatal(err)
	}
	for _, test := range tests {
		actual := possessiveFilter.Filter(test.input)
		if !reflect.DeepEqual(actual, test.output) {
			t.Errorf("expected %s, got %s", test.output, actual)
		}
	}
}
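
A minimal sketch of the same filter outside a test, assuming the registry and analysis packages already imported by the tests above (PossessiveName is the constant the test relies on; the helper below is illustrative only):

func stripPossessives(terms []string) analysis.TokenStream {
	// look up the possessive filter by its registered name
	cache := registry.NewCache()
	possessiveFilter, err := cache.TokenFilterNamed(PossessiveName)
	if err != nil {
		panic(err) // the filter must be registered before it can be resolved
	}
	stream := analysis.TokenStream{}
	for _, term := range terms {
		stream = append(stream, &analysis.Token{Term: []byte(term)})
	}
	// tokens are rewritten in place: "marty's" becomes "marty"
	return possessiveFilter.Filter(stream)
}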
Example #28
func newConfiguration() *configuration {
	return &configuration{
		Cache:         registry.NewCache(),
		analysisQueue: index.NewAnalysisQueue(4),
	}
}
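
newConfiguration wires a shared registry cache to a four-worker analysis queue. A rough sketch of how such a cache is consumed, reusing only the registry calls shown in these examples (the analyzer name and the Analyze signature are assumptions mirrored from the surrounding usage):

func analyzeWithFreshCache(input []byte) (analysis.TokenStream, error) {
	// in a real configuration the cache held by newConfiguration would be reused
	cache := registry.NewCache()
	analyzer, err := cache.AnalyzerNamed("standard")
	if err != nil {
		return nil, err
	}
	// runs the analyzer's char filter / tokenizer / token filter chain
	return analyzer.Analyze(input), nil
}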
Example #29
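// TestFrenchLightStemmer checks that inflected French forms and their base
// forms reduce to the same light stem: plurals in -x/-s, agent and adjective
// suffixes, adverbs in -ment, plus the SOLR-3463 cases guarding repeated
// characters in numbers.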
func TestFrenchLightStemmer(t *testing.T) {
	tests := []struct {
		input  analysis.TokenStream
		output analysis.TokenStream
	}{
		{
			input: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("chevaux"),
				},
			},
			output: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("cheval"),
				},
			},
		},
		{
			input: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("cheval"),
				},
			},
			output: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("cheval"),
				},
			},
		},
		{
			input: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("hiboux"),
				},
			},
			output: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("hibou"),
				},
			},
		},
		{
			input: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("hibou"),
				},
			},
			output: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("hibou"),
				},
			},
		},
		{
			input: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("chantés"),
				},
			},
			output: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("chant"),
				},
			},
		},
		{
			input: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("chanter"),
				},
			},
			output: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("chant"),
				},
			},
		},
		{
			input: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("chante"),
				},
			},
			output: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("chant"),
				},
			},
		},
		{
			input: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("chant"),
				},
			},
			output: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("chant"),
				},
			},
		},
		{
			input: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("baronnes"),
				},
			},
			output: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("baron"),
				},
			},
		},
		{
			input: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("barons"),
				},
			},
			output: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("baron"),
				},
			},
		},
		{
			input: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("baron"),
				},
			},
			output: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("baron"),
				},
			},
		},
		{
			input: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("peaux"),
				},
			},
			output: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("peau"),
				},
			},
		},
		{
			input: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("peau"),
				},
			},
			output: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("peau"),
				},
			},
		},
		{
			input: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("anneaux"),
				},
			},
			output: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("aneau"),
				},
			},
		},
		{
			input: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("anneau"),
				},
			},
			output: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("aneau"),
				},
			},
		},
		{
			input: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("neveux"),
				},
			},
			output: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("neveu"),
				},
			},
		},
		{
			input: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("neveu"),
				},
			},
			output: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("neveu"),
				},
			},
		},
		{
			input: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("affreux"),
				},
			},
			output: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("afreu"),
				},
			},
		},
		{
			input: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("affreuse"),
				},
			},
			output: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("afreu"),
				},
			},
		},
		{
			input: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("investissement"),
				},
			},
			output: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("investi"),
				},
			},
		},
		{
			input: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("investir"),
				},
			},
			output: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("investi"),
				},
			},
		},
		{
			input: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("assourdissant"),
				},
			},
			output: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("asourdi"),
				},
			},
		},
		{
			input: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("assourdir"),
				},
			},
			output: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("asourdi"),
				},
			},
		},
		{
			input: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("pratiquement"),
				},
			},
			output: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("pratiqu"),
				},
			},
		},
		{
			input: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("pratique"),
				},
			},
			output: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("pratiqu"),
				},
			},
		},
		{
			input: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("administrativement"),
				},
			},
			output: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("administratif"),
				},
			},
		},
		{
			input: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("administratif"),
				},
			},
			output: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("administratif"),
				},
			},
		},
		{
			input: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("justificatrice"),
				},
			},
			output: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("justifi"),
				},
			},
		},
		{
			input: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("justificateur"),
				},
			},
			output: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("justifi"),
				},
			},
		},
		{
			input: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("justifier"),
				},
			},
			output: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("justifi"),
				},
			},
		},
		{
			input: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("educatrice"),
				},
			},
			output: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("eduqu"),
				},
			},
		},
		{
			input: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("eduquer"),
				},
			},
			output: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("eduqu"),
				},
			},
		},
		{
			input: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("communicateur"),
				},
			},
			output: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("comuniqu"),
				},
			},
		},
		{
			input: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("communiquer"),
				},
			},
			output: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("comuniqu"),
				},
			},
		},
		{
			input: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("accompagnatrice"),
				},
			},
			output: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("acompagn"),
				},
			},
		},
		{
			input: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("accompagnateur"),
				},
			},
			output: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("acompagn"),
				},
			},
		},
		{
			input: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("administrateur"),
				},
			},
			output: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("administr"),
				},
			},
		},
		{
			input: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("administrer"),
				},
			},
			output: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("administr"),
				},
			},
		},
		{
			input: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("productrice"),
				},
			},
			output: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("product"),
				},
			},
		},
		{
			input: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("producteur"),
				},
			},
			output: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("product"),
				},
			},
		},
		{
			input: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("acheteuse"),
				},
			},
			output: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("achet"),
				},
			},
		},
		{
			input: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("acheteur"),
				},
			},
			output: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("achet"),
				},
			},
		},
		{
			input: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("planteur"),
				},
			},
			output: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("plant"),
				},
			},
		},
		{
			input: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("plante"),
				},
			},
			output: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("plant"),
				},
			},
		},
		{
			input: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("poreuse"),
				},
			},
			output: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("poreu"),
				},
			},
		},
		{
			input: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("poreux"),
				},
			},
			output: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("poreu"),
				},
			},
		},
		{
			input: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("plieuse"),
				},
			},
			output: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("plieu"),
				},
			},
		},
		{
			input: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("bijoutière"),
				},
			},
			output: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("bijouti"),
				},
			},
		},
		{
			input: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("bijoutier"),
				},
			},
			output: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("bijouti"),
				},
			},
		},
		{
			input: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("caissière"),
				},
			},
			output: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("caisi"),
				},
			},
		},
		{
			input: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("caissier"),
				},
			},
			output: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("caisi"),
				},
			},
		},
		{
			input: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("abrasive"),
				},
			},
			output: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("abrasif"),
				},
			},
		},
		{
			input: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("abrasif"),
				},
			},
			output: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("abrasif"),
				},
			},
		},
		{
			input: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("folle"),
				},
			},
			output: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("fou"),
				},
			},
		},
		{
			input: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("fou"),
				},
			},
			output: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("fou"),
				},
			},
		},
		{
			input: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("personnelle"),
				},
			},
			output: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("person"),
				},
			},
		},
		{
			input: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("personne"),
				},
			},
			output: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("person"),
				},
			},
		},
		// algo bug: too short length
		// {
		// 	input: analysis.TokenStream{
		// 		&analysis.Token{
		// 			Term: []byte("personnel"),
		// 		},
		// 	},
		// 	output: analysis.TokenStream{
		// 		&analysis.Token{
		// 			Term: []byte("person"),
		// 		},
		// 	},
		// },
		{
			input: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("complète"),
				},
			},
			output: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("complet"),
				},
			},
		},
		{
			input: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("complet"),
				},
			},
			output: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("complet"),
				},
			},
		},
		{
			input: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("aromatique"),
				},
			},
			output: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("aromat"),
				},
			},
		},
		{
			input: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("faiblesse"),
				},
			},
			output: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("faibl"),
				},
			},
		},
		{
			input: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("faible"),
				},
			},
			output: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("faibl"),
				},
			},
		},
		{
			input: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("patinage"),
				},
			},
			output: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("patin"),
				},
			},
		},
		{
			input: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("patin"),
				},
			},
			output: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("patin"),
				},
			},
		},
		{
			input: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("sonorisation"),
				},
			},
			output: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("sono"),
				},
			},
		},
		{
			input: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("ritualisation"),
				},
			},
			output: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("rituel"),
				},
			},
		},
		{
			input: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("rituel"),
				},
			},
			output: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("rituel"),
				},
			},
		},
		// algo bug: masked by rules above
		// {
		// 	input: analysis.TokenStream{
		// 		&analysis.Token{
		// 			Term: []byte("colonisateur"),
		// 		},
		// 	},
		// 	output: analysis.TokenStream{
		// 		&analysis.Token{
		// 			Term: []byte("colon"),
		// 		},
		// 	},
		// },
		{
			input: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("nomination"),
				},
			},
			output: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("nomin"),
				},
			},
		},
		{
			input: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("disposition"),
				},
			},
			output: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("dispos"),
				},
			},
		},
		{
			input: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("dispose"),
				},
			},
			output: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("dispos"),
				},
			},
		},
		// SOLR-3463: repeated characters in numbers must not be compressed.
		// Trailing repeated char is not elided in numbers:
		{
			input: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("1234555"),
				},
			},
			output: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("1234555"),
				},
			},
		},
		// Repeated char within numbers with more than 4 characters:
		{
			input: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("12333345"),
				},
			},
			output: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("12333345"),
				},
			},
		},
		// Short numbers were already unaffected:
		{
			input: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("1234"),
				},
			},
			output: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("1234"),
				},
			},
		},
		// Ensure behaviour is preserved for words!
		// Trailing repeated char elision:
		{
			input: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("abcdeff"),
				},
			},
			output: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("abcdef"),
				},
			},
		},
		// Repeated char within words with more than 4 characters:
		{
			input: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("abcccddeef"),
				},
			},
			output: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("abcdef"),
				},
			},
		},
		{
			input: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("créées"),
				},
			},
			output: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("cre"),
				},
			},
		},
		// Combined letter and digit repetition
		// 10:00pm
		{
			input: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("22hh00"),
				},
			},
			output: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("22h00"),
				},
			},
		},
		// bug #214
		{
			input: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("propriétaire"),
				},
			},
			output: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("proprietair"),
				},
			},
		},
	}

	cache := registry.NewCache()
	filter, err := cache.TokenFilterNamed(LightStemmerName)
	if err != nil {
		t.Fatal(err)
	}
	for _, test := range tests {
		actual := filter.Filter(test.input)
		if !reflect.DeepEqual(actual, test.output) {
			t.Errorf("expected %s, got %s", test.output[0].Term, actual[0].Term)
		}
	}
}
Example #30
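// Despite its name, this test exercises the dictionary compound token filter:
// "softball" is kept and the dictionary subwords "soft" and "ball" are emitted
// as extra tokens at the same position.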
func TestStopWordsFilter(t *testing.T) {

	inputTokenStream := analysis.TokenStream{
		&analysis.Token{
			Term:     []byte("i"),
			Start:    0,
			End:      1,
			Position: 1,
		},
		&analysis.Token{
			Term:     []byte("like"),
			Start:    2,
			End:      6,
			Position: 2,
		},
		&analysis.Token{
			Term:     []byte("to"),
			Start:    7,
			End:      9,
			Position: 3,
		},
		&analysis.Token{
			Term:     []byte("play"),
			Start:    10,
			End:      14,
			Position: 4,
		},
		&analysis.Token{
			Term:     []byte("softball"),
			Start:    15,
			End:      23,
			Position: 5,
		},
	}

	expectedTokenStream := analysis.TokenStream{
		&analysis.Token{
			Term:     []byte("i"),
			Start:    0,
			End:      1,
			Position: 1,
		},
		&analysis.Token{
			Term:     []byte("like"),
			Start:    2,
			End:      6,
			Position: 2,
		},
		&analysis.Token{
			Term:     []byte("to"),
			Start:    7,
			End:      9,
			Position: 3,
		},
		&analysis.Token{
			Term:     []byte("play"),
			Start:    10,
			End:      14,
			Position: 4,
		},
		&analysis.Token{
			Term:     []byte("softball"),
			Start:    15,
			End:      23,
			Position: 5,
		},
		&analysis.Token{
			Term:     []byte("soft"),
			Start:    15,
			End:      19,
			Position: 5,
		},
		&analysis.Token{
			Term:     []byte("ball"),
			Start:    19,
			End:      23,
			Position: 5,
		},
	}

	cache := registry.NewCache()
	dictListConfig := map[string]interface{}{
		"type":   token_map.Name,
		"tokens": []interface{}{"factor", "soft", "ball", "team"},
	}
	_, err := cache.DefineTokenMap("dict_test", dictListConfig)
	if err != nil {
		t.Fatal(err)
	}

	dictConfig := map[string]interface{}{
		"type":           "dict_compound",
		"dict_token_map": "dict_test",
	}
	dictFilter, err := cache.DefineTokenFilter("dict_test", dictConfig)
	if err != nil {
		t.Fatal(err)
	}

	outputTokenStream := dictFilter.Filter(inputTokenStream)
	if !reflect.DeepEqual(outputTokenStream, expectedTokenStream) {
		t.Errorf("expected %#v got %#v", expectedTokenStream, outputTokenStream)
	}
}
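
Compound splitting is normally just one stage in a longer filter chain. A sketch of chaining two filters by hand, under the assumption that a lowercase filter is registered under the name "to_lower" (substitute whatever name your registry actually defines; the helper is illustrative only):

func lowercaseThenSplit(term string) analysis.TokenStream {
	cache := registry.NewCache()
	// "to_lower" is an assumed registration name for a lowercase token filter
	lowercase, err := cache.TokenFilterNamed("to_lower")
	if err != nil {
		panic(err)
	}
	stream := analysis.TokenStream{&analysis.Token{Term: []byte(term)}}
	stream = lowercase.Filter(stream)
	// a compound filter defined as in the test above would then split the
	// lowercased token into its dictionary subwords before indexing
	return stream
}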