Esempio n. 1
0
// CommonBenchmarkIndexBatch measures batched indexing against a KV store.
//
// For each of the b.N iterations it obtains a store from create, opens an
// UpsideDownCouch index on it using an analysis queue with analysisWorkers
// workers, and indexes 1000 documents in batches of batchSize. Only document
// construction and the idx.Batch calls are timed; store creation, index
// open/close, destroy and queue shutdown run with the timer stopped.
//
// batchSize must be positive; the benchmark fails immediately otherwise.
func CommonBenchmarkIndexBatch(b *testing.B, create KVStoreCreate, destroy KVStoreDestroy, analysisWorkers, batchSize int) {
	// j%batchSize below panics with an integer divide by zero when batchSize
	// is 0, and a negative size is meaningless, so reject both up front.
	if batchSize <= 0 {
		b.Fatalf("batchSize must be positive, got %d", batchSize)
	}

	cache := registry.NewCache()
	analyzer, err := cache.AnalyzerNamed("standard")
	if err != nil {
		b.Fatal(err)
	}

	// Discard setup time and keep the timer stopped until indexing starts.
	b.ResetTimer()
	b.StopTimer()
	for i := 0; i < b.N; i++ {

		s, err := create()
		if err != nil {
			b.Fatal(err)
		}
		analysisQueue := index.NewAnalysisQueue(analysisWorkers)
		idx := NewUpsideDownCouch(s, analysisQueue)

		err = idx.Open()
		if err != nil {
			b.Fatal(err)
		}

		b.StartTimer()
		batch := index.NewBatch()
		for j := 0; j < 1000; j++ {
			// Every batchSize documents, flush the current batch (if it
			// holds anything) and start a new one.
			if j%batchSize == 0 {
				if len(batch.IndexOps) > 0 {
					err := idx.Batch(batch)
					if err != nil {
						b.Fatal(err)
					}
				}
				batch = index.NewBatch()
			}
			indexDocument := document.NewDocument("").
				AddField(document.NewTextFieldWithAnalyzer("body", []uint64{}, []byte(benchmarkDocBodies[j%10]), analyzer))
			// IDs are unique across iterations and documents.
			indexDocument.ID = strconv.Itoa(i) + "-" + strconv.Itoa(j)
			batch.Update(indexDocument)
		}
		// close last batch
		if len(batch.IndexOps) > 0 {
			err := idx.Batch(batch)
			if err != nil {
				b.Fatal(err)
			}
		}
		b.StopTimer()
		err = idx.Close()
		if err != nil {
			b.Fatal(err)
		}
		err = destroy()
		if err != nil {
			b.Fatal(err)
		}
		analysisQueue.Close()
	}
}
Esempio n. 2
0
// BenchmarkBatch times repeated execution of one pre-built batch of 100
// documents against an UpsideDownCouch index created with null.Name as its
// store. Building the analyzer, index and batch is excluded from the timing.
func BenchmarkBatch(b *testing.B) {
	cache := registry.NewCache()
	standard, err := cache.AnalyzerNamed(standard_analyzer.Name)
	if err != nil {
		b.Fatal(err)
	}

	queue := index.NewAnalysisQueue(1)
	idx, err := NewUpsideDownCouch(null.Name, nil, queue)
	if err != nil {
		b.Fatal(err)
	}
	if err = idx.Open(); err != nil {
		b.Fatal(err)
	}

	// Assemble the batch once; every iteration re-submits the same batch.
	docs := index.NewBatch()
	for n := 0; n < 100; n++ {
		doc := document.NewDocument(strconv.Itoa(n))
		doc.AddField(document.NewTextFieldWithAnalyzer("desc", nil, bleveWikiArticle1K, standard))
		docs.Update(doc)
	}

	b.ResetTimer()

	for n := 0; n < b.N; n++ {
		if err = idx.Batch(docs); err != nil {
			b.Fatal(err)
		}
	}
}
Esempio n. 3
0
// TestElisionFilter defines an elision filter whose article list contains
// only "ar" and expects "ar<apostrophe>word" to be reduced to "word", for both
// the Apostrophe and the RightSingleQuotationMark characters.
func TestElisionFilter(t *testing.T) {

	tests := []struct {
		input  analysis.TokenStream
		output analysis.TokenStream
	}{
		{
			input: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("ar" + string(Apostrophe) + "word"),
				},
			},
			output: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("word"),
				},
			},
		},
		{
			input: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("ar" + string(RightSingleQuotationMark) + "word"),
				},
			},
			output: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("word"),
				},
			},
		},
	}

	cache := registry.NewCache()

	// Register the article list the elision filter refers to by name.
	articleListConfig := map[string]interface{}{
		"type":   token_map.Name,
		"tokens": []interface{}{"ar"},
	}
	_, err := cache.DefineTokenMap("articles_test", articleListConfig)
	if err != nil {
		t.Fatal(err)
	}

	elisionConfig := map[string]interface{}{
		"type":               "elision",
		"articles_token_map": "articles_test",
	}
	elisionFilter, err := cache.DefineTokenFilter("elision_test", elisionConfig)
	if err != nil {
		t.Fatal(err)
	}

	for _, test := range tests {

		actual := elisionFilter.Filter(test.input)
		if reflect.DeepEqual(actual, test.output) {
			continue
		}
		// Indexing [0] is only safe when both streams hold a token; report
		// the whole streams otherwise instead of panicking.
		if len(actual) == 0 || len(test.output) == 0 {
			t.Errorf("expected %v, got %v", test.output, actual)
			continue
		}
		t.Errorf("expected %s, got %s", test.output[0].Term, actual[0].Term)
	}
}
Esempio n. 4
0
// TestItalianElision looks up the token filter registered under ElisionName
// and expects it to turn "dell'Italia" into "Italia".
func TestItalianElision(t *testing.T) {
	tests := []struct {
		input  analysis.TokenStream
		output analysis.TokenStream
	}{
		{
			input: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("dell'Italia"),
				},
			},
			output: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("Italia"),
				},
			},
		},
	}

	cache := registry.NewCache()
	elisionFilter, err := cache.TokenFilterNamed(ElisionName)
	if err != nil {
		t.Fatal(err)
	}
	for _, test := range tests {
		actual := elisionFilter.Filter(test.input)
		if reflect.DeepEqual(actual, test.output) {
			continue
		}
		// Indexing [0] is only safe when both streams hold a token; report
		// the whole streams otherwise instead of panicking.
		if len(actual) == 0 || len(test.output) == 0 {
			t.Errorf("expected %v, got %v", test.output, actual)
			continue
		}
		t.Errorf("expected %s, got %s", test.output[0].Term, actual[0].Term)
	}
}
Esempio n. 5
0
// BenchmarkAnalyze times idx.Analyze on a single document whose "desc" field
// holds bleveWikiArticle1K, using a Firestorm index created with null.Name.
// Each analysis result is expected to contain 92 or 93 rows.
func BenchmarkAnalyze(b *testing.B) {

	cache := registry.NewCache()
	analyzer, err := cache.AnalyzerNamed(standard_analyzer.Name)
	if err != nil {
		b.Fatal(err)
	}

	analysisQueue := index.NewAnalysisQueue(1)
	idx, err := NewFirestorm(null.Name, nil, analysisQueue)
	if err != nil {
		b.Fatal(err)
	}

	d := document.NewDocument("1")
	f := document.NewTextFieldWithAnalyzer("desc", nil, bleveWikiArticle1K, analyzer)
	d.AddField(f)

	// Exclude analyzer, index and document construction from the timing.
	b.ResetTimer()

	for i := 0; i < b.N; i++ {
		rv := idx.Analyze(d)
		if len(rv.Rows) < 92 || len(rv.Rows) > 93 {
			// The message states the range that is actually checked.
			b.Fatalf("expected 92-93 rows, got %d", len(rv.Rows))
		}
	}
}
Esempio n. 6
0
// TestStopWordsFilterLongestMatch runs a "dict_compound" token filter with
// only_longest_match enabled over the token "softestball" and expects the
// original token followed by the sub-words "softest" and "ball".
func TestStopWordsFilterLongestMatch(t *testing.T) {
	// tok builds a token with the given term, offsets and position.
	tok := func(term string, start, end, position int) *analysis.Token {
		return &analysis.Token{
			Term:     []byte(term),
			Start:    start,
			End:      end,
			Position: position,
		}
	}

	input := analysis.TokenStream{
		tok("softestball", 0, 11, 1),
	}
	want := analysis.TokenStream{
		tok("softestball", 0, 11, 1),
		tok("softest", 0, 7, 1),
		tok("ball", 7, 11, 1),
	}

	cache := registry.NewCache()

	// The dictionary contains both "soft" and "softest".
	_, err := cache.DefineTokenMap("dict_test", map[string]interface{}{
		"type":   token_map.Name,
		"tokens": []interface{}{"soft", "softest", "ball"},
	})
	if err != nil {
		t.Fatal(err)
	}

	dictFilter, err := cache.DefineTokenFilter("dict_test", map[string]interface{}{
		"type":               "dict_compound",
		"dict_token_map":     "dict_test",
		"only_longest_match": true,
	})
	if err != nil {
		t.Fatal(err)
	}

	if got := dictFilter.Filter(input); !reflect.DeepEqual(got, want) {
		t.Errorf("expected %#v got %#v", want, got)
	}
}
Esempio n. 7
0
// BenchmarkCJKAnalyzer times the analyzer registered under AnalyzerName on
// bleveWikiArticleJapanese.
func BenchmarkCJKAnalyzer(b *testing.B) {
	cache := registry.NewCache()
	analyzer, err := cache.AnalyzerNamed(AnalyzerName)
	if err != nil {
		b.Fatal(err)
	}

	// Exclude cache creation and analyzer lookup from the measurement.
	b.ResetTimer()

	for i := 0; i < b.N; i++ {
		analyzer.Analyze(bleveWikiArticleJapanese)
	}
}
Esempio n. 8
0
// TestSoraniAnalyzer runs the analyzer registered under AnalyzerName over a
// few Sorani inputs and compares the full token streams (term, position and
// offsets) with the expected ones.
func TestSoraniAnalyzer(t *testing.T) {
	// tok builds an expected token.
	tok := func(term string, position, start, end int) *analysis.Token {
		return &analysis.Token{
			Term:     []byte(term),
			Position: position,
			Start:    start,
			End:      end,
		}
	}

	cases := []struct {
		input  []byte
		output analysis.TokenStream
	}{
		// stop word removal
		{
			input:  []byte("ئەم پیاوە"),
			output: analysis.TokenStream{tok("پیاو", 2, 7, 17)},
		},
		{
			input:  []byte("پیاوە"),
			output: analysis.TokenStream{tok("پیاو", 1, 0, 10)},
		},
		{
			input:  []byte("پیاو"),
			output: analysis.TokenStream{tok("پیاو", 1, 0, 8)},
		},
	}

	cache := registry.NewCache()
	analyzer, err := cache.AnalyzerNamed(AnalyzerName)
	if err != nil {
		t.Fatal(err)
	}
	for _, tc := range cases {
		if got := analyzer.Analyze(tc.input); !reflect.DeepEqual(got, tc.output) {
			t.Errorf("expected %v, got %v", tc.output, got)
		}
	}
}
Esempio n. 9
0
// TestStopWordsFilter defines a "stop_tokens" filter with the stop list
// {"a", "in", "the"} and expects "a walk in the park" to be reduced to
// "walk", "park".
func TestStopWordsFilter(t *testing.T) {
	// stream turns a list of terms into a token stream of term-only tokens.
	stream := func(terms ...string) analysis.TokenStream {
		ts := analysis.TokenStream{}
		for _, term := range terms {
			ts = append(ts, &analysis.Token{Term: []byte(term)})
		}
		return ts
	}

	input := stream("a", "walk", "in", "the", "park")
	want := stream("walk", "park")

	cache := registry.NewCache()

	_, err := cache.DefineTokenMap("stop_test", map[string]interface{}{
		"type":   token_map.Name,
		"tokens": []interface{}{"a", "in", "the"},
	})
	if err != nil {
		t.Fatal(err)
	}

	stopFilter, err := cache.DefineTokenFilter("stop_test", map[string]interface{}{
		"type":           "stop_tokens",
		"stop_token_map": "stop_test",
	})
	if err != nil {
		t.Fatal(err)
	}

	if got := stopFilter.Filter(input); !reflect.DeepEqual(got, want) {
		t.Errorf("expected %#v got %#v", want, got)
	}
}
Esempio n. 10
0
// TestEnglishStemmer applies the token filter registered under StemmerName
// and expects ordinary tokens to be stemmed while a token flagged as KeyWord
// is left untouched.
func TestEnglishStemmer(t *testing.T) {
	// tok builds a token with the given term and keyword flag.
	tok := func(term string, keyword bool) *analysis.Token {
		return &analysis.Token{Term: []byte(term), KeyWord: keyword}
	}

	cases := []struct {
		input  analysis.TokenStream
		output analysis.TokenStream
	}{
		{
			input: analysis.TokenStream{
				tok("walking", false),
				tok("talked", false),
				tok("business", false),
				tok("protected", true),
			},
			output: analysis.TokenStream{
				tok("walk", false),
				tok("talk", false),
				tok("busi", false),
				tok("protected", true),
			},
		},
	}

	cache := registry.NewCache()
	stemmerFilter, err := cache.TokenFilterNamed(StemmerName)
	if err != nil {
		t.Fatal(err)
	}
	for _, tc := range cases {
		if got := stemmerFilter.Filter(tc.input); !reflect.DeepEqual(got, tc.output) {
			t.Errorf("expected %s, got %s", tc.output, got)
		}
	}
}
Esempio n. 11
0
// TestThaiAnalyzerWihtoutOffsets runs the analyzer registered under
// AnalyzerName over mixed Thai/Latin input and compares only the terms of the
// produced tokens (positions and offsets are ignored).
func TestThaiAnalyzerWihtoutOffsets(t *testing.T) {
	tests := []struct {
		input  []byte
		output analysis.TokenStream
	}{
		// stop words
		{
			// The input ends in an e-mail address; the expected terms
			// below ("xyz", "demo.com") are its two halves.
			input: []byte("บริษัทชื่อ XY&Z - คุยกับ xyz@demo.com"),
			output: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("บริษัท"),
				},
				&analysis.Token{
					Term: []byte("ชื่อ"),
				},
				&analysis.Token{
					Term: []byte("xy"),
				},
				&analysis.Token{
					Term: []byte("z"),
				},
				&analysis.Token{
					Term: []byte("คุย"),
				},
				&analysis.Token{
					Term: []byte("xyz"),
				},
				&analysis.Token{
					Term: []byte("demo.com"),
				},
			},
		},
	}

	cache := registry.NewCache()
	analyzer, err := cache.AnalyzerNamed(AnalyzerName)
	if err != nil {
		t.Fatal(err)
	}
	for _, test := range tests {
		actual := analyzer.Analyze(test.input)
		// Stop on a length mismatch: the loop below indexes test.output
		// by positions of actual and would otherwise run out of range.
		if len(actual) != len(test.output) {
			t.Fatalf("expected length: %d, got %d", len(test.output), len(actual))
		}
		for i, tok := range actual {
			if !reflect.DeepEqual(tok.Term, test.output[i].Term) {
				t.Errorf("expected term %s (% x) got %s (% x)", test.output[i].Term, test.output[i].Term, tok.Term, tok.Term)
			}
		}
	}
}
Esempio n. 12
0
// NewIndexMapping returns an IndexMapping populated with the package
// defaults: an empty type-mapping table, a fresh default document mapping,
// the default type field, type, analyzer, date/time parser, field and byte
// array converter, an empty custom analysis section and a new registry cache.
func NewIndexMapping() *IndexMapping {
	im := new(IndexMapping)
	im.TypeMapping = make(map[string]*DocumentMapping)
	im.DefaultMapping = NewDocumentMapping()
	im.TypeField = defaultTypeField
	im.DefaultType = defaultType
	im.DefaultAnalyzer = defaultAnalyzer
	im.DefaultDateTimeParser = defaultDateTimeParser
	im.DefaultField = defaultField
	im.ByteArrayConverter = defaultByteArrayConverter
	im.CustomAnalysis = newCustomAnalysis()
	im.cache = registry.NewCache()
	return im
}
Esempio n. 13
0
// TestJaAnalyzer runs the analyzer registered under AnalyzerName over
// Japanese input and compares the full token streams (term, type, position
// and offsets) with the expected ones.
func TestJaAnalyzer(t *testing.T) {
	tests := []struct {
		input  []byte
		output analysis.TokenStream
	}{
		{
			input: []byte("こんにちは世界"),
			output: analysis.TokenStream{
				&analysis.Token{
					Term:     []byte("こんにちは"),
					Type:     analysis.Ideographic,
					Position: 1,
					Start:    0,
					End:      15,
				},
				&analysis.Token{
					Term:     []byte("世界"),
					Type:     analysis.Ideographic,
					Position: 2,
					Start:    15,
					End:      21,
				},
			},
		},
		{
			input: []byte("カタカナ"),
			output: analysis.TokenStream{
				&analysis.Token{
					Term:     []byte("カタカナ"),
					Type:     analysis.Ideographic,
					Position: 1,
					Start:    0,
					End:      12,
				},
			},
		},
	}

	cache := registry.NewCache()
	// The analyzer does not depend on the test case, so look it up once
	// rather than on every iteration.
	analyzer, err := cache.AnalyzerNamed(AnalyzerName)
	if err != nil {
		t.Fatal(err)
	}
	for _, test := range tests {
		actual := analyzer.Analyze(test.input)
		if !reflect.DeepEqual(actual, test.output) {
			t.Errorf("expected %v, got %v", test.output, actual)
		}
	}
}
Esempio n. 14
0
// NewIndexMapping returns an IndexMapping populated with the package
// defaults: an empty type-mapping table, a fresh default document mapping,
// the default type field, type, analyzer, date/time parser and field, the
// package-level IndexDynamic and StoreDynamic settings, an empty custom
// analysis section and a new registry cache.
func NewIndexMapping() *IndexMapping {
	im := new(IndexMapping)
	im.TypeMapping = make(map[string]*DocumentMapping)
	im.DefaultMapping = NewDocumentMapping()
	im.TypeField = defaultTypeField
	im.DefaultType = defaultType
	im.DefaultAnalyzer = defaultAnalyzer
	im.DefaultDateTimeParser = defaultDateTimeParser
	im.DefaultField = defaultField
	im.IndexDynamic = IndexDynamic
	im.StoreDynamic = StoreDynamic
	im.CustomAnalysis = newCustomAnalysis()
	im.cache = registry.NewCache()
	return im
}
Esempio n. 15
0
// TestPortugueseAnalyzer runs the analyzer registered under AnalyzerName and
// compares the terms of the produced tokens with the expected ones.
func TestPortugueseAnalyzer(t *testing.T) {
	cases := []struct {
		input  []byte
		output analysis.TokenStream
	}{
		// Stemming cases are disabled because of stemming discrepancies:
		// the inputs "quilométricas" and "quilométricos" were both
		// expected to yield "quilometric", but "quilométr" is produced.

		// stop word
		{
			input:  []byte("não"),
			output: analysis.TokenStream{},
		},
	}

	cache := registry.NewCache()
	analyzer, err := cache.AnalyzerNamed(AnalyzerName)
	if err != nil {
		t.Fatal(err)
	}
	for _, tc := range cases {
		got := analyzer.Analyze(tc.input)
		if len(got) != len(tc.output) {
			t.Fatalf("expected length: %d, got %d", len(tc.output), len(got))
		}
		for i := range got {
			wantTerm, gotTerm := tc.output[i].Term, got[i].Term
			if !reflect.DeepEqual(gotTerm, wantTerm) {
				t.Errorf("expected term %s (% x) got %s (% x)", wantTerm, wantTerm, gotTerm, gotTerm)
			}
		}
	}
}
Esempio n. 16
0
// TestItalianAnalyzer runs the analyzer registered under AnalyzerName and
// compares the terms of the produced tokens with the expected ones.
func TestItalianAnalyzer(t *testing.T) {
	cases := []struct {
		input  []byte
		output analysis.TokenStream
	}{
		// Stemming cases are disabled because of stemming discrepancies:
		// the inputs "abbandonata" and "abbandonati" were both expected
		// to yield "abbandonat", but "abbandon" is produced.

		// stop word
		{
			input:  []byte("dallo"),
			output: analysis.TokenStream{},
		},
	}

	cache := registry.NewCache()
	analyzer, err := cache.AnalyzerNamed(AnalyzerName)
	if err != nil {
		t.Fatal(err)
	}
	for _, tc := range cases {
		got := analyzer.Analyze(tc.input)
		if len(got) != len(tc.output) {
			t.Fatalf("expected length: %d, got %d", len(tc.output), len(got))
		}
		for i := range got {
			wantTerm, gotTerm := tc.output[i].Term, got[i].Term
			if !reflect.DeepEqual(gotTerm, wantTerm) {
				t.Errorf("expected term %s (% x) got %s (% x)", wantTerm, wantTerm, gotTerm, gotTerm)
			}
		}
	}
}
Esempio n. 17
0
// BenchmarkAnalysis times, per iteration, the creation of a registry cache,
// the lookup of the standard analyzer, the analysis of bleveWikiArticle and
// the computation of token frequencies, which are expected to number 511.
func BenchmarkAnalysis(b *testing.B) {
	const wantFreqs = 511

	for n := 0; n < b.N; n++ {
		// The cache is deliberately rebuilt inside the timed loop.
		cache := registry.NewCache()
		analyzer, err := cache.AnalyzerNamed(standard_analyzer.Name)
		if err != nil {
			b.Fatal(err)
		}

		tokens := analyzer.Analyze(bleveWikiArticle)
		freqs := analysis.TokenFrequency(tokens, nil, true)
		if got := len(freqs); got != wantFreqs {
			b.Errorf("expected %d freqs, got %d", wantFreqs, got)
		}
	}
}
Esempio n. 18
0
// TestDanishAnalyzer runs the analyzer registered under AnalyzerName over
// Danish input and compares the full token streams (term, position and
// offsets) with the expected ones.
func TestDanishAnalyzer(t *testing.T) {
	// tok builds an expected token.
	tok := func(term string, position, start, end int) *analysis.Token {
		return &analysis.Token{
			Term:     []byte(term),
			Position: position,
			Start:    start,
			End:      end,
		}
	}

	cases := []struct {
		input  []byte
		output analysis.TokenStream
	}{
		// stemming
		{
			input:  []byte("undersøg"),
			output: analysis.TokenStream{tok("undersøg", 1, 0, 9)},
		},
		{
			input:  []byte("undersøgelse"),
			output: analysis.TokenStream{tok("undersøg", 1, 0, 13)},
		},
		// stop word
		{
			input:  []byte("på"),
			output: analysis.TokenStream{},
		},
	}

	cache := registry.NewCache()
	analyzer, err := cache.AnalyzerNamed(AnalyzerName)
	if err != nil {
		t.Fatal(err)
	}
	for _, tc := range cases {
		if got := analyzer.Analyze(tc.input); !reflect.DeepEqual(got, tc.output) {
			t.Errorf("expected %v, got %v", tc.output, got)
		}
	}
}
Esempio n. 19
0
// TestHungarianAnalyzer runs the analyzer registered under AnalyzerName over
// Hungarian input and compares the terms of the produced tokens with the
// expected ones.
func TestHungarianAnalyzer(t *testing.T) {
	// term builds an expected token carrying only a term.
	term := func(s string) *analysis.Token {
		return &analysis.Token{Term: []byte(s)}
	}

	cases := []struct {
		input  []byte
		output analysis.TokenStream
	}{
		// stemming
		{
			input:  []byte("babakocsi"),
			output: analysis.TokenStream{term("babakocs")},
		},
		{
			input:  []byte("babakocsijáért"),
			output: analysis.TokenStream{term("babakocs")},
		},
		// stop word
		{
			input:  []byte("által"),
			output: analysis.TokenStream{},
		},
	}

	cache := registry.NewCache()
	analyzer, err := cache.AnalyzerNamed(AnalyzerName)
	if err != nil {
		t.Fatal(err)
	}
	for _, tc := range cases {
		got := analyzer.Analyze(tc.input)
		if len(got) != len(tc.output) {
			t.Fatalf("expected length: %d, got %d", len(tc.output), len(got))
		}
		for i := range got {
			wantTerm, gotTerm := tc.output[i].Term, got[i].Term
			if !reflect.DeepEqual(gotTerm, wantTerm) {
				t.Errorf("expected term %s (% x) got %s (% x)", wantTerm, wantTerm, gotTerm, gotTerm)
			}
		}
	}
}
Esempio n. 20
0
// BenchmarkStopWordsFilter times a "stop_tokens" filter with the stop list
// {"a", "in", "the"} applied to the tokens of "a walk in the park". Filter
// construction is excluded from the timing.
func BenchmarkStopWordsFilter(b *testing.B) {
	// term builds a token carrying only a term.
	term := func(s string) *analysis.Token {
		return &analysis.Token{Term: []byte(s)}
	}

	input := analysis.TokenStream{
		term("a"),
		term("walk"),
		term("in"),
		term("the"),
		term("park"),
	}

	cache := registry.NewCache()

	_, err := cache.DefineTokenMap("stop_test", map[string]interface{}{
		"type":   token_map.Name,
		"tokens": []interface{}{"a", "in", "the"},
	})
	if err != nil {
		b.Fatal(err)
	}

	stopFilter, err := cache.DefineTokenFilter("stop_test", map[string]interface{}{
		"type":           "stop_tokens",
		"stop_token_map": "stop_test",
	})
	if err != nil {
		b.Fatal(err)
	}

	b.ResetTimer()
	for n := 0; n < b.N; n++ {
		stopFilter.Filter(input)
	}
}
Esempio n. 21
0
// CommonBenchmarkIndex measures single-document indexing against a KV store.
//
// Each of the b.N iterations obtains a store from create, opens an
// UpsideDownCouch index on it using an analysis queue with analysisWorkers
// workers, and indexes one document. Only the idx.Update call is timed; store
// creation, index open/close, destroy and queue shutdown run with the timer
// stopped.
func CommonBenchmarkIndex(b *testing.B, create KVStoreCreate, destroy KVStoreDestroy, analysisWorkers int) {
	cache := registry.NewCache()
	standard, err := cache.AnalyzerNamed("standard")
	if err != nil {
		b.Fatal(err)
	}

	// One document is reused for every iteration; only its ID changes.
	bodyField := document.NewTextFieldWithAnalyzer("body", []uint64{}, []byte(benchmarkDocBodies[0]), standard)
	doc := document.NewDocument("").AddField(bodyField)

	b.ResetTimer()
	b.StopTimer()
	for n := 0; n < b.N; n++ {
		kv, err := create()
		if err != nil {
			b.Fatal(err)
		}
		queue := index.NewAnalysisQueue(analysisWorkers)
		idx := NewUpsideDownCouch(kv, queue)

		if err = idx.Open(); err != nil {
			b.Fatal(err)
		}
		doc.ID = strconv.Itoa(n)

		// Timed section: the update itself.
		b.StartTimer()
		if err = idx.Update(doc); err != nil {
			b.Fatal(err)
		}
		b.StopTimer()

		if err = idx.Close(); err != nil {
			b.Fatal(err)
		}
		if err = destroy(); err != nil {
			b.Fatal(err)
		}
		queue.Close()
	}
}
Esempio n. 22
0
// TestThaiAnalyzer runs the analyzer registered under AnalyzerName over Thai
// input and compares the full token streams (term, position and offsets)
// with the expected ones.
func TestThaiAnalyzer(t *testing.T) {
	// tok builds an expected token.
	tok := func(term string, position, start, end int) *analysis.Token {
		return &analysis.Token{
			Term:     []byte(term),
			Position: position,
			Start:    start,
			End:      end,
		}
	}

	cases := []struct {
		input  []byte
		output analysis.TokenStream
	}{
		// stop words
		{
			input: []byte("การที่ได้ต้องแสดงว่างานดี"),
			output: analysis.TokenStream{
				tok("แสดง", 5, 39, 51),
				tok("งาน", 7, 60, 69),
				tok("ดี", 8, 69, 75),
			},
		},
	}

	cache := registry.NewCache()
	analyzer, err := cache.AnalyzerNamed(AnalyzerName)
	if err != nil {
		t.Fatal(err)
	}
	for _, tc := range cases {
		if got := analyzer.Analyze(tc.input); !reflect.DeepEqual(got, tc.output) {
			t.Errorf("expected %v, got %v", tc.output, got)
		}
	}
}
Esempio n. 23
0
// TestHindiAnalyzer runs the analyzer registered under AnalyzerName over two
// spellings of the word "hindi" and expects both to produce the same term,
// with position and offsets matching the respective input.
func TestHindiAnalyzer(t *testing.T) {
	// tok builds an expected token.
	tok := func(term string, position, start, end int) *analysis.Token {
		return &analysis.Token{
			Term:     []byte(term),
			Position: position,
			Start:    start,
			End:      end,
		}
	}

	cases := []struct {
		input  []byte
		output analysis.TokenStream
	}{
		// two ways to write 'hindi' itself
		{
			input:  []byte("हिन्दी"),
			output: analysis.TokenStream{tok("हिंद", 1, 0, 18)},
		},
		{
			input:  []byte("हिंदी"),
			output: analysis.TokenStream{tok("हिंद", 1, 0, 15)},
		},
	}

	cache := registry.NewCache()
	analyzer, err := cache.AnalyzerNamed(AnalyzerName)
	if err != nil {
		t.Fatal(err)
	}
	for _, tc := range cases {
		if got := analyzer.Analyze(tc.input); !reflect.DeepEqual(got, tc.output) {
			t.Errorf("expected %v, got %v", tc.output, got)
		}
	}
}
Esempio n. 24
0
// CommonBenchmarkIndex times b.N single-document updates against an
// UpsideDownCouch index built on the supplied store s, using an analysis
// queue with analysisWorkers workers. Index and document construction are
// excluded from the timing.
func CommonBenchmarkIndex(b *testing.B, s store.KVStore, analysisWorkers int) {
	queue := NewAnalysisQueue(analysisWorkers)
	idx := NewUpsideDownCouch(s, queue)

	cache := registry.NewCache()
	standard, err := cache.AnalyzerNamed("standard")
	if err != nil {
		b.Fatal(err)
	}

	// One document is reused for every iteration; only its ID changes.
	bodyField := document.NewTextFieldWithAnalyzer("body", []uint64{}, []byte(benchmarkDocBodies[0]), standard)
	doc := document.NewDocument("").AddField(bodyField)

	b.ResetTimer()
	for n := 0; n < b.N; n++ {
		doc.ID = strconv.Itoa(n)
		if err := idx.Update(doc); err != nil {
			b.Fatal(err)
		}
	}
}
Esempio n. 25
0
// CommonBenchmarkIndexBatch times batched indexing against an UpsideDownCouch
// index built on the supplied store s, using an analysis queue with
// analysisWorkers workers. Each of the b.N iterations indexes 1000 documents
// in batches of batchSize.
//
// batchSize must be positive; the benchmark fails immediately otherwise.
func CommonBenchmarkIndexBatch(b *testing.B, s store.KVStore, analysisWorkers, batchSize int) {
	// j%batchSize below panics with an integer divide by zero when batchSize
	// is 0, and a negative size is meaningless, so reject both up front.
	if batchSize <= 0 {
		b.Fatalf("batchSize must be positive, got %d", batchSize)
	}

	analysisQueue := NewAnalysisQueue(analysisWorkers)
	idx := NewUpsideDownCouch(s, analysisQueue)

	cache := registry.NewCache()
	analyzer, err := cache.AnalyzerNamed("standard")
	if err != nil {
		b.Fatal(err)
	}

	b.ResetTimer()
	for i := 0; i < b.N; i++ {

		var batch index.Batch
		for j := 0; j < 1000; j++ {
			// Every batchSize documents, flush the current batch (if it
			// holds anything) and start a new one.
			if j%batchSize == 0 {
				if len(batch) > 0 {
					err := idx.Batch(batch)
					if err != nil {
						b.Fatal(err)
					}
				}
				batch = make(index.Batch)
			}
			indexDocument := document.NewDocument("").
				AddField(document.NewTextFieldWithAnalyzer("body", []uint64{}, []byte(benchmarkDocBodies[j%10]), analyzer))
			// IDs are unique across iterations and documents.
			indexDocument.ID = strconv.Itoa(i) + "-" + strconv.Itoa(j)
			batch[indexDocument.ID] = indexDocument
		}
		// close last batch
		if len(batch) > 0 {
			err := idx.Batch(batch)
			if err != nil {
				b.Fatal(err)
			}
		}

	}
}
Esempio n. 26
0
// TestAnalysisBug328 analyzes a document with "title" and "body" fields that
// both contain "bleve", plus a composite "_all" field, and checks the term
// vectors of every "bleve" term-frequency row: a vector may name a field
// other than the row's own only when the row belongs to "_all".
func TestAnalysisBug328(t *testing.T) {
	cache := registry.NewCache()
	standard, err := cache.AnalyzerNamed(standard_analyzer.Name)
	if err != nil {
		t.Fatal(err)
	}

	queue := index.NewAnalysisQueue(1)
	idx, err := NewFirestorm(gtreap.Name, nil, queue)
	if err != nil {
		t.Fatal(err)
	}

	const fieldOpts = document.IndexField | document.IncludeTermVectors

	doc := document.NewDocument("1")
	doc.AddField(document.NewTextFieldCustom("title", nil, []byte("bleve"), fieldOpts, standard))
	doc.AddField(document.NewTextFieldCustom("body", nil, []byte("bleve"), fieldOpts, standard))
	doc.AddField(document.NewCompositeFieldWithIndexingOptions("_all", true, []string{}, []string{}, fieldOpts))

	result := idx.Analyze(doc)

	// Field rows seen so far, keyed by field index, used to name the field
	// of later term-frequency rows.
	fieldNames := make(map[uint16]string)
	for _, row := range result.Rows {
		switch r := row.(type) {
		case *FieldRow:
			fieldNames[r.index] = r.Name()
		case *TermFreqRow:
			if string(r.term) != "bleve" {
				continue
			}
			for _, vec := range r.Vectors() {
				if vec.GetField() == uint32(r.field) {
					continue
				}
				if fieldNames[r.field] != "_all" {
					t.Errorf("row named %s field %d - vector field %d", fieldNames[r.field], r.field, vec.GetField())
				}
			}
		}
	}
}
Esempio n. 27
0
// BenchmarkEnglishPossessiveFilter times the token filter registered under
// PossessiveName on a fixed stream of possessive forms written with
// different apostrophe characters and letter cases, plus a one-letter token.
func BenchmarkEnglishPossessiveFilter(b *testing.B) {
	terms := []string{
		"marty's",
		"MARTY'S",
		"marty’s",
		"MARTY’S",
		"marty's",
		"MARTY'S",
		"m",
	}
	input := make(analysis.TokenStream, 0, len(terms))
	for _, term := range terms {
		input = append(input, &analysis.Token{Term: []byte(term)})
	}

	cache := registry.NewCache()
	possessiveFilter, err := cache.TokenFilterNamed(PossessiveName)
	if err != nil {
		b.Fatal(err)
	}

	b.ResetTimer()
	for n := 0; n < b.N; n++ {
		possessiveFilter.Filter(input)
	}
}
Esempio n. 28
0
// UnmarshalJSON decodes an index mapping from JSON.
//
// All fields are first reset to their defaults, so keys omitted from data
// keep the default value. Each recognised top-level key is then decoded into
// its field. Unrecognised keys are collected and, when MappingJSONStrict is
// set, reported as an error. Finally the custom analysis components are
// registered with the mapping.
func (im *IndexMapping) UnmarshalJSON(data []byte) error {
	var raw map[string]json.RawMessage
	if err := json.Unmarshal(data, &raw); err != nil {
		return err
	}

	// set defaults for fields which might have been omitted
	im.cache = registry.NewCache()
	im.CustomAnalysis = newCustomAnalysis()
	im.TypeField = defaultTypeField
	im.DefaultType = defaultType
	im.DefaultAnalyzer = defaultAnalyzer
	im.DefaultDateTimeParser = defaultDateTimeParser
	im.DefaultField = defaultField
	im.ByteArrayConverter = defaultByteArrayConverter
	im.DefaultMapping = NewDocumentMapping()
	im.TypeMapping = make(map[string]*DocumentMapping)
	im.StoreDynamic = StoreDynamic
	im.IndexDynamic = IndexDynamic

	// Destination for every recognised top-level key.
	targets := map[string]interface{}{
		"analysis":                &im.CustomAnalysis,
		"type_field":              &im.TypeField,
		"default_type":            &im.DefaultType,
		"default_analyzer":        &im.DefaultAnalyzer,
		"default_datetime_parser": &im.DefaultDateTimeParser,
		"default_field":           &im.DefaultField,
		"byte_array_converter":    &im.ByteArrayConverter,
		"default_mapping":         &im.DefaultMapping,
		"types":                   &im.TypeMapping,
		"store_dynamic":           &im.StoreDynamic,
		"index_dynamic":           &im.IndexDynamic,
	}

	var invalidKeys []string
	for key, value := range raw {
		target, known := targets[key]
		if !known {
			invalidKeys = append(invalidKeys, key)
			continue
		}
		if err := json.Unmarshal(value, target); err != nil {
			return err
		}
	}

	if MappingJSONStrict && len(invalidKeys) > 0 {
		return fmt.Errorf("index mapping contains invalid keys: %v", invalidKeys)
	}

	return im.CustomAnalysis.registerAll(im)
}
Esempio n. 29
0
// TestArabicAnalyzer runs the analyzer registered under AnalyzerName over
// Arabic input and compares the full token streams (term, position and
// offsets) with the expected ones.
func TestArabicAnalyzer(t *testing.T) {
	// tok builds an expected token.
	tok := func(term string, position, start, end int) *analysis.Token {
		return &analysis.Token{
			Term:     []byte(term),
			Position: position,
			Start:    start,
			End:      end,
		}
	}

	tests := []struct {
		input  []byte
		output analysis.TokenStream
	}{
		{
			input:  []byte("كبير"),
			output: analysis.TokenStream{tok("كبير", 1, 0, 8)},
		},
		// feminine marker
		{
			input:  []byte("كبيرة"),
			output: analysis.TokenStream{tok("كبير", 1, 0, 10)},
		},
		{
			input:  []byte("مشروب"),
			output: analysis.TokenStream{tok("مشروب", 1, 0, 10)},
		},
		// plural -at
		{
			input:  []byte("مشروبات"),
			output: analysis.TokenStream{tok("مشروب", 1, 0, 14)},
		},
		// plural -in
		{
			input:  []byte("أمريكيين"),
			output: analysis.TokenStream{tok("امريك", 1, 0, 16)},
		},
		// singular with bare alif
		{
			input:  []byte("امريكي"),
			output: analysis.TokenStream{tok("امريك", 1, 0, 12)},
		},
		{
			input:  []byte("كتاب"),
			output: analysis.TokenStream{tok("كتاب", 1, 0, 8)},
		},
		// definite article
		{
			input:  []byte("الكتاب"),
			output: analysis.TokenStream{tok("كتاب", 1, 0, 12)},
		},
		{
			input: []byte("ما ملكت أيمانكم"),
			output: analysis.TokenStream{
				tok("ملكت", 2, 5, 13),
				tok("ايمانكم", 3, 14, 28),
			},
		},
		// stopwords
		{
			input: []byte("الذين ملكت أيمانكم"),
			output: analysis.TokenStream{
				tok("ملكت", 2, 11, 19),
				tok("ايمانكم", 3, 20, 34),
			},
		},
		// presentation form normalization
		{
			input:  []byte("ﺍﻟﺴﻼﻢ"),
			output: analysis.TokenStream{tok("سلام", 1, 0, 15)},
		},
	}

	cache := registry.NewCache()
	analyzer, err := cache.AnalyzerNamed(AnalyzerName)
	if err != nil {
		t.Fatal(err)
	}
	for _, test := range tests {
		actual := analyzer.Analyze(test.input)
		if !reflect.DeepEqual(actual, test.output) {
			t.Errorf("expected %v, got %v", test.output, actual)
			// The hex dump of the first terms is only possible when both
			// streams hold a token; skipping it avoids an index-out-of-range
			// panic when the analyzer returns nothing.
			if len(test.output) > 0 && len(actual) > 0 {
				t.Errorf("expected % x, got % x", test.output[0].Term, actual[0].Term)
			}
		}
	}
}
Esempio n. 30
0
// TestFrenchAnalyzer runs the analyzer registered under AnalyzerName over
// French input and compares the terms of the produced tokens with the
// expected ones (positions and offsets are not checked).
func TestFrenchAnalyzer(t *testing.T) {
	// terms builds an expected token stream of term-only tokens. With no
	// arguments it yields an empty, non-nil stream.
	terms := func(list ...string) analysis.TokenStream {
		ts := analysis.TokenStream{}
		for _, term := range list {
			ts = append(ts, &analysis.Token{Term: []byte(term)})
		}
		return ts
	}

	cases := []struct {
		input  []byte
		output analysis.TokenStream
	}{
		{
			input:  []byte(""),
			output: terms(),
		},
		{
			input:  []byte("chien chat cheval"),
			output: terms("chien", "chat", "cheval"),
		},
		{
			input:  []byte("chien CHAT CHEVAL"),
			output: terms("chien", "chat", "cheval"),
		},
		{
			input:  []byte("  chien  ,? + = -  CHAT /: > CHEVAL"),
			output: terms("chien", "chat", "cheval"),
		},
		{
			input:  []byte("chien++"),
			output: terms("chien"),
		},
		{
			input:  []byte("mot \"entreguillemet\""),
			output: terms("mot", "entreguilemet"),
		},
		{
			input:  []byte("Jean-François"),
			output: terms("jean", "francoi"),
		},
		// stop words
		{
			input:  []byte("le la chien les aux chat du des à cheval"),
			output: terms("chien", "chat", "cheval"),
		},
		// nouns and adjectives
		{
			input:  []byte("lances chismes habitable chiste éléments captifs"),
			output: terms("lanc", "chism", "habitabl", "chist", "element", "captif"),
		},
		// verbs
		{
			input:  []byte("finissions souffrirent rugissante"),
			output: terms("finision", "soufrirent", "rugisant"),
		},
		{
			input:  []byte("C3PO aujourd'hui oeuf ïâöûàä anticonstitutionnellement Java++ "),
			output: terms("c3po", "aujourd'hui", "oeuf", "ïaöuaä", "anticonstitutionel", "java"),
		},
	}

	cache := registry.NewCache()
	analyzer, err := cache.AnalyzerNamed(AnalyzerName)
	if err != nil {
		t.Fatal(err)
	}
	for _, tc := range cases {
		got := analyzer.Analyze(tc.input)
		if len(got) != len(tc.output) {
			t.Fatalf("expected length: %d, got %d", len(tc.output), len(got))
		}
		for i := range got {
			wantTerm, gotTerm := tc.output[i].Term, got[i].Term
			if !reflect.DeepEqual(gotTerm, wantTerm) {
				t.Errorf("expected term %s (% x) got %s (% x)", wantTerm, wantTerm, gotTerm, gotTerm)
			}
		}
	}
}