func BenchmarkTokenizeEnglishText(b *testing.B) {

	tokenizer := regexp_tokenizer.NewRegexpTokenizer(whitespaceTokenizerRegexp)
	b.ResetTimer()

	for i := 0; i < b.N; i++ {
		tokenizer.Tokenize(sampleLargeInput)
	}

}
Beispiel #2
0
	}
	err = twoDocIndex.Open()
	if err != nil {
		panic(err)
	}
	for _, doc := range twoDocIndexDocs {
		err := twoDocIndex.Update(doc)
		if err != nil {
			panic(err)
		}
	}
}

// create a simpler analyzer which will support these tests
var testAnalyzer = &analysis.Analyzer{
	Tokenizer: regexp_tokenizer.NewRegexpTokenizer(regexp.MustCompile(`\w+`)),
}

// sets up some mock data used in many tests in this package
var twoDocIndexDescIndexingOptions = document.DefaultTextIndexingOptions | document.IncludeTermVectors

var twoDocIndexDocs = []*document.Document{
	// must have 4/4 beer
	document.NewDocument("1").
		AddField(document.NewTextField("name", []uint64{}, []byte("marty"))).
		AddField(document.NewTextFieldCustom("desc", []uint64{}, []byte("beer beer beer beer"), twoDocIndexDescIndexingOptions, testAnalyzer)).
		AddField(document.NewTextFieldWithAnalyzer("street", []uint64{}, []byte("couchbase way"), testAnalyzer)),
	// must have 1/4 beer
	document.NewDocument("2").
		AddField(document.NewTextField("name", []uint64{}, []byte("steve"))).
		AddField(document.NewTextFieldCustom("desc", []uint64{}, []byte("angst beer couch database"), twoDocIndexDescIndexingOptions, testAnalyzer)).
func TestBoundary(t *testing.T) {

	tests := []struct {
		input  []byte
		output analysis.TokenStream
	}{
		{
			[]byte("Hello World."),
			analysis.TokenStream{
				{
					Start:    0,
					End:      5,
					Term:     []byte("Hello"),
					Position: 1,
					Type:     analysis.AlphaNumeric,
				},
				{
					Start:    6,
					End:      11,
					Term:     []byte("World"),
					Position: 2,
					Type:     analysis.AlphaNumeric,
				},
			},
		},
		{
			[]byte("こんにちは世界"),
			analysis.TokenStream{
				{
					Start:    0,
					End:      3,
					Term:     []byte("こ"),
					Position: 1,
					Type:     analysis.Ideographic,
				},
				{
					Start:    3,
					End:      6,
					Term:     []byte("ん"),
					Position: 2,
					Type:     analysis.Ideographic,
				},
				{
					Start:    6,
					End:      9,
					Term:     []byte("に"),
					Position: 3,
					Type:     analysis.Ideographic,
				},
				{
					Start:    9,
					End:      12,
					Term:     []byte("ち"),
					Position: 4,
					Type:     analysis.Ideographic,
				},
				{
					Start:    12,
					End:      15,
					Term:     []byte("は"),
					Position: 5,
					Type:     analysis.Ideographic,
				},
				{
					Start:    15,
					End:      18,
					Term:     []byte("世"),
					Position: 6,
					Type:     analysis.Ideographic,
				},
				{
					Start:    18,
					End:      21,
					Term:     []byte("界"),
					Position: 7,
					Type:     analysis.Ideographic,
				},
			},
		},
		{
			[]byte(""),
			analysis.TokenStream{},
		},
		{
			[]byte("abc界"),
			analysis.TokenStream{
				{
					Start:    0,
					End:      3,
					Term:     []byte("abc"),
					Position: 1,
					Type:     analysis.AlphaNumeric,
				},
				{
					Start:    3,
					End:      6,
					Term:     []byte("界"),
					Position: 2,
					Type:     analysis.Ideographic,
				},
			},
		},
	}

	for _, test := range tests {
		tokenizer := regexp_tokenizer.NewRegexpTokenizer(whitespaceTokenizerRegexp)
		actual := tokenizer.Tokenize(test.input)

		if !reflect.DeepEqual(actual, test.output) {
			t.Errorf("Expected %v, got %v for %s", test.output, actual, string(test.input))
		}
	}
}
Beispiel #4
0
func TokenizerConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.Tokenizer, error) {
	return regexp_tokenizer.NewRegexpTokenizer(whitespaceTokenizerRegexp), nil
}