func TestLookupWithPartialLocations(t *testing.T) { var indexer Indexer indexer.Init(types.IndexerInitOptions{IndexType: types.LocationsIndex}) // doc0 = "token2 token4 token4 token2 token3 token4" + "label1"(不在文本中) indexer.AddDocument(&types.DocumentIndex{ DocId: 0, Keywords: []types.KeywordIndex{ {"token2", 0, []int{0, 21}}, {"token3", 0, []int{28}}, {"label1", 0, []int{}}, {"token4", 0, []int{7, 14, 35}}, }, }) // doc1 = "token2 token4 token4 token2 token3 token4" indexer.AddDocument(&types.DocumentIndex{ DocId: 1, Keywords: []types.KeywordIndex{ {"token2", 0, []int{0, 21}}, {"token3", 0, []int{28}}, {"token4", 0, []int{7, 14, 35}}, }, }) utils.Expect(t, "0 ", indicesToString(&indexer, "label1")) utils.Expect(t, "[0 1 [21 28]] ", indexedDocsToString(indexer.Lookup([]string{"token2", "token3"}, []string{"label1"}, nil, false))) }
func TestEngineIndexDocumentWithPersistentStorage(t *testing.T) { gob.Register(ScoringFields{}) var engine Engine engine.Init(types.EngineInitOptions{ SegmenterDictionaries: "../testdata/test_dict.txt", DefaultRankOptions: &types.RankOptions{ OutputOffset: 0, MaxOutputs: 10, ScoringCriteria: &RankByTokenProximity{}, }, IndexerInitOptions: &types.IndexerInitOptions{ IndexType: types.LocationsIndex, }, UsePersistentStorage: true, PersistentStorageFolder: "wukong.persistent", PersistentStorageShards: 2, }) AddDocs(&engine) engine.RemoveDocument(4) engine.Close() var engine1 Engine engine1.Init(types.EngineInitOptions{ SegmenterDictionaries: "../testdata/test_dict.txt", DefaultRankOptions: &types.RankOptions{ OutputOffset: 0, MaxOutputs: 10, ScoringCriteria: &RankByTokenProximity{}, }, IndexerInitOptions: &types.IndexerInitOptions{ IndexType: types.LocationsIndex, }, UsePersistentStorage: true, PersistentStorageFolder: "wukong.persistent", PersistentStorageShards: 2, }) outputs := engine1.Search(types.SearchRequest{Text: "中国人口"}) utils.Expect(t, "2", len(outputs.Tokens)) utils.Expect(t, "中国", outputs.Tokens[0]) utils.Expect(t, "人口", outputs.Tokens[1]) utils.Expect(t, "2", len(outputs.Docs)) utils.Expect(t, "1", outputs.Docs[0].DocId) utils.Expect(t, "1000", int(outputs.Docs[0].Scores[0]*1000)) utils.Expect(t, "[0 6]", outputs.Docs[0].TokenSnippetLocations) utils.Expect(t, "0", outputs.Docs[1].DocId) utils.Expect(t, "76", int(outputs.Docs[1].Scores[0]*1000)) utils.Expect(t, "[0 18]", outputs.Docs[1].TokenSnippetLocations) engine1.Close() os.RemoveAll("wukong.persistent") }
func TestRemoveDocument(t *testing.T) { var ranker Ranker ranker.Init() ranker.AddDoc(1, DummyScoringFields{ label: "label3", counter: 3, amount: 22.3, }) ranker.AddDoc(2, DummyScoringFields{ label: "label4", counter: 1, amount: 2, }) ranker.AddDoc(3, DummyScoringFields{ label: "label1", counter: 7, amount: 10.3, }) ranker.RemoveDoc(3) criteria := DummyScoringCriteria{} scoredDocs, _ := ranker.Rank([]types.IndexedDocument{ types.IndexedDocument{DocId: 1, TokenProximity: 6}, types.IndexedDocument{DocId: 2, TokenProximity: -1}, types.IndexedDocument{DocId: 3, TokenProximity: 24}, types.IndexedDocument{DocId: 4, TokenProximity: 18}, }, types.RankOptions{ScoringCriteria: criteria}, false) utils.Expect(t, "[1 [25300 ]] [2 [3000 ]] ", scoredDocsToString(scoredDocs)) }
func TestLookupWithBM25(t *testing.T) { var indexer Indexer indexer.Init(types.IndexerInitOptions{ IndexType: types.FrequenciesIndex, BM25Parameters: &types.BM25Parameters{ K1: 1, B: 1, }, }) // doc0 = "token2 token4 token4 token2 token3 token4" indexer.AddDocument(&types.DocumentIndex{ DocId: 0, TokenLength: 6, Keywords: []types.KeywordIndex{ {"token2", 3, []int{0, 21}}, {"token3", 7, []int{28}}, {"token4", 15, []int{7, 14, 35}}, }, }) // doc0 = "token6 token7" indexer.AddDocument(&types.DocumentIndex{ DocId: 1, TokenLength: 2, Keywords: []types.KeywordIndex{ {"token6", 3, []int{0}}, {"token7", 15, []int{7}}, }, }) outputs, _ := indexer.Lookup([]string{"token2", "token3", "token4"}, []string{}, nil, false) // BM25 = log2(3) * (12/9 + 28/17 + 60/33) = 6.3433 utils.Expect(t, "76055", int(outputs[0].BM25*10000)) }
func TestRemoveDocument(t *testing.T) { var engine Engine engine.Init(types.EngineInitOptions{ SegmenterDictionaries: "../testdata/test_dict.txt", DefaultRankOptions: &types.RankOptions{ ScoringCriteria: TestScoringCriteria{}, }, }) AddDocs(&engine) engine.RemoveDocument(4) outputs := engine.Search(types.SearchRequest{Text: "中国人口"}) utils.Expect(t, "1", len(outputs.Docs)) utils.Expect(t, "0", outputs.Docs[0].DocId) utils.Expect(t, "6000", int(outputs.Docs[0].Scores[0]*1000)) }
func TestOpenOrCreateBolt(t *testing.T) { db, err := openBoltStorage("bolt_test") utils.Expect(t, "<nil>", err) db.Close() db, err = openBoltStorage("bolt_test") utils.Expect(t, "<nil>", err) err = db.Set([]byte("key1"), []byte("value1")) utils.Expect(t, "<nil>", err) buffer := make([]byte, 100) buffer, err = db.Get([]byte("key1")) utils.Expect(t, "<nil>", err) utils.Expect(t, "value1", string(buffer)) walFile := db.WALName() db.Close() os.Remove(walFile) os.Remove("bolt_test") }
func TestLookupWithProximity(t *testing.T) { var indexer Indexer indexer.Init(types.IndexerInitOptions{IndexType: types.LocationsIndex}) // doc0 = "token2 token4 token4 token2 token3 token4" indexer.AddDocument(&types.DocumentIndex{ DocId: 0, Keywords: []types.KeywordIndex{ {"token2", 0, []int{0, 21}}, {"token3", 0, []int{28}}, {"token4", 0, []int{7, 14, 35}}, }, }) utils.Expect(t, "[0 1 [21 28]] ", indexedDocsToString(indexer.Lookup([]string{"token2", "token3"}, []string{}, nil, false))) // doc0 = "t2 t1 . . . t2 t3" indexer.AddDocument(&types.DocumentIndex{ DocId: 0, Keywords: []types.KeywordIndex{ {"t1", 0, []int{3}}, {"t2", 0, []int{0, 12}}, {"t3", 0, []int{15}}, }, }) utils.Expect(t, "[0 8 [3 12 15]] ", indexedDocsToString(indexer.Lookup([]string{"t1", "t2", "t3"}, []string{}, nil, false))) // doc0 = "t3 t2 t1 . . . . . t2 t3" indexer.AddDocument(&types.DocumentIndex{ DocId: 0, Keywords: []types.KeywordIndex{ {"t1", 0, []int{6}}, {"t2", 0, []int{3, 19}}, {"t3", 0, []int{0, 22}}, }, }) utils.Expect(t, "[0 10 [6 3 0]] ", indexedDocsToString(indexer.Lookup([]string{"t1", "t2", "t3"}, []string{}, nil, false))) }
func TestAddKeywords(t *testing.T) { var indexer Indexer indexer.Init(types.IndexerInitOptions{IndexType: types.LocationsIndex}) indexer.AddDocument(&types.DocumentIndex{ DocId: 1, Keywords: []types.KeywordIndex{{"token1", 0, []int{}}}, }) indexer.AddDocument(&types.DocumentIndex{ DocId: 7, Keywords: []types.KeywordIndex{{"token1", 0, []int{}}}, }) indexer.AddDocument(&types.DocumentIndex{ DocId: 2, Keywords: []types.KeywordIndex{{"token1", 0, []int{}}}, }) indexer.AddDocument(&types.DocumentIndex{ DocId: 3, Keywords: []types.KeywordIndex{{"token2", 0, []int{}}}, }) indexer.AddDocument(&types.DocumentIndex{ DocId: 1, Keywords: []types.KeywordIndex{{"token1", 0, []int{}}}, }) indexer.AddDocument(&types.DocumentIndex{ DocId: 1, Keywords: []types.KeywordIndex{{"token2", 0, []int{}}}, }) indexer.AddDocument(&types.DocumentIndex{ DocId: 2, Keywords: []types.KeywordIndex{{"token2", 0, []int{}}}, }) indexer.AddDocument(&types.DocumentIndex{ DocId: 0, Keywords: []types.KeywordIndex{{"token2", 0, []int{}}}, }) utils.Expect(t, "1 2 7 ", indicesToString(&indexer, "token1")) utils.Expect(t, "0 1 2 3 ", indicesToString(&indexer, "token2")) }
func TestRankDocument(t *testing.T) { var ranker Ranker ranker.Init() ranker.AddDoc(1, DummyScoringFields{}) ranker.AddDoc(3, DummyScoringFields{}) ranker.AddDoc(4, DummyScoringFields{}) scoredDocs, _ := ranker.Rank([]types.IndexedDocument{ types.IndexedDocument{DocId: 1, BM25: 6}, types.IndexedDocument{DocId: 3, BM25: 24}, types.IndexedDocument{DocId: 4, BM25: 18}, }, types.RankOptions{ScoringCriteria: types.RankByBM25{}}, false) utils.Expect(t, "[3 [24000 ]] [4 [18000 ]] [1 [6000 ]] ", scoredDocsToString(scoredDocs)) scoredDocs, _ = ranker.Rank([]types.IndexedDocument{ types.IndexedDocument{DocId: 1, BM25: 6}, types.IndexedDocument{DocId: 3, BM25: 24}, types.IndexedDocument{DocId: 2, BM25: 0}, types.IndexedDocument{DocId: 4, BM25: 18}, }, types.RankOptions{ScoringCriteria: types.RankByBM25{}, ReverseOrder: true}, false) // doc0因为没有AddDoc所以没有添加进来 utils.Expect(t, "[1 [6000 ]] [4 [18000 ]] [3 [24000 ]] ", scoredDocsToString(scoredDocs)) }
func TestFrequenciesIndex(t *testing.T) { var engine Engine engine.Init(types.EngineInitOptions{ SegmenterDictionaries: "../testdata/test_dict.txt", DefaultRankOptions: &types.RankOptions{ ScoringCriteria: BM25ScoringCriteria{}, }, IndexerInitOptions: &types.IndexerInitOptions{ IndexType: types.FrequenciesIndex, }, }) AddDocs(&engine) outputs := engine.Search(types.SearchRequest{Text: "中国人口"}) utils.Expect(t, "2", len(outputs.Docs)) utils.Expect(t, "4", outputs.Docs[0].DocId) utils.Expect(t, "2311", int(outputs.Docs[0].Scores[0]*1000)) utils.Expect(t, "0", outputs.Docs[1].DocId) utils.Expect(t, "2211", int(outputs.Docs[1].Scores[0]*1000)) }
func TestCountDocsOnly(t *testing.T) { var engine Engine engine.Init(types.EngineInitOptions{ SegmenterDictionaries: "../testdata/test_dict.txt", DefaultRankOptions: &types.RankOptions{ ReverseOrder: true, OutputOffset: 0, MaxOutputs: 1, ScoringCriteria: &RankByTokenProximity{}, }, IndexerInitOptions: &types.IndexerInitOptions{ IndexType: types.LocationsIndex, }, }) AddDocs(&engine) engine.RemoveDocument(4) outputs := engine.Search(types.SearchRequest{Text: "中国人口", CountDocsOnly: true}) utils.Expect(t, "0", len(outputs.Docs)) utils.Expect(t, "2", len(outputs.Tokens)) utils.Expect(t, "2", outputs.NumDocs) }
func TestOffsetAndMaxOutputs(t *testing.T) { var engine Engine engine.Init(types.EngineInitOptions{ SegmenterDictionaries: "../testdata/test_dict.txt", DefaultRankOptions: &types.RankOptions{ ReverseOrder: true, OutputOffset: 1, MaxOutputs: 3, ScoringCriteria: &RankByTokenProximity{}, }, IndexerInitOptions: &types.IndexerInitOptions{ IndexType: types.LocationsIndex, }, }) AddDocs(&engine) outputs := engine.Search(types.SearchRequest{Text: "中国人口"}) utils.Expect(t, "2", len(outputs.Docs)) utils.Expect(t, "4", outputs.Docs[0].DocId) utils.Expect(t, "1", outputs.Docs[1].DocId) }
func TestSearchWithin(t *testing.T) { var engine Engine engine.Init(types.EngineInitOptions{ SegmenterDictionaries: "../testdata/test_dict.txt", DefaultRankOptions: &types.RankOptions{ ReverseOrder: true, OutputOffset: 0, MaxOutputs: 10, ScoringCriteria: &RankByTokenProximity{}, }, IndexerInitOptions: &types.IndexerInitOptions{ IndexType: types.LocationsIndex, }, }) AddDocs(&engine) docIds := make(map[uint64]bool) docIds[4] = true docIds[0] = true outputs := engine.Search(types.SearchRequest{ Text: "中国人口", DocIds: docIds, }) utils.Expect(t, "2", len(outputs.Tokens)) utils.Expect(t, "中国", outputs.Tokens[0]) utils.Expect(t, "人口", outputs.Tokens[1]) utils.Expect(t, "2", len(outputs.Docs)) utils.Expect(t, "0", outputs.Docs[0].DocId) utils.Expect(t, "76", int(outputs.Docs[0].Scores[0]*1000)) utils.Expect(t, "[0 18]", outputs.Docs[0].TokenSnippetLocations) utils.Expect(t, "4", outputs.Docs[1].DocId) utils.Expect(t, "100", int(outputs.Docs[1].Scores[0]*1000)) utils.Expect(t, "[0 15]", outputs.Docs[1].TokenSnippetLocations) }
func TestLookupWithLocations(t *testing.T) { var indexer Indexer indexer.Init(types.IndexerInitOptions{IndexType: types.LocationsIndex}) // doc0 = "token2 token4 token4 token2 token3 token4" indexer.AddDocument(&types.DocumentIndex{ DocId: 0, Keywords: []types.KeywordIndex{ {"token2", 0, []int{0, 21}}, {"token3", 0, []int{28}}, {"token4", 0, []int{7, 14, 35}}, }, }) docs, _ := indexer.Lookup([]string{"token2", "token3"}, []string{}, nil, false) utils.Expect(t, "[[0 21] [28]]", docs[0].TokenLocations) }
func TestLookupWithinDocIds(t *testing.T) { var indexer Indexer indexer.Init(types.IndexerInitOptions{IndexType: types.LocationsIndex}) // doc0 = "token2 token3" indexer.AddDocument(&types.DocumentIndex{ DocId: 0, Keywords: []types.KeywordIndex{ {"token2", 0, []int{0}}, {"token3", 0, []int{7}}, }, }) // doc1 = "token1 token2 token3" indexer.AddDocument(&types.DocumentIndex{ DocId: 1, Keywords: []types.KeywordIndex{ {"token1", 0, []int{0}}, {"token2", 0, []int{7}}, {"token3", 0, []int{14}}, }, }) // doc2 = "token1 token2" indexer.AddDocument(&types.DocumentIndex{ DocId: 2, Keywords: []types.KeywordIndex{ {"token1", 0, []int{0}}, {"token2", 0, []int{7}}, }, }) // doc3 = "token2" indexer.AddDocument(&types.DocumentIndex{ DocId: 3, Keywords: []types.KeywordIndex{ {"token2", 0, []int{0}}, }, }) docIds := make(map[uint64]bool) docIds[0] = true docIds[2] = true utils.Expect(t, "[2 0 [7]] [0 0 [0]] ", indexedDocsToString(indexer.Lookup([]string{"token2"}, []string{}, docIds, false))) }
func TestEngineIndexDocument(t *testing.T) { var engine Engine engine.Init(types.EngineInitOptions{ SegmenterDictionaries: "../testdata/test_dict.txt", DefaultRankOptions: &types.RankOptions{ OutputOffset: 0, MaxOutputs: 10, ScoringCriteria: &RankByTokenProximity{}, }, IndexerInitOptions: &types.IndexerInitOptions{ IndexType: types.LocationsIndex, }, }) AddDocs(&engine) outputs := engine.Search(types.SearchRequest{Text: "中国人口"}) utils.Expect(t, "2", len(outputs.Tokens)) utils.Expect(t, "中国", outputs.Tokens[0]) utils.Expect(t, "人口", outputs.Tokens[1]) utils.Expect(t, "3", len(outputs.Docs)) utils.Expect(t, "1", outputs.Docs[0].DocId) utils.Expect(t, "1000", int(outputs.Docs[0].Scores[0]*1000)) utils.Expect(t, "[0 6]", outputs.Docs[0].TokenSnippetLocations) utils.Expect(t, "4", outputs.Docs[1].DocId) utils.Expect(t, "100", int(outputs.Docs[1].Scores[0]*1000)) utils.Expect(t, "[0 15]", outputs.Docs[1].TokenSnippetLocations) utils.Expect(t, "0", outputs.Docs[2].DocId) utils.Expect(t, "76", int(outputs.Docs[2].Scores[0]*1000)) utils.Expect(t, "[0 18]", outputs.Docs[2].TokenSnippetLocations) }
func TestEngineIndexDocumentWithTokens(t *testing.T) { var engine Engine engine.Init(types.EngineInitOptions{ SegmenterDictionaries: "../testdata/test_dict.txt", DefaultRankOptions: &types.RankOptions{ OutputOffset: 0, MaxOutputs: 10, ScoringCriteria: &RankByTokenProximity{}, }, IndexerInitOptions: &types.IndexerInitOptions{ IndexType: types.LocationsIndex, }, }) docId := uint64(0) engine.IndexDocument(docId, types.DocumentIndexData{ Content: "", Tokens: []types.TokenData{ {"中国", []int{0}}, {"人口", []int{18, 24}}, }, Fields: ScoringFields{1, 2, 3}, }) docId++ engine.IndexDocument(docId, types.DocumentIndexData{ Content: "", Tokens: []types.TokenData{ {"中国", []int{0}}, {"人口", []int{6}}, }, Fields: ScoringFields{1, 2, 3}, }) docId++ engine.IndexDocument(docId, types.DocumentIndexData{ Content: "中国十三亿人口", Fields: ScoringFields{0, 9, 1}, }) engine.FlushIndex() outputs := engine.Search(types.SearchRequest{Text: "中国人口"}) utils.Expect(t, "2", len(outputs.Tokens)) utils.Expect(t, "中国", outputs.Tokens[0]) utils.Expect(t, "人口", outputs.Tokens[1]) utils.Expect(t, "3", len(outputs.Docs)) utils.Expect(t, "1", outputs.Docs[0].DocId) utils.Expect(t, "1000", int(outputs.Docs[0].Scores[0]*1000)) utils.Expect(t, "[0 6]", outputs.Docs[0].TokenSnippetLocations) utils.Expect(t, "2", outputs.Docs[1].DocId) utils.Expect(t, "100", int(outputs.Docs[1].Scores[0]*1000)) utils.Expect(t, "[0 15]", outputs.Docs[1].TokenSnippetLocations) utils.Expect(t, "0", outputs.Docs[2].DocId) utils.Expect(t, "76", int(outputs.Docs[2].Scores[0]*1000)) utils.Expect(t, "[0 18]", outputs.Docs[2].TokenSnippetLocations) }
func TestLookup(t *testing.T) { var indexer Indexer indexer.Init(types.IndexerInitOptions{IndexType: types.LocationsIndex}) // doc0 = "token2 token3" indexer.AddDocument(&types.DocumentIndex{ DocId: 0, Keywords: []types.KeywordIndex{ {"token2", 0, []int{0}}, {"token3", 0, []int{7}}, }, }) // doc1 = "token1 token2 token3" indexer.AddDocument(&types.DocumentIndex{ DocId: 1, Keywords: []types.KeywordIndex{ {"token1", 0, []int{0}}, {"token2", 0, []int{7}}, {"token3", 0, []int{14}}, }, }) // doc2 = "token1 token2" indexer.AddDocument(&types.DocumentIndex{ DocId: 2, Keywords: []types.KeywordIndex{ {"token1", 0, []int{0}}, {"token2", 0, []int{7}}, }, }) // doc3 = "token2" indexer.AddDocument(&types.DocumentIndex{ DocId: 3, Keywords: []types.KeywordIndex{ {"token2", 0, []int{0}}, }, }) // doc7 = "token1 token3" indexer.AddDocument(&types.DocumentIndex{ DocId: 7, Keywords: []types.KeywordIndex{ {"token1", 0, []int{0}}, {"token3", 0, []int{7}}, }, }) // doc9 = "token3" indexer.AddDocument(&types.DocumentIndex{ DocId: 9, Keywords: []types.KeywordIndex{ {"token3", 0, []int{0}}, }, }) utils.Expect(t, "1 2 7 ", indicesToString(&indexer, "token1")) utils.Expect(t, "0 1 2 3 ", indicesToString(&indexer, "token2")) utils.Expect(t, "0 1 7 9 ", indicesToString(&indexer, "token3")) utils.Expect(t, "", indexedDocsToString(indexer.Lookup([]string{"token4"}, []string{}, nil, false))) utils.Expect(t, "[7 0 [0]] [2 0 [0]] [1 0 [0]] ", indexedDocsToString(indexer.Lookup([]string{"token1"}, []string{}, nil, false))) utils.Expect(t, "", indexedDocsToString(indexer.Lookup([]string{"token1", "token4"}, []string{}, nil, false))) utils.Expect(t, "[2 1 [0 7]] [1 1 [0 7]] ", indexedDocsToString(indexer.Lookup([]string{"token1", "token2"}, []string{}, nil, false))) utils.Expect(t, "[2 13 [7 0]] [1 13 [7 0]] ", indexedDocsToString(indexer.Lookup([]string{"token2", "token1"}, []string{}, nil, false))) utils.Expect(t, "[7 1 [0 7]] [1 8 [0 14]] ", indexedDocsToString(indexer.Lookup([]string{"token1", "token3"}, []string{}, nil, false))) utils.Expect(t, "[7 13 [7 0]] [1 20 [14 0]] ", indexedDocsToString(indexer.Lookup([]string{"token3", "token1"}, []string{}, nil, false))) utils.Expect(t, "[1 1 [7 14]] [0 1 [0 7]] ", indexedDocsToString(indexer.Lookup([]string{"token2", "token3"}, []string{}, nil, false))) utils.Expect(t, "[1 13 [14 7]] [0 13 [7 0]] ", indexedDocsToString(indexer.Lookup([]string{"token3", "token2"}, []string{}, nil, false))) utils.Expect(t, "[1 2 [0 7 14]] ", indexedDocsToString(indexer.Lookup([]string{"token1", "token2", "token3"}, []string{}, nil, false))) utils.Expect(t, "[1 26 [14 7 0]] ", indexedDocsToString(indexer.Lookup([]string{"token3", "token2", "token1"}, []string{}, nil, false))) }