Пример #1
0
func main() {
	// 解析命令行参数
	flag.Parse()
	searchQueries = strings.Split(*queries, ",")
	log.Printf("待搜索的关键词为\"%s\"", searchQueries)

	// 初始化
	tBeginInit := time.Now()
	searcher.Init(types.EngineInitOptions{
		SegmenterDictionaries: *dictionaries,
		StopTokenFile:         *stop_token_file,
		IndexerInitOptions: &types.IndexerInitOptions{
			IndexType: *index_type,
		},
		NumShards:               NumShards,
		DefaultRankOptions:      &options,
		UsePersistentStorage:    *use_persistent,
		PersistentStorageFolder: *persistent_storage_folder,
		PersistentStorageShards: *persistent_storage_shards,
	})
	tEndInit := time.Now()
	defer searcher.Close()

	// 打开将要搜索的文件
	file, err := os.Open(*weibo_data)
	if err != nil {
		log.Fatal(err)
	}
	defer file.Close()

	// 逐行读入
	log.Printf("读入文本 %s", *weibo_data)
	scanner := bufio.NewScanner(file)
	lines := []string{}
	size := 0
	for scanner.Scan() {
		var text string
		data := strings.Split(scanner.Text(), "||||")
		if len(data) != 10 {
			continue
		}
		text = data[9]
		if text != "" {
			size += len(text) * (*num_repeat_text)
			lines = append(lines, text)
		}
	}
	log.Print("文件行数", len(lines))

	// 记录时间
	t0 := time.Now()

	// 建索引
	log.Print("建索引 ... ")
	docId := uint64(1)
	for i := 0; i < *num_repeat_text; i++ {
		for _, line := range lines {
			searcher.IndexDocument(docId, types.DocumentIndexData{
				Content: line})
			docId++
			if docId-docId/1000000*1000000 == 0 {
				log.Printf("已索引%d百万文档", docId/1000000)
				runtime.GC()
			}
		}
	}
	searcher.FlushIndex()
	log.Print("加入的索引总数", searcher.NumTokenIndexAdded())

	// 记录时间
	t1 := time.Now()
	log.Printf("建立索引花费时间 %v", t1.Sub(t0))
	log.Printf("建立索引速度每秒添加 %f 百万个索引",
		float64(searcher.NumTokenIndexAdded())/t1.Sub(t0).Seconds()/(1000000))

	// 写入内存profile文件
	if *memprofile != "" {
		f, err := os.Create(*memprofile)
		if err != nil {
			log.Fatal(err)
		}
		pprof.WriteHeapProfile(f)
		defer f.Close()
	}

	// 记录时间
	t2 := time.Now()

	// 打开处理器profile文件
	if *cpuprofile != "" {
		f, err := os.Create(*cpuprofile)
		if err != nil {
			log.Fatal(err)
		}
		pprof.StartCPUProfile(f)
		defer pprof.StopCPUProfile()
	}

	done := make(chan bool)
	for iThread := 0; iThread < numQueryThreads; iThread++ {
		go search(done)
	}
	for iThread := 0; iThread < numQueryThreads; iThread++ {
		<-done
	}

	// 停止处理器profile
	if *cpuprofile != "" {
		defer pprof.StopCPUProfile()
	}

	// 记录时间并计算分词速度
	t3 := time.Now()
	log.Printf("搜索平均响应时间 %v 毫秒",
		t3.Sub(t2).Seconds()*1000/float64(numRepeatQuery*len(searchQueries)))
	log.Printf("搜索吞吐量每秒 %v 次查询",
		float64(numRepeatQuery*numQueryThreads*len(searchQueries))/
			t3.Sub(t2).Seconds())

	if *use_persistent {
		searcher.Close()
		t4 := time.Now()
		searcher1 := engine.Engine{}
		searcher1.Init(types.EngineInitOptions{
			SegmenterDictionaries: *dictionaries,
			StopTokenFile:         *stop_token_file,
			IndexerInitOptions: &types.IndexerInitOptions{
				IndexType: *index_type,
			},
			NumShards:               NumShards,
			DefaultRankOptions:      &options,
			UsePersistentStorage:    *use_persistent,
			PersistentStorageFolder: *persistent_storage_folder,
			PersistentStorageShards: *persistent_storage_shards,
		})
		defer searcher1.Close()
		t5 := time.Now()
		t := t5.Sub(t4).Seconds() - tEndInit.Sub(tBeginInit).Seconds()
		log.Print("从持久存储加入的索引总数", searcher1.NumTokenIndexAdded())
		log.Printf("从持久存储建立索引花费时间 %v", t)
		log.Printf("从持久存储建立索引速度每秒添加 %f 百万个索引",
			float64(searcher1.NumTokenIndexAdded())/t/(1000000))

	}
	os.RemoveAll(*persistent_storage_folder)
}
Пример #2
0
func main() {
	// 解析命令行参数
	flag.Parse()
	searchQueries = strings.Split(*queries, ",")
	log.Printf("待搜索的关键词为\"%s\"", searchQueries)

	// 初始化
	tBeginInit := time.Now()
	searcher.Init(types.EngineInitOptions{
		SegmenterDictionaries: *dictionaries,
		StopTokenFile:         *stop_token_file,
		IndexerInitOptions: &types.IndexerInitOptions{
			IndexType: *index_type,
		},
		NumShards:               NumShards,
		DefaultRankOptions:      &options,
		UsePersistentStorage:    *use_persistent,
		PersistentStorageFolder: *persistent_storage_folder,
		PersistentStorageShards: *persistent_storage_shards,
	})
	tEndInit := time.Now()
	defer searcher.Close()

	// 打开将要搜索的文件
	file, err := os.Open(*weibo_data)
	if err != nil {
		log.Fatal(err)
	}
	defer file.Close()

	// 逐行读入
	log.Printf("读入文本 %s", *weibo_data)
	scanner := bufio.NewScanner(file)
	lines := []string{}
	size := 0
	for scanner.Scan() {
		var text string
		data := strings.Split(scanner.Text(), "||||")
		if len(data) != 10 {
			continue
		}
		text = data[9]
		if text != "" {
			size += len(text) * (*num_repeat_text)
			lines = append(lines, text)
		}
	}
	log.Print("文件行数", len(lines))

	// 记录时间
	t0 := time.Now()

	// 打开处理器profile文件
	if *cpuprofile != "" {
		f, err := os.Create(*cpuprofile)
		if err != nil {
			log.Fatal(err)
		}
		pprof.StartCPUProfile(f)
		defer pprof.StopCPUProfile()
	}

	// 建索引
	log.Print("建索引 ... ")
	// 打乱 docId 顺序进行测试,若 docId 最大值超 Int 则不能用 rand.Perm 方法
	docIds := rand.Perm(*num_repeat_text * len(lines))
	docIdx := 0
	for i := 0; i < *num_repeat_text; i++ {
		for _, line := range lines {
			searcher.IndexDocument(uint64(docIds[docIdx]+1), types.DocumentIndexData{
				Content: line}, false)
			docIdx++
			if docIdx-docIdx/1000000*1000000 == 0 {
				log.Printf("已索引%d百万文档", docIdx/1000000)
				runtime.GC()
			}
		}
	}
	searcher.FlushIndex()
	log.Print("加入的索引总数", searcher.NumTokenIndexAdded())

	// 记录时间
	t1 := time.Now()
	log.Printf("建立索引花费时间 %v", t1.Sub(t0))
	log.Printf("建立索引速度每秒添加 %f 百万个索引",
		float64(searcher.NumTokenIndexAdded())/t1.Sub(t0).Seconds()/(1000000))

	// 记录时间并计算删除索引时间
	t2 := time.Now()
	for i := 1; i <= *num_delete_docs; i++ {
		searcher.RemoveDocument(uint64(i), false)
	}
	searcher.FlushIndex()

	t3 := time.Now()
	log.Printf("删除 %d 条索引花费时间 %v", *num_delete_docs, t3.Sub(t2))

	// 手动做 GC 防止影响性能测试
	time.Sleep(time.Second)
	runtime.GC()

	// 写入内存profile文件
	if *memprofile != "" {
		f, err := os.Create(*memprofile)
		if err != nil {
			log.Fatal(err)
		}
		pprof.WriteHeapProfile(f)
		defer f.Close()
	}

	t4 := time.Now()
	done := make(chan bool)
	recordResponse := recordResponseLock{}
	recordResponse.count = make(map[string]int)
	for iThread := 0; iThread < numQueryThreads; iThread++ {
		go search(done, &recordResponse)
	}
	for iThread := 0; iThread < numQueryThreads; iThread++ {
		<-done
	}

	// 记录时间并计算分词速度
	t5 := time.Now()
	log.Printf("搜索平均响应时间 %v 毫秒",
		t5.Sub(t4).Seconds()*1000/float64(numRepeatQuery*len(searchQueries)))
	log.Printf("搜索吞吐量每秒 %v 次查询",
		float64(numRepeatQuery*numQueryThreads*len(searchQueries))/
			t5.Sub(t4).Seconds())

	// 测试搜索结果输出,因为不同 case 的 docId 对应不上,所以只测试总数
	recordResponse.RLock()
	for keyword, count := range recordResponse.count {
		log.Printf("关键词 [%s] 共搜索到 %d 个相关文档", keyword, count)
	}
	recordResponse.RUnlock()

	if *use_persistent {
		searcher.Close()
		t6 := time.Now()
		searcher1 := engine.Engine{}
		searcher1.Init(types.EngineInitOptions{
			SegmenterDictionaries: *dictionaries,
			StopTokenFile:         *stop_token_file,
			IndexerInitOptions: &types.IndexerInitOptions{
				IndexType: *index_type,
			},
			NumShards:               NumShards,
			DefaultRankOptions:      &options,
			UsePersistentStorage:    *use_persistent,
			PersistentStorageFolder: *persistent_storage_folder,
			PersistentStorageShards: *persistent_storage_shards,
		})
		defer searcher1.Close()
		t7 := time.Now()
		t := t7.Sub(t6).Seconds() - tEndInit.Sub(tBeginInit).Seconds()
		log.Print("从持久存储加入的索引总数", searcher1.NumTokenIndexAdded())
		log.Printf("从持久存储建立索引花费时间 %v 秒", t)
		log.Printf("从持久存储建立索引速度每秒添加 %f 百万个索引",
			float64(searcher1.NumTokenIndexAdded())/t/(1000000))

	}
	//os.RemoveAll(*persistent_storage_folder)
}