Example #1
// search returns the document IDs ranked within [start, end] along with the total number of matching documents
func search(searchString string, start, end int) (result []int, total int) {
	// First split the search string on spaces into sentences, dropping empty ones
	var sentences []string
	texts := strings.Split(searchString, " ")
	for _, sen := range texts {
		if sen != "" {
			sentences = append(sentences, sen)
		}
	}

	var (
		tempResult       []int               // intermediate result
		words            []string            // keywords extracted from the search string
		searchResultInfo []*SearchResultInfo // used to sort the results by tfidfs
	)

	// Segment each sentence into words; union the ID sets within a sentence, then intersect the sets across sentences
	var outidsets [][]int
	for _, sen := range sentences {
		ws := participleutil.Participle(sen)
		var inidsets [][]int

		for _, w := range ws {
			var ids []int
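			// Gather the IDs of all documents whose index entries contain the word w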
			for _, v := range wordMapIndexInfo[w] {
				ids = append(ids, v.id)
			}
			inidsets = append(inidsets, ids)
			words = append(words, w)
		}

		outidsets = append(outidsets, union(inidsets)) // union of the ID sets within this sentence
	}
	tempResult = inter(outidsets) // intersection of the ID sets across sentences

	// Sort tempResult by TF-IDF score
	words = clearRepeat(words) // deduplicate the keywords
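	// Score each candidate document by summing the TF-IDF of every query keyword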
	for _, id := range tempResult {
		var tfidfs float32 = 0.0
		for _, w := range words {
			tfidfs += getTfIdfByWordId(w, id)
		}
		searchResultInfo = append(searchResultInfo, &SearchResultInfo{id: id, tfidfs: tfidfs})
	}
	sort.Sort(ByTfIdfs(searchResultInfo))

	// Take the IDs ranked within [start, end] as the result
	if start < 0 {
		start = 0
	}
	if end >= len(searchResultInfo) {
		end = len(searchResultInfo) - 1
	}
	for i := start; i <= end; i++ {
		result = append(result, searchResultInfo[i].id)
	}
	total = len(searchResultInfo)

	return result, total
}
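The set helpers union, inter and clearRepeat called by search are not shown in this example. The sketch below is one possible implementation inferred from the call sites; treat the signatures and behavior as assumptions rather than the original code.

// Sketch of the set helpers assumed by search (not the original code).
// union merges several ID sets into one deduplicated slice, inter keeps only
// the IDs present in every set, and clearRepeat removes duplicate keywords.
func union(sets [][]int) []int {
	seen := make(map[int]bool)
	var out []int
	for _, set := range sets {
		for _, id := range set {
			if !seen[id] {
				seen[id] = true
				out = append(out, id)
			}
		}
	}
	return out
}

func inter(sets [][]int) []int {
	if len(sets) == 0 {
		return nil
	}
	// Count in how many sets each ID appears; this assumes each set is already
	// deduplicated, which holds here because every set comes out of union.
	count := make(map[int]int)
	for _, set := range sets {
		for _, id := range set {
			count[id]++
		}
	}
	var out []int
	for _, id := range sets[0] {
		if count[id] == len(sets) {
			out = append(out, id)
		}
	}
	return out
}

func clearRepeat(words []string) []string {
	seen := make(map[string]bool)
	var out []string
	for _, w := range words {
		if !seen[w] {
			seen[w] = true
			out = append(out, w)
		}
	}
	return out
}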
Example #2
// Compute the inverse document frequency: for every word, count how many documents contain it
func calculateIDF() {
	start := time.Now()

	db, err := sql.Open("sqlite3", conf.ExtractUrlDataPath)
	if err != nil {
		log.Fatal(err, "\r\n")
	}
	defer db.Close()

	rows, err := db.Query("select * from data")
	if err != nil {
		log.Fatal(err, "\r\n")
	}
	defer rows.Close()

	var md5 string
	for rows.Next() {
		rows.Scan(&md5)
		// Count this document
		numFile++
		fmt.Println("numFile:", numFile)

		// Read the body text of the document
		content, _ := ioutil.ReadFile(conf.ExtractWebpagePath + md5 + "_body.txt")
		// Segment the body into words
		ss := participleutil.Participle(string(content))
		// Deduplicate the words of this document
		m := make(map[string]bool)
		for _, v := range ss {
			m[v] = true
		}
		// Record the document frequency of each word
		for k := range m {
			wordMap[k]++
		}
	}

	fmt.Println("calculateIDF used time:", time.Since(start))
}
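calculateIDF only accumulates, in wordMap, the number of documents that contain each word. The wordIDF values consumed by calculateTFIDF below still have to be derived from those counts, typically as idf = log(N / df). The sketch below shows that step; the helper name buildWordIDF, the math import and the exact formula are assumptions, not part of the original code.

// Sketch of deriving IDF values from the document frequencies collected above
// (assumed step, not shown in the original code): idf = log(N / df), where N
// is the total number of documents and df is the number of documents that
// contain the word.
func buildWordIDF() {
	for w, df := range wordMap {
		wordIDF[w] = float32(math.Log(float64(numFile) / float64(df)))
	}
}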
// Compute the TF-IDF value of every (word, document) pair
func calculateTFIDF() {
	start := time.Now()
	// Read the document records from the database
	db, err := sql.Open("sqlite3", conf.ExtractUrlDataPath)
	if err != nil {
		log.Fatal(err, "\r\n")
	}
	defer db.Close()

	rows, err := db.Query("select * from data")
	if err != nil {
		log.Fatal(err, "\r\n")
	}
	defer rows.Close()

	var md5 string
	for rows.Next() {
		rows.Scan(&md5)
		// Count this document
		numFile++
		fmt.Println("numFile:", numFile)

		// Read the body text of the document
		content, _ := ioutil.ReadFile(conf.ExtractWebpagePath + md5 + "_body.txt")
		// Segment the body into words
		ss := participleutil.Participle(string(content))
		totalWord := len(ss) // total number of words in the document
		// Count how many times each word appears in this document
		m := make(map[string]int)
		for _, v := range ss {
			m[v]++
		}

		// Read the title of the document; title words get extra weight
		content, _ = ioutil.ReadFile(conf.ExtractWebpagePath + md5 + "_title.txt")
		ss = participleutil.Participle(string(content))
		for _, v := range ss {
			m[v] += 5
		}

		docid := docID[md5] // document ID

		for k, v := range m {
			tf := float32(v) / float32(totalWord) // term frequency
			idf := wordIDF[k]                     // inverse document frequency
			words = append(words, k)              // keyword
			docids = append(docids, docid)        // document ID
			tfidfs = append(tfidfs, tf*idf)       // TF-IDF value
			numWord++
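			// Flush to the database in batches to keep memory bounded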
			if numWord%2000000 == 0 {
				writeDatabase()
				words = []string{}
				docids = []int{}
				tfidfs = []float32{}
			}
		}
	}

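	// Flush the remaining partial batch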
	writeDatabase()
	fmt.Println("calculateTFIDF used time:", time.Since(start))
}
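writeDatabase flushes the accumulated words, docids and tfidfs slices to disk but is not shown in this example. The sketch below illustrates one way it could batch-insert the triples into SQLite inside a single transaction; the database path conf.TfIdfDataPath, the table name tfidf and its columns are hypothetical.

// Sketch of a possible writeDatabase (assumed, not the original code):
// insert the accumulated (word, docid, tfidf) triples in one transaction.
func writeDatabase() {
	db, err := sql.Open("sqlite3", conf.TfIdfDataPath) // hypothetical path
	if err != nil {
		log.Fatal(err)
	}
	defer db.Close()

	tx, err := db.Begin()
	if err != nil {
		log.Fatal(err)
	}
	stmt, err := tx.Prepare("insert into tfidf(word, docid, tfidf) values(?, ?, ?)") // hypothetical schema
	if err != nil {
		log.Fatal(err)
	}
	defer stmt.Close()

	for i := range words {
		if _, err := stmt.Exec(words[i], docids[i], tfidfs[i]); err != nil {
			log.Fatal(err)
		}
	}
	if err := tx.Commit(); err != nil {
		log.Fatal(err)
	}
}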