// 搜索,返回 [start, end] 之间的结果(文档id集合)以及搜索到的文档总数 func search(searchString string, start, end int) (result []int, total int) { // 先按空格将搜索串分成多个句子,并过滤掉空句子 var sentences []string texts := strings.Split(searchString, " ") for _, sen := range texts { if sen != "" { sentences = append(sentences, sen) } } var ( tempResult []int // 临时结果 words []string // 搜索串的关键词集合 searchResultInfo []*SearchResultInfo // 用来根据tfidfs排序 ) // 对每个句子进行分词,句子内对每个词的id集合求并集,句子间对id集合求交集 var outidsets [][]int for _, sen := range sentences { ws := participleutil.Participle(sen) var inidsets [][]int for _, w := range ws { var ids []int for _, v := range wordMapIndexInfo[w] { ids = append(ids, v.id) } inidsets = append(inidsets, ids) words = append(words, w) } outidsets = append(outidsets, union(inidsets)) // 对句内的集合求并集 } tempResult = inter(outidsets) // 对句间的集合求交集 // 对tempResult进行排序 words = clearRepeat(words) // 去重 for _, id := range tempResult { var tfidfs float32 = 0.0 for _, w := range words { tfidfs += getTfIdfByWordId(w, id) } searchResultInfo = append(searchResultInfo, &SearchResultInfo{id: id, tfidfs: tfidfs}) } sort.Sort(ByTfIdfs(searchResultInfo)) // 选取 [start, end] 之间的id作为结果 if start < 0 { start = 0 } if end >= len(searchResultInfo) { end = len(searchResultInfo) - 1 } for i := start; i <= end; i++ { result = append(result, searchResultInfo[i].id) } total = len(searchResultInfo) return result, total }
// 计算逆文档频率 func calculateIDF() { start := time.Now() db, err := sql.Open("sqlite3", conf.ExtractUrlDataPath) if err != nil { log.Fatal(err, "\r\n") } defer db.Close() rows, err := db.Query("select * from data") if err != nil { log.Fatal(err, "\r\n") } defer rows.Close() var md5 string for rows.Next() { rows.Scan(&md5) // 计数 numFile++ fmt.Println("numFile:", numFile) // 读取文档 content, _ := ioutil.ReadFile(conf.ExtractWebpagePath + md5 + "_body.txt") // 得到分词结果 ss := participleutil.Participle(string(content)) // 去重 m := make(map[string]bool) for _, v := range ss { if !m[v] { m[v] = true } } // 保存结果 for k, _ := range m { wordMap[k]++ } } fmt.Println("calculateIDF used time:", time.Since(start)) }
// 计算TF-IDF func calculateTFIDF() { start := time.Now() // 读取文档数据 db, err := sql.Open("sqlite3", conf.ExtractUrlDataPath) if err != nil { log.Fatal(err, "\r\n") } defer db.Close() rows, err := db.Query("select * from data") if err != nil { log.Fatal(err, "\r\n") } defer rows.Close() var md5 string for rows.Next() { rows.Scan(&md5) // 计数 numFile++ fmt.Println("numFile:", numFile) // 读取正文文档 content, _ := ioutil.ReadFile(conf.ExtractWebpagePath + md5 + "_body.txt") // 得到分词结果 ss := participleutil.Participle(string(content)) totalWord := len(ss) // 文档的总词数 // 统计每个词在这篇文档中出现的次数 m := make(map[string]int) for _, v := range ss { m[v]++ } // 读取标题文档 content, _ = ioutil.ReadFile(conf.ExtractWebpagePath + md5 + "_title.txt") ss = participleutil.Participle(string(content)) for _, v := range ss { m[v] += 5 } docid := docID[md5] // 文档ID for k, v := range m { tf := float32(float32(v) / float32(totalWord)) // 词频 idf := wordIDF[k] // 逆文档频率 words = append(words, k) // 关键词 docids = append(docids, docid) // 文档标号 tfidfs = append(tfidfs, tf*idf) // TF-IDF值 numWord++ if numWord%2000000 == 0 { writeDatabase() words = []string{} docids = []int{} tfidfs = []float32{} } } } writeDatabase() fmt.Println("calculateTFIDF used time:", time.Since(start)) }