forked from peterhellberg/tpb-search
/
index.go
129 lines (104 loc) · 2.58 KB
/
index.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
package main
import (
"compress/gzip"
"encoding/csv"
"fmt"
"io"
"log"
"os"
"strconv"
"time"
"github.com/blevesearch/bleve"
)
// buildIndexMapping constructs the bleve index mapping for torrent
// documents: the "category" field is indexed verbatim with the keyword
// analyzer, while all other text fields use the standard analyzer and
// unqualified queries default to the "name" field.
func buildIndexMapping() *bleve.IndexMapping {
	// Keyword analyzer indexes the whole value as a single term,
	// so categories match exactly rather than being tokenized.
	exactMatch := bleve.NewTextFieldMapping()
	exactMatch.Analyzer = "keyword"

	torrent := bleve.NewDocumentMapping()
	torrent.AddFieldMappingsAt("category", exactMatch)

	mapping := bleve.NewIndexMapping()
	mapping.TypeField = "type"
	mapping.DefaultType = "torrent"
	mapping.DefaultAnalyzer = "standard"
	mapping.DefaultField = "name"
	mapping.AddDocumentMapping("torrent", torrent)
	return mapping
}
// tpbDoc is the indexed form of one row of the TPB dump: one torrent
// per document, typed "torrent" so it picks up the document mapping
// registered under that name.
type tpbDoc struct {
Name string `json:"name"`         // torrent name (dump column 0)
Size int64 `json:"size"`          // size in bytes, parsed from dump column 1; 0 when unparsable
Hash string `json:"hash"`         // info hash (dump column 2), also used as the document ID
Category string `json:"category"` // category label (dump column 4), keyword-analyzed
Type string `json:"type"`         // always "torrent"; drives bleve's TypeField dispatch
}
// indexTPB streams the gzipped, pipe-separated TPB dump named by the
// *dump flag and indexes every row into i in batches of *batchSize
// documents. Malformed CSV rows are skipped; an unparsable size column
// is logged and indexed as 0. Indexing stops early once *indexLimit
// documents have been indexed (when *indexLimit > 0). Returns the
// first I/O or indexing error encountered.
func indexTPB(i bleve.Index) error {
	gzDumpFile, err := os.Open(*dump)
	if err != nil {
		return fmt.Errorf("opening dump %q: %w", *dump, err)
	}
	defer gzDumpFile.Close()

	dumpFile, err := gzip.NewReader(gzDumpFile)
	if err != nil {
		return fmt.Errorf("reading gzip header of %q: %w", *dump, err)
	}
	// BUG FIX: the gzip reader was never closed, so a trailing CRC
	// error in the stream could go undetected.
	defer dumpFile.Close()

	reader := csv.NewReader(dumpFile)
	reader.FieldsPerRecord = 7
	reader.Comma = '|'

	batch := bleve.NewBatch()
	batchCount := 0
	count := 0
	startTime := time.Now()
	log.Printf("Indexing...")
	for {
		r, err := reader.Read()
		if err == io.EOF {
			break
		} else if err != nil {
			// Deliberate best-effort: skip malformed rows rather than
			// aborting the whole multi-million-row import.
			continue
		}
		// bitSize 64 matches the int64 Size field (was 0 == int).
		size, err := strconv.ParseInt(r[1], 10, 64)
		if err != nil {
			// BUG FIX: was fmt.Println("%#v", size) — Println does not
			// interpret format verbs, and size is zero-valued on error;
			// log the offending input instead.
			log.Printf("unparsable size %q: %v", r[1], err)
			size = 0
		}
		// BUG FIX: the batch.Index error was silently discarded.
		if err := batch.Index(r[2], tpbDoc{
			Name:     r[0],
			Size:     size,
			Hash:     r[2],
			Category: r[4],
			Type:     "torrent",
		}); err != nil {
			return fmt.Errorf("batching document %q: %w", r[2], err)
		}
		batchCount++
		if batchCount >= *batchSize {
			if err := i.Batch(batch); err != nil {
				return err
			}
			batch = bleve.NewBatch()
			batchCount = 0
		}
		count++
		if count%1000 == 0 {
			logIndexProgress("Indexed", count, startTime)
		}
		if *indexLimit > 0 && count >= *indexLimit {
			break
		}
	}
	// Flush the last, partially filled batch.
	if batchCount > 0 {
		// BUG FIX: was log.Fatal(err), which killed the process and
		// bypassed the caller's error handling; return it like every
		// other error path in this function.
		if err := i.Batch(batch); err != nil {
			return err
		}
	}
	logIndexProgress("Finished indexing", count, startTime)
	// BUG FIX: bind is a flag pointer (cf. *dump, *batchSize above);
	// without the dereference %v printed the pointer address, not the
	// address string. NOTE(review): confirm bind is *string at its
	// declaration elsewhere in this file.
	log.Printf("Still listening on http://%v", *bind)
	return nil
}

// logIndexProgress logs how many documents have been indexed since
// start, with elapsed seconds and the average time per document.
func logIndexProgress(prefix string, count int, start time.Time) {
	elapsed := time.Since(start)
	seconds := float64(elapsed) / float64(time.Second)
	perDoc := 0.0
	if count > 0 { // guard the count==0 division (would log NaN)
		perDoc = float64(elapsed) / float64(count) / float64(time.Millisecond)
	}
	log.Printf("%s %d documents in %.2fs (average %.2fms/doc)", prefix, count, seconds, perDoc)
}