/
populate.go
117 lines (98 loc) · 2.26 KB
/
populate.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
package main
import (
"database/sql"
"encoding/csv"
"io"
"log"
"os"
_ "slowteetoe.com/ngrams/Godeps/_workspace/src/gopkg.in/cq.v1"
"strconv"
"strings"
)
// prepareInsert prepares, within tx, the Cypher statement used to store one
// n-gram. The statement takes three positional parameters:
//
//	{0} phrase of the Ngram node, which is merged (matched or created)
//	{1} word of the NextWord node
//	{2} value stored as property p on the PRECEDED relationship
//
// A failure to prepare the statement is fatal: it is logged and the program
// exits.
func prepareInsert(tx *sql.Tx) *sql.Stmt {
	const query = `merge(n:Ngram { phrase: {0} } ) create unique (n)-[:PRECEDED {p: {2} }]->(nw:NextWord { word: {1} } )`

	stmt, prepErr := tx.Prepare(query)
	if prepErr != nil {
		log.Fatal(prepErr)
	}
	return stmt
}
// ngram is one parsed CSV row: a phrase, the word that follows it, and the
// probability read for that continuation.
type ngram struct {
	phrase   string  // everything in the first CSV field before its last space
	nextWord string  // everything in the first CSV field after its last space
	prob     float64 // third CSV field parsed as a 64-bit float
}

// parseCSVRecord converts one CSV record laid out as
// (phrase, freq, probability) into an ngram.
//
// The first field is split at its last space: the text before the space
// becomes the phrase and the text after it becomes the next word. The second
// field (freq) is not used. The third field is parsed as a float64.
//
// Malformed input is fatal: a record with fewer than three fields, a first
// field containing no space, or an unparsable probability is logged and the
// program exits.
func parseCSVRecord(record []string) ngram {
	// Guard the indexing below so a short record produces a diagnostic
	// instead of an index-out-of-range panic.
	if len(record) < 3 {
		log.Fatalf("Expected at least 3 fields (phrase, freq, probability) but got %d in %q", len(record), record)
	}

	text := record[0]
	i := strings.LastIndex(text, " ")
	if i == -1 {
		log.Fatalf("Could not find a space in %v, this seems likely to be a fatal error", record[0])
	}

	prob, err := strconv.ParseFloat(record[2], 64)
	if err != nil {
		log.Fatal(err)
	}
	return ngram{phrase: text[:i], nextWord: text[i+1:], prob: prob}
}
// Before loading, don't forget to run these Cypher statements against the
// database (the first deletes every node and relationship, the second creates
// an index on the phrase property of Ngram nodes):
//
//	MATCH (n) OPTIONAL MATCH (n)-[r]-() DELETE n,r
//	create index on :Ngram(phrase)

// main opens the database through the "neo4j-cypher" driver and loads every
// CSV file in the hard-coded file list into it. Any database or parse error
// is fatal.
func main() {
	// NOTE(review): the connection URL, including credentials, is hard-coded.
	db, err := sql.Open("neo4j-cypher", "http://neo4j:n304j@localhost:7474")
	if err != nil {
		log.Fatal(err)
	}
	defer db.Close()

	files := []string{"output.csv"}
	for _, f := range files {
		loadFile(db, f)
	}
}

// loadFile reads the CSV file at path and executes one insert per record,
// committing the transaction and starting a new one after every batchSize
// records, plus a final commit for the remainder.
//
// A file that cannot be opened is logged and skipped. Every other error is
// fatal. Keeping this work in its own function means the file is closed as
// soon as its load finishes rather than when main returns.
func loadFile(db *sql.DB, path string) {
	file, err := os.Open(path)
	if err != nil {
		log.Printf("Skipping %v due to %v\n", path, err)
		return
	}
	defer file.Close()

	const batchSize = 5000

	tx, err := db.Begin()
	if err != nil {
		log.Fatal(err)
	}
	stmt := prepareInsert(tx)

	reader := csv.NewReader(file)
	lineCount := 0
	for {
		record, err := reader.Read()
		if err == io.EOF {
			break
		}
		if err != nil {
			log.Fatal(err)
		}

		row := parseCSVRecord(record)
		if _, err := stmt.Exec(row.phrase, row.nextWord, row.prob); err != nil {
			log.Fatal(err)
		}

		lineCount++
		if lineCount%batchSize != 0 {
			continue
		}

		log.Printf("Committing %v entries and starting a new transaction, current total is: %v\n", batchSize, lineCount)
		commitBatch(tx, stmt)

		// A prepared statement belongs to its transaction, so each new
		// transaction needs a freshly prepared one.
		if tx, err = db.Begin(); err != nil {
			log.Fatal(err)
		}
		stmt = prepareInsert(tx)
	}

	// Commit whatever is left in the last, possibly partial, batch.
	commitBatch(tx, stmt)
	log.Printf("Recorded %v records\n", lineCount)
}

// commitBatch commits tx and then closes stmt, the statement prepared on it,
// so every batch's statement is closed rather than only the first one.
// A failed commit is fatal; a failed close is only logged.
func commitBatch(tx *sql.Tx, stmt *sql.Stmt) {
	if err := tx.Commit(); err != nil {
		log.Fatal(err)
	}
	if err := stmt.Close(); err != nil {
		log.Printf("closing prepared statement: %v", err)
	}
}