forked from mattn/go-haiku
/
haiku.go
128 lines (119 loc) · 2.5 KB
/
haiku.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
package haiku
import (
"regexp"
"github.com/ikawaha/kagome"
)
var (
	// reWord matches strings made solely of katakana (the ァ-ヾ range,
	// which also covers the iteration/prolonged-sound marks). Token
	// readings are presumably katakana in the dictionary used — a
	// non-match indicates punctuation or an unreadable token.
	reWord = regexp.MustCompile(`^[ァ-ヾ]+$`)
	// reIgnoreText matches bracket characters that are stripped from
	// the input before tokenizing.
	reIgnoreText = regexp.MustCompile(`[\[\]「」『』]`)
	// reIgnoreChar matches small katakana that combine with the
	// preceding character and therefore do not count as a separate mora.
	reIgnoreChar = regexp.MustCompile(`[ァィゥェォャュョ]`)
)
// isWord reports whether a token with feature list c can lead a phrase:
// its part of speech (c[0]) must be one of the independent word classes,
// and its sub-category (c[1]) must be neither non-independent ("非自立")
// nor a suffix ("接尾").
func isWord(c []string) bool {
	// Callers only guarantee len(c) > 0; guard the c[1] access below so a
	// short feature list (e.g. an unknown word) cannot panic.
	if len(c) < 2 {
		return false
	}
	for _, f := range []string{"名詞", "動詞", "形容詞", "形容動詞", "副詞", "連体詞", "接続詞", "感動詞", "接頭詞", "フィラー"} {
		if f == c[0] && c[1] != "非自立" && c[1] != "接尾" {
			return true
		}
	}
	return false
}
// countChars returns the number of morae in the reading s: every rune is
// counted except the small katakana that merge with the preceding one.
func countChars(s string) int {
	stripped := reIgnoreChar.ReplaceAllString(s, "")
	morae := 0
	for range stripped { // ranging a string yields runes, not bytes
		morae++
	}
	return morae
}
// Match reports whether text consists of phrases whose mora counts are
// exactly rule — e.g. rule = []int{5, 7, 5} checks for a haiku.
func Match(text string, rule []int) bool {
	t := kagome.NewTokenizer()
	// Brackets are decoration; drop them before tokenizing.
	text = reIgnoreText.ReplaceAllString(text, "")
	tokens := t.Tokenize(text)
	pos := 0 // index of the phrase currently being filled
	// Work on a copy so the caller's rule slice is never modified.
	r := make([]int, len(rule))
	copy(r, rule)
	for i := 0; i < len(tokens); i++ {
		tok := tokens[i]
		c := tok.Features()
		if len(c) == 0 {
			// Feature-less tokens (presumably BOS/EOS markers) carry
			// no sound.
			continue
		}
		// The last feature is taken as the token's katakana reading
		// (IPA dictionary layout — TODO confirm for the dictionary in
		// use).
		y := c[len(c)-1]
		if !reWord.MatchString(y) {
			// Not a readable word. A Japanese comma is tolerated
			// anywhere; any other unreadable token breaks the match.
			if y == "、" {
				continue
			}
			return false
		}
		// A phrase that has consumed nothing yet must begin with an
		// independent word.
		if r[pos] == rule[pos] && !isWord(c) {
			return false
		}
		// Consume this token's morae from the current phrase.
		n := countChars(y)
		r[pos] -= n
		if r[pos] == 0 {
			pos++
			if pos == len(r) {
				// All phrases are exactly filled. Succeed only if
				// this is the last real token (the final token is
				// presumably kagome's EOS marker, hence -2).
				// Returning here also prevents the out-of-range
				// r[pos] access the old code hit when readable
				// tokens followed a completed match.
				return i == len(tokens)-2
			}
		}
	}
	// Either a phrase overflowed (r[pos] < 0) or the text ran out early.
	return false
}
// Find returns every substring of text whose phrases' mora counts are
// exactly rule — e.g. rule = []int{5, 7, 5} finds haiku embedded in text.
// Phrases within a returned sentence are joined with a single space.
func Find(text string, rule []int) []string {
	if len(rule) == 0 {
		return nil
	}
	t := kagome.NewTokenizer()
	// Brackets are decoration; drop them before tokenizing.
	text = reIgnoreText.ReplaceAllString(text, "")
	tokens := t.Tokenize(text)
	pos := 0 // index of the phrase currently being filled
	// r counts down the morae still needed per phrase; refilled from
	// rule whenever matching restarts.
	r := make([]int, len(rule))
	copy(r, rule)
	sentence := "" // surface text collected for the current candidate
	start := 0     // token index where the current candidate began
	ret := []string{}
	for i := 0; i < len(tokens); i++ {
		tok := tokens[i]
		c := tok.Features()
		if len(c) == 0 {
			// Feature-less tokens (presumably BOS/EOS markers) carry
			// no sound.
			continue
		}
		// The last feature is taken as the token's katakana reading
		// (IPA dictionary layout — TODO confirm for the dictionary in
		// use).
		y := c[len(c)-1]
		if !reWord.MatchString(y) {
			if y == "、" {
				continue
			}
			// Unreadable token: abandon the current candidate and
			// restart matching from scratch.
			pos = 0
			sentence = ""
			copy(r, rule)
			continue
		}
		// Each phrase must begin with an independent word; otherwise
		// abandon the candidate and restart.
		if r[pos] == rule[pos] && !isWord(c) {
			pos = 0
			sentence = ""
			copy(r, rule)
			continue
		}
		n := countChars(y)
		r[pos] -= n
		sentence += tok.Surface
		if r[pos] == 0 {
			// Phrase exactly filled; move on to the next one.
			pos++
			if pos >= len(r) {
				// All phrases matched: record the candidate and
				// resume scanning right after it.
				ret = append(ret, sentence)
				start = i + 1
				pos = 0
				sentence = ""
				copy(r, rule)
				continue
			}
			sentence += " "
		} else if r[pos] < 0 {
			// Phrase overflowed: backtrack to just past the previous
			// start and try again from there.
			// NOTE(review): the loop's i++ means the next token
			// examined is start+2, so the token at the new start
			// appears to be skipped — verify this is intended.
			i = start + 1
			start++
			pos = 0
			sentence = ""
			copy(r, rule)
		}
	}
	return ret
}