/
stemmer.go
75 lines (60 loc) · 1.5 KB
/
stemmer.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
package cobe
import (
"regexp"
"strings"
"unicode"
"bitbucket.org/tebeka/snowball"
"golang.org/x/text/transform"
"golang.org/x/text/unicode/norm"
)
// stemmer maps a token to its canonical (stemmed) form. It is the
// interface satisfied by the snowball stemmer and by cobeStemmer below.
type stemmer interface {
	Stem(word string) string
}
// cobeStemmer wraps a snowball stemmer in one that also stems smileys,
// so emoticon variants (":-)", ": )", "☺", ...) collapse to a single
// stem and are treated as equivalent tokens.
type cobeStemmer struct {
	sub    stemmer        // underlying snowball stemmer for word tokens
	words  *regexp.Regexp // matches any word character (\w)
	smiley *regexp.Regexp // ASCII and Unicode happy faces
	frowny *regexp.Regexp // ASCII and Unicode sad faces
}
// newCobeStemmer wraps the given snowball stemmer in a cobeStemmer that
// additionally recognizes ASCII and Unicode smiley/frowny tokens. The
// regular expressions are compiled once here; MustCompile panics only
// on a programmer error in the (constant) patterns.
func newCobeStemmer(s *snowball.Stemmer) *cobeStemmer {
	return &cobeStemmer{
		sub:    s,
		words:  regexp.MustCompile(`\w`),
		smiley: regexp.MustCompile(`:-?[ \)]*\)|☺|☺️`),
		frowny: regexp.MustCompile(`:-?[' \(]*\(|☹|😦`),
	}
}
// Stem maps token to its canonical form. Tokens containing at least one
// word character are lowercased, stripped of accents, and run through
// the wrapped snowball stemmer. Emoticon tokens collapse to ":)" or
// ":(". Anything else stems to the empty string.
func (cs *cobeStemmer) Stem(token string) string {
	switch {
	case cs.words.MatchString(token):
		// Word-like token: normalize case and accents, then snowball-stem.
		return cs.sub.Stem(stripAccents(strings.ToLower(token)))
	case cs.smiley.MatchString(token):
		return ":)"
	case cs.frowny.MatchString(token):
		return ":("
	default:
		return ""
	}
}
// stripAccents attempts to replace accented characters with an ASCII
// equivalent by decomposing to NFD, dropping combining marks, and
// recomposing to NFC. This is an extreme oversimplication, but since
// cobe only uses this to create token equivalence (these strings are
// never displayed) it gets a pass. If the transform fails, the input
// is returned unchanged.
func stripAccents(s string) string {
	if stripped, _, err := transform.String(stripT, s); err == nil {
		return stripped
	}
	// Best effort: fall back to the untouched input on error.
	return s
}
// stripT decomposes text (NFD), removes all nonspacing combining marks
// (Unicode category Mn), and recomposes (NFC) — approximating accent
// removal for stripAccents above.
//
// Initialized at declaration rather than in init(): the chain has no
// ordering dependencies, and avoiding init() keeps package setup
// explicit and side-effect free.
//
// NOTE(review): transform.RemoveFunc is deprecated in x/text; the
// modern replacement is runes.Remove(runes.In(unicode.Mn)), which
// would require importing golang.org/x/text/runes.
var stripT = transform.Chain(
	norm.NFD,
	transform.RemoveFunc(isMn),
	norm.NFC,
)
// isMn reports whether r is a nonspacing combining mark (Unicode
// category Mn) — the runes that stripT removes during decomposition.
func isMn(r rune) bool {
	return unicode.In(r, unicode.Mn)
}