forked from mateusg/shield
/
tokenizer.go
55 lines (43 loc) · 1.25 KB
/
tokenizer.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
package shield
import (
	"io/ioutil"
	"regexp"
	"strings"
	"unicode/utf8"
)
// enTokenizer tokenizes English text; it carries no state, so the
// empty struct is used purely as a method receiver.
type enTokenizer struct{}
// ptBRTokenizer tokenizes Brazilian-Portuguese text; it carries no
// state, so the empty struct is used purely as a method receiver.
type ptBRTokenizer struct{}
// NewEnglishTokenizer returns a Tokenizer configured for English text.
func NewEnglishTokenizer() Tokenizer {
	tokenizer := &enTokenizer{}
	return tokenizer
}
// NewPortugueseTokenizer returns a Tokenizer configured for
// Brazilian-Portuguese text.
func NewPortugueseTokenizer() Tokenizer {
	tokenizer := &ptBRTokenizer{}
	return tokenizer
}
func PerformTokenization(text string, splitToken *regexp.Regexp) (words map[string]int64) {
words = make(map[string]int64)
for _, w := range splitToken.Split(text, -1) {
if len(w) > 2 {
words[strings.ToLower(w)]++
}
}
return
}
// LoadStopListForLocale reads ./stoplists/<locale>.txt and returns its
// non-blank lines. It returns nil when the file cannot be read, and it
// drops blank lines (including the empty element produced by a trailing
// newline) — previously those became empty entries that were joined
// with "|" into the stoplist regexes, injecting empty alternations.
func LoadStopListForLocale(locale string) []string {
	fileBytes, err := ioutil.ReadFile("./stoplists/" + locale + ".txt")
	if err != nil {
		// Missing or unreadable stoplist: behave as an empty stoplist
		// rather than returning the bogus [""] slice.
		return nil
	}
	lines := strings.Split(string(fileBytes), "\n")
	words := make([]string, 0, len(lines))
	for _, line := range lines {
		if w := strings.TrimSpace(line); w != "" {
			words = append(words, w)
		}
	}
	return words
}
// Tokenize returns lowercase word counts for text, splitting on the
// English token pattern (built from the Spamassassin stoplist:
// http://wiki.apache.org/spamassassin/BayesStopList).
func (t *enTokenizer) Tokenize(text string) map[string]int64 {
	counts := PerformTokenization(text, enToken)
	return counts
}
// Tokenize returns lowercase word counts for text, splitting on the
// Brazilian-Portuguese token pattern.
func (t *ptBRTokenizer) Tokenize(text string) map[string]int64 {
	counts := PerformTokenization(text, ptBRToken)
	return counts
}
// enWords is the English stoplist, loaded from ./stoplists/en.txt at
// package init time.
var enWords = LoadStopListForLocale("en")

// enToken matches runs of non-word characters or any stoplist word
// between word boundaries; splitting text on it discards punctuation
// and stopwords in one pass.
// NOTE(review): stoplist entries are interpolated into the pattern
// unescaped — an entry containing a regexp metacharacter would corrupt
// it; confirm the stoplist files contain plain words only.
var enToken = regexp.MustCompile(`\b([^\w]+|` + strings.Join(enWords, "|") + `)\b`)
// ptBRWords is the Brazilian-Portuguese stoplist, loaded from
// ./stoplists/pt-BR.txt at package init time.
var ptBRWords = LoadStopListForLocale("pt-BR")

// ptBRToken matches runs of non-word characters or any stoplist word
// between word boundaries; splitting text on it discards punctuation
// and stopwords in one pass.
// NOTE(review): stoplist entries are interpolated into the pattern
// unescaped — an entry containing a regexp metacharacter would corrupt
// it; confirm the stoplist files contain plain words only.
var ptBRToken = regexp.MustCompile(`\b([^\w]+|` + strings.Join(ptBRWords, "|") + `)\b`)