func DoubleMetaphone(in nltk.TokenChan) nltk.TokenChan { return start(in, func(tok nltk.Token, out nltk.TokenChan) { cs := C.CString(tok.String()) defer C.free(unsafe.Pointer(cs)) codes := C.double_metaphone(cs) primary, secondary := C.GoString(codes.primary), C.GoString(codes.secondary) defer C.free_dm_result(codes) out <- nltk.Token(primary) if primary != secondary { out <- nltk.Token(secondary) } }) }
func Ascii(in nltk.TokenChan) nltk.TokenChan { return start(in, func(tok nltk.Token, out nltk.TokenChan) { cleaned := strings.Map(func(r rune) rune { if r > 127 { return -1 } return r }, tok.String()) out <- nltk.Token(cleaned) }) }
func Simple(strs ...string) nltk.TokenChan { tc := make(nltk.TokenChan, 10) go func() { for _, str := range strs { for _, t := range strings.Fields(str) { tc <- nltk.Token(t) } } close(tc) }() return tc }
func Punctuation(in nltk.TokenChan) nltk.TokenChan { return start(in, func(tok nltk.Token, out nltk.TokenChan) { cleaned := strings.Map(func(r rune) rune { switch { case 48 <= r && r <= 57: // numbers fallthrough case 65 <= r && r <= 90: // uppercase fallthrough case 97 <= r && r <= 122: // lowercase return r } return -1 }, tok.String()) out <- nltk.Token(cleaned) }) }
func Superstrip(in nltk.TokenChan) nltk.TokenChan { return start(in, func(tok nltk.Token, out nltk.TokenChan) { cleaned := strings.Map(func(r rune) rune { switch { case 48 <= r && r <= 57: // numbers fallthrough case 97 <= r && r <= 122: // lowercase return r case 65 <= r && r <= 90: // uppercase return r + 32 // Make lowercase } return -1 }, tok.String()) if cleaned != "" { out <- nltk.Token(cleaned) } }) }
// Lowercase is a token filter that sends the strings.ToLower form of each
// incoming token downstream.
func Lowercase(in nltk.TokenChan) nltk.TokenChan {
	lower := func(tok nltk.Token, out nltk.TokenChan) {
		text := tok.String()
		out <- nltk.Token(strings.ToLower(text))
	}
	return start(in, lower)
}
// stemmerFilter builds a token filter around the given stemmer: each incoming
// token is passed through s.Stem and the result is sent downstream.
func stemmerFilter(in nltk.TokenChan, s stemmer.Stemmer) nltk.TokenChan {
	stem := func(tok nltk.Token, out nltk.TokenChan) {
		stemmed := s.Stem(tok.String())
		out <- nltk.Token(stemmed)
	}
	return start(in, stem)
}