func (s *Stopword) Process(input tokenizer.TokenChan) tokenizer.TokenChan {
	return filter.StartFilter(input, func(token *tokenizer.Token, output tokenizer.TokenChan) {
		if !words[token.Backing()] {
			output <- token
		}
	})
}
func (s *Stemmer) Process(input tokenizer.TokenChan) tokenizer.TokenChan {
	return filter.StartFilter(input, func(token *tokenizer.Token, output tokenizer.TokenChan) {
		str := token.Backing()
		cs := C.CString(str)
		defer C.free(unsafe.Pointer(cs))
		// The C stemmer returns the index of the last character of the
		// stemmed word; +1 turns that into a slice bound for the prefix.
		end := C.stem(s.cstemmer, cs, C.int(len(str)-1)) + 1
		output <- tokenizer.NewToken(str[0:end])
	})
}
func (a *Ascii) Process(input tokenizer.TokenChan) tokenizer.TokenChan {
	return filter.StartFilter(input, func(token *tokenizer.Token, output tokenizer.TokenChan) {
		// Drop any rune outside the ASCII range.
		cleaned := strings.Map(func(r rune) rune {
			if r > 127 {
				return -1
			}
			return r
		}, token.Backing())
		output <- tokenizer.NewToken(cleaned)
	})
}
func (p *Punctuation) Process(input tokenizer.TokenChan) tokenizer.TokenChan {
	return filter.StartFilter(input, func(token *tokenizer.Token, output tokenizer.TokenChan) {
		// Keep only alphanumeric runes; everything else maps to -1 and is dropped.
		cleaned := strings.Map(func(r rune) rune {
			switch {
			case 48 <= r && r <= 57: // numbers
				fallthrough
			case 65 <= r && r <= 90: // uppercase
				fallthrough
			case 97 <= r && r <= 122: // lowercase
				return r
			}
			return -1
		}, token.Backing())
		output <- tokenizer.NewToken(cleaned)
	})
}
// Handle ascii, lowercase, and stripping punctuation in one filter
func (s *Superstrip) Process(input tokenizer.TokenChan) tokenizer.TokenChan {
	return filter.StartFilter(input, func(token *tokenizer.Token, output tokenizer.TokenChan) {
		cleaned := strings.Map(func(r rune) rune {
			switch {
			case 48 <= r && r <= 57: // numbers
				fallthrough
			case 97 <= r && r <= 122: // lowercase
				return r
			case 65 <= r && r <= 90: // uppercase
				return r + 32 // Make lowercase
			}
			return -1
		}, token.Backing())
		// Don't emit tokens that were stripped down to nothing.
		if cleaned != "" {
			output <- tokenizer.NewToken(cleaned)
		}
	})
}
func (l *Lowercase) Process(input tokenizer.TokenChan) tokenizer.TokenChan {
	return filter.StartFilter(input, func(token *tokenizer.Token, output tokenizer.TokenChan) {
		output <- tokenizer.NewToken(strings.ToLower(token.Backing()))
	})
}
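All of these filters delegate their channel plumbing to filter.StartFilter, which isn't shown in this section. Below is a minimal sketch of what such a helper could look like, assuming TokenChan is a channel of *tokenizer.Token and that StartFilter is responsible for closing its output channel once the input is drained; the import path and the Callback type name are placeholders for illustration, not the project's actual code.

package filter

import (
	"hypothetical/project/tokenizer" // placeholder import path, assumption
)

// Callback is a hypothetical name for the per-token function each filter
// passes in; it may emit zero or more tokens on output.
type Callback func(token *tokenizer.Token, output tokenizer.TokenChan)

// StartFilter runs fn over every token from input in its own goroutine and
// closes the returned channel when the input channel is exhausted, so each
// filter stage runs concurrently with the stages up- and downstream of it.
func StartFilter(input tokenizer.TokenChan, fn Callback) tokenizer.TokenChan {
	output := make(tokenizer.TokenChan)
	go func() {
		for token := range input {
			fn(token, output)
		}
		close(output)
	}()
	return output
}

With a helper shaped like this, the filters above compose by feeding one Process output into the next, e.g. stopword.Process(stemmer.Process(superstrip.Process(tokens))), with each stage running on its own goroutine.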