Example #1
func (s *Stopword) Process(input tokenizer.TokenChan) tokenizer.TokenChan {
	return filter.StartFilter(input, func(token *tokenizer.Token, output tokenizer.TokenChan) {
		// Forward only tokens that do not appear in the stopword set.
		if !words[token.Backing()] {
			output <- token
		}
	})
}
Example #2
func (s *Stemmer) Process(input tokenizer.TokenChan) tokenizer.TokenChan {
	return filter.StartFilter(input, func(token *tokenizer.Token, output tokenizer.TokenChan) {
		str := token.Backing()
		// Copy the token into C memory; cgo strings must be freed manually.
		cs := C.CString(str)
		defer C.free(unsafe.Pointer(cs))
		// stem() takes the index of the word's last byte and returns the
		// index of the last byte of the stem, so add 1 to get the length.
		end := C.stem(s.cstemmer, cs, C.int(len(str)-1)) + 1
		// Read the result back from the C buffer: the stemmer rewrites bytes
		// in place (e.g. a trailing 'y' can become 'i'), not just truncates.
		output <- tokenizer.NewToken(C.GoStringN(cs, end))
	})
}
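
Example #2 is the only filter that crosses into C. For readers unfamiliar with the cgo round trip it relies on, here is a self-contained sketch of the same CString/free/GoStringN pattern; stem_stub is a hypothetical stand-in for the real stem function (which operates on a stemmer struct) and only mimics its convention of rewriting the buffer in place and returning the new end index.

package main

/*
#include <stdlib.h>
#include <ctype.h>

// stem_stub is a toy stand-in for a real stemmer: it lowercases the buffer
// in place and returns the index of its last byte, mimicking the convention
// that the return value is the new end-point of the word.
static int stem_stub(char *b, int k) {
	int i;
	for (i = 0; i <= k; i++) {
		b[i] = (char)tolower((unsigned char)b[i]);
	}
	return k;
}
*/
import "C"

import (
	"fmt"
	"unsafe"
)

func main() {
	str := "RUNNING"
	cs := C.CString(str)             // copy the Go string into C memory
	defer C.free(unsafe.Pointer(cs)) // C memory is not garbage-collected
	end := C.stem_stub(cs, C.int(len(str)-1)) + 1
	fmt.Println(C.GoStringN(cs, end)) // reads the modified buffer: "running"
}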
Example #3
func (a *Ascii) Process(input tokenizer.TokenChan) tokenizer.TokenChan {
	return filter.StartFilter(input, func(token *tokenizer.Token, output tokenizer.TokenChan) {
		// Returning -1 from the mapping function drops the rune entirely,
		// so anything outside the 7-bit ASCII range is removed.
		cleaned := strings.Map(func(r rune) rune {
			if r > 127 {
				return -1
			}
			return r
		}, token.Backing())
		output <- tokenizer.NewToken(cleaned)
	})
}
Example #4
func (p *Punctuation) Process(input tokenizer.TokenChan) tokenizer.TokenChan {
	return filter.StartFilter(input, func(token *tokenizer.Token, output tokenizer.TokenChan) {
		// Keep only ASCII letters and digits; everything else maps to -1,
		// which strings.Map interprets as "delete this rune".
		cleaned := strings.Map(func(r rune) rune {
			switch {
			case '0' <= r && r <= '9': // digits
				fallthrough
			case 'A' <= r && r <= 'Z': // uppercase
				fallthrough
			case 'a' <= r && r <= 'z': // lowercase
				return r
			}
			return -1
		}, token.Backing())
		output <- tokenizer.NewToken(cleaned)
	})
}
Example #5
// Handle ASCII, lowercase, and stripping punctuation in one filter
func (s *Superstrip) Process(input tokenizer.TokenChan) tokenizer.TokenChan {
	return filter.StartFilter(input, func(token *tokenizer.Token, output tokenizer.TokenChan) {
		cleaned := strings.Map(func(r rune) rune {
			switch {
			case '0' <= r && r <= '9': // digits
				fallthrough
			case 'a' <= r && r <= 'z': // lowercase
				return r
			case 'A' <= r && r <= 'Z': // uppercase
				return r + ('a' - 'A') // fold to lowercase
			}
			return -1 // drop everything else
		}, token.Backing())
		// Suppress tokens that were stripped down to nothing.
		if cleaned != "" {
			output <- tokenizer.NewToken(cleaned)
		}
	})
}
Example #6
func (l *Lowercase) Process(input tokenizer.TokenChan) tokenizer.TokenChan {
	return filter.StartFilter(input, func(token *tokenizer.Token, output tokenizer.TokenChan) {
		output <- tokenizer.NewToken(strings.ToLower(token.Backing()))
	})
}
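
All six Process methods share the same shape, a TokenChan in and a TokenChan out, which is what lets filters be chained into a pipeline. The sketch below is a self-contained illustration of that pattern; the Token, TokenChan, and StartFilter definitions are assumptions that stand in for the real tokenizer and filter packages, modeled on how the examples use them (StartFilter presumably spawns a goroutine, applies the callback to each token, and closes the output once the input is drained).

package main

import (
	"fmt"
	"strings"
)

// Minimal stand-ins for the tokenizer types the examples use.
type Token struct{ backing string }

func NewToken(s string) *Token   { return &Token{backing: s} }
func (t *Token) Backing() string { return t.backing }

type TokenChan chan *Token

// StartFilter mirrors the helper every example calls: it spawns a goroutine
// that applies fn to each incoming token and closes the output channel once
// the input channel is drained.
func StartFilter(input TokenChan, fn func(*Token, TokenChan)) TokenChan {
	output := make(TokenChan)
	go func() {
		defer close(output)
		for token := range input {
			fn(token, output)
		}
	}()
	return output
}

func main() {
	input := make(TokenChan)
	go func() {
		defer close(input)
		for _, w := range []string{"Hello,", "WORLD!", "the"} {
			input <- NewToken(w)
		}
	}()

	// Chain two stages the same way the Process methods above would chain:
	// lowercase first, then strip everything but letters and digits.
	lowered := StartFilter(input, func(t *Token, out TokenChan) {
		out <- NewToken(strings.ToLower(t.Backing()))
	})
	stripped := StartFilter(lowered, func(t *Token, out TokenChan) {
		cleaned := strings.Map(func(r rune) rune {
			switch {
			case '0' <= r && r <= '9', 'a' <= r && r <= 'z':
				return r
			}
			return -1
		}, t.Backing())
		out <- NewToken(cleaned)
	})

	for t := range stripped {
		fmt.Println(t.Backing()) // prints: hello, world, the
	}
}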