func (s *CJKWidthFilter) Filter(input analysis.TokenStream) analysis.TokenStream { for _, token := range input { runeCount := utf8.RuneCount(token.Term) runes := bytes.Runes(token.Term) for i := 0; i < runeCount; i++ { ch := runes[i] if ch >= 0xFF01 && ch <= 0xFF5E { // fullwidth ASCII variants runes[i] -= 0xFEE0 } else if ch >= 0xFF65 && ch <= 0xFF9F { // halfwidth Katakana variants if (ch == 0xFF9E || ch == 0xFF9F) && i > 0 && combine(runes, i, ch) { runes = analysis.DeleteRune(runes, i) i-- runeCount = len(runes) } else { runes[i] = kanaNorm[ch-0xFF65] } } } token.Term = analysis.BuildTermFromRunes(runes) } return input }
func (s *NgramFilter) Filter(input analysis.TokenStream) analysis.TokenStream { rv := make(analysis.TokenStream, 0, len(input)) for _, token := range input { runeCount := utf8.RuneCount(token.Term) runes := bytes.Runes(token.Term) for i := 0; i < runeCount; i++ { // index of the starting rune for this token for ngramSize := s.minLength; ngramSize <= s.maxLength; ngramSize++ { // build an ngram of this size starting at i if i+ngramSize <= runeCount { ngramTerm := analysis.BuildTermFromRunes(runes[i : i+ngramSize]) token := analysis.Token{ Position: token.Position, Start: token.Start, End: token.End, Type: token.Type, Term: ngramTerm, } rv = append(rv, &token) } } } } return rv }
func (s *FrenchLightStemmerFilter) Filter(input analysis.TokenStream) analysis.TokenStream { for _, token := range input { runes := bytes.Runes(token.Term) runes = stem(runes) token.Term = analysis.BuildTermFromRunes(runes) } return input }
func (s *IndicNormalizeFilter) Filter(input analysis.TokenStream) analysis.TokenStream { for _, token := range input { runes := bytes.Runes(token.Term) runes = normalize(runes) token.Term = analysis.BuildTermFromRunes(runes) } return input }
func (s *PorterStemmer) Filter(input analysis.TokenStream) analysis.TokenStream { for _, token := range input { // if it is not a protected keyword, stem it if !token.KeyWord { termRunes := bytes.Runes(token.Term) stemmedRunes := porterstemmer.StemWithoutLowerCasing(termRunes) token.Term = analysis.BuildTermFromRunes(stemmedRunes) } } return input }
func stem(input []byte) []byte { runes := bytes.Runes(input) // Strip a single prefix. for _, p := range prefixes { if canStemPrefix(runes, p) { runes = runes[len(p):] break } } // Strip off multiple suffixes, in their order in the suffixes array. for _, s := range suffixes { if canStemSuffix(runes, s) { runes = runes[:len(runes)-len(s)] } } return analysis.BuildTermFromRunes(runes) }
func normalize(input []byte) []byte { runes := bytes.Runes(input) for i := 0; i < len(runes); i++ { switch runes[i] { case AlefMadda, AlefHamzaAbove, AlefHamzaBelow: runes[i] = Alef case DotlessYeh: runes[i] = Yeh case TehMarbuta: runes[i] = Heh case Tatweel, Kasratan, Dammatan, Fathatan, Fatha, Damma, Kasra, Shadda, Sukun: runes = analysis.DeleteRune(runes, i) i-- } } return analysis.BuildTermFromRunes(runes) }
func normalize(input []byte) []byte { runes := bytes.Runes(input) for i := 0; i < len(runes); i++ { switch runes[i] { case FarsiYeh, YehBarree: runes[i] = Yeh case Keheh: runes[i] = Kaf case HehYeh, HehGoal: runes[i] = Heh case HamzaAbove: // necessary for HEH + HAMZA runes = analysis.DeleteRune(runes, i) i-- } } return analysis.BuildTermFromRunes(runes) }
func normalize(input []byte) []byte { state := N runes := bytes.Runes(input) for i := 0; i < len(runes); i++ { switch runes[i] { case 'a', 'o': state = U case 'u': if state == N { state = U } else { state = V } case 'e': if state == U { runes = analysis.DeleteRune(runes, i) i-- } state = V case 'i', 'q', 'y': state = V case 'ä': runes[i] = 'a' state = V case 'ö': runes[i] = 'o' state = V case 'ü': runes[i] = 'u' state = V case 'ß': runes[i] = 's' i++ // newrunes := make([]rune, len(runes)+1) // copy(newrunes, runes) // runes = newrunes // runes[i] = 's' runes = analysis.InsertRune(runes, i, 's') state = N default: state = N } } return analysis.BuildTermFromRunes(runes) }
func normalize(input []byte) []byte { runes := bytes.Runes(input) for i := 0; i < len(runes); i++ { switch runes[i] { case Yeh, DotlessYeh: runes[i] = FarsiYeh case Kaf: runes[i] = Keheh case Zwnj: if i > 0 && runes[i-1] == Heh { runes[i-1] = Ae } runes = analysis.DeleteRune(runes, i) i-- case Heh: if i == len(runes)-1 { runes[i] = Ae } case TehMarbuta: runes[i] = Ae case HehDoachashmee: runes[i] = Heh case Reh: if i == 0 { runes[i] = Rreh } case RrehAbove: runes[i] = Rreh case Tatweel, Kasratan, Dammatan, Fathatan, Fatha, Damma, Kasra, Shadda, Sukun: runes = analysis.DeleteRune(runes, i) i-- default: if unicode.In(runes[i], unicode.Cf) { runes = analysis.DeleteRune(runes, i) i-- } } } return analysis.BuildTermFromRunes(runes) }
func normalize(input []byte) []byte { runes := bytes.Runes(input) for i := 0; i < len(runes); i++ { switch runes[i] { // dead n -> bindu case '\u0928': if i+1 < len(runes) && runes[i+1] == '\u094D' { runes[i] = '\u0902' runes = analysis.DeleteRune(runes, i+1) } // candrabindu -> bindu case '\u0901': runes[i] = '\u0902' // nukta deletions case '\u093C': runes = analysis.DeleteRune(runes, i) i-- case '\u0929': runes[i] = '\u0928' case '\u0931': runes[i] = '\u0930' case '\u0934': runes[i] = '\u0933' case '\u0958': runes[i] = '\u0915' case '\u0959': runes[i] = '\u0916' case '\u095A': runes[i] = '\u0917' case '\u095B': runes[i] = '\u091C' case '\u095C': runes[i] = '\u0921' case '\u095D': runes[i] = '\u0922' case '\u095E': runes[i] = '\u092B' case '\u095F': runes[i] = '\u092F' // zwj/zwnj -> delete case '\u200D', '\u200C': runes = analysis.DeleteRune(runes, i) i-- // virama -> delete case '\u094D': runes = analysis.DeleteRune(runes, i) i-- // chandra/short -> replace case '\u0945', '\u0946': runes[i] = '\u0947' case '\u0949', '\u094A': runes[i] = '\u094B' case '\u090D', '\u090E': runes[i] = '\u090F' case '\u0911', '\u0912': runes[i] = '\u0913' case '\u0972': runes[i] = '\u0905' // long -> short ind. vowels case '\u0906': runes[i] = '\u0905' case '\u0908': runes[i] = '\u0907' case '\u090A': runes[i] = '\u0909' case '\u0960': runes[i] = '\u090B' case '\u0961': runes[i] = '\u090C' case '\u0910': runes[i] = '\u090F' case '\u0914': runes[i] = '\u0913' // long -> short dep. vowels case '\u0940': runes[i] = '\u093F' case '\u0942': runes[i] = '\u0941' case '\u0944': runes[i] = '\u0943' case '\u0963': runes[i] = '\u0962' case '\u0948': runes[i] = '\u0947' case '\u094C': runes[i] = '\u094B' } } return analysis.BuildTermFromRunes(runes) }
// buildTokenFromTerm returns a new token whose Term is the encoding of
// buffer produced by analysis.BuildTermFromRunes. Every other field of the
// token is left at its zero value.
func buildTokenFromTerm(buffer []rune) *analysis.Token {
	tok := new(analysis.Token)
	tok.Term = analysis.BuildTermFromRunes(buffer)
	return tok
}