func (s *PortugueseLightStemmerFilter) Filter(input analysis.TokenStream) analysis.TokenStream { for _, token := range input { runes := bytes.Runes(token.Term) runes = stem(runes) token.Term = analysis.BuildTermFromRunes(runes) } return input }
func (s *IndicNormalizeFilter) Filter(input analysis.TokenStream) analysis.TokenStream { for _, token := range input { runes := bytes.Runes(token.Term) runes = normalize(runes) token.Term = analysis.BuildTermFromRunes(runes) } return input }
func (s *FrenchMinimalStemmerFilter) Filter(input analysis.TokenStream) analysis.TokenStream { for _, token := range input { runes := bytes.Runes(token.Term) runes = minstem(runes) token.Term = analysis.BuildTermFromRunes(runes) } return input }
func normalize(input []byte) []byte { runes := bytes.Runes(input) for i := 0; i < len(runes); i++ { switch runes[i] { case AlefMadda, AlefHamzaAbove, AlefHamzaBelow: runes[i] = Alef case DotlessYeh: runes[i] = Yeh case TehMarbuta: runes[i] = Heh case Tatweel, Kasratan, Dammatan, Fathatan, Fatha, Damma, Kasra, Shadda, Sukun: runes = analysis.DeleteRune(runes, i) i-- } } return analysis.BuildTermFromRunes(runes) }
func normalize(input []byte) []byte { runes := bytes.Runes(input) for i := 0; i < len(runes); i++ { switch runes[i] { case FarsiYeh, YehBarree: runes[i] = Yeh case Keheh: runes[i] = Kaf case HehYeh, HehGoal: runes[i] = Heh case HamzaAbove: // necessary for HEH + HAMZA runes = analysis.DeleteRune(runes, i) i-- } } return analysis.BuildTermFromRunes(runes) }
func stem(input []byte) []byte { runes := bytes.Runes(input) // Strip a single prefix. for _, p := range prefixes { if canStemPrefix(runes, p) { runes = runes[len(p):] break } } // Strip off multiple suffixes, in their order in the suffixes array. for _, s := range suffixes { if canStemSuffix(runes, s) { runes = runes[:len(runes)-len(s)] } } return analysis.BuildTermFromRunes(runes) }
func normalize(input []byte) []byte { runes := bytes.Runes(input) for i := 0; i < len(runes); i++ { switch runes[i] { case Yeh, DotlessYeh: runes[i] = FarsiYeh case Kaf: runes[i] = Keheh case Zwnj: if i > 0 && runes[i-1] == Heh { runes[i-1] = Ae } runes = analysis.DeleteRune(runes, i) i-- case Heh: if i == len(runes)-1 { runes[i] = Ae } case TehMarbuta: runes[i] = Ae case HehDoachashmee: runes[i] = Heh case Reh: if i == 0 { runes[i] = Rreh } case RrehAbove: runes[i] = Rreh case Tatweel, Kasratan, Dammatan, Fathatan, Fatha, Damma, Kasra, Shadda, Sukun: runes = analysis.DeleteRune(runes, i) i-- default: if unicode.In(runes[i], unicode.Cf) { runes = analysis.DeleteRune(runes, i) i-- } } } return analysis.BuildTermFromRunes(runes) }
func normalize(input []byte) []byte { runes := bytes.Runes(input) for i := 0; i < len(runes); i++ { switch runes[i] { // dead n -> bindu case '\u0928': if i+1 < len(runes) && runes[i+1] == '\u094D' { runes[i] = '\u0902' runes = analysis.DeleteRune(runes, i+1) } // candrabindu -> bindu case '\u0901': runes[i] = '\u0902' // nukta deletions case '\u093C': runes = analysis.DeleteRune(runes, i) i-- case '\u0929': runes[i] = '\u0928' case '\u0931': runes[i] = '\u0930' case '\u0934': runes[i] = '\u0933' case '\u0958': runes[i] = '\u0915' case '\u0959': runes[i] = '\u0916' case '\u095A': runes[i] = '\u0917' case '\u095B': runes[i] = '\u091C' case '\u095C': runes[i] = '\u0921' case '\u095D': runes[i] = '\u0922' case '\u095E': runes[i] = '\u092B' case '\u095F': runes[i] = '\u092F' // zwj/zwnj -> delete case '\u200D', '\u200C': runes = analysis.DeleteRune(runes, i) i-- // virama -> delete case '\u094D': runes = analysis.DeleteRune(runes, i) i-- // chandra/short -> replace case '\u0945', '\u0946': runes[i] = '\u0947' case '\u0949', '\u094A': runes[i] = '\u094B' case '\u090D', '\u090E': runes[i] = '\u090F' case '\u0911', '\u0912': runes[i] = '\u0913' case '\u0972': runes[i] = '\u0905' // long -> short ind. vowels case '\u0906': runes[i] = '\u0905' case '\u0908': runes[i] = '\u0907' case '\u090A': runes[i] = '\u0909' case '\u0960': runes[i] = '\u090B' case '\u0961': runes[i] = '\u090C' case '\u0910': runes[i] = '\u090F' case '\u0914': runes[i] = '\u0913' // long -> short dep. vowels case '\u0940': runes[i] = '\u093F' case '\u0942': runes[i] = '\u0941' case '\u0944': runes[i] = '\u0943' case '\u0963': runes[i] = '\u0962' case '\u0948': runes[i] = '\u0947' case '\u094C': runes[i] = '\u094B' } } return analysis.BuildTermFromRunes(runes) }