func norm(input []rune) []rune { inputLen := len(input) if inputLen > 4 { for i := 0; i < inputLen; i++ { switch input[i] { case 'à', 'á', 'â': input[i] = 'a' case 'ô': input[i] = 'o' case 'è', 'é', 'ê': input[i] = 'e' case 'ù', 'û': input[i] = 'u' case 'î': input[i] = 'i' case 'ç': input[i] = 'c' } ch := input[0] for i := 1; i < inputLen; i++ { if input[i] == ch && unicode.IsLetter(ch) { input = analysis.DeleteRune(input, i) i -= 1 inputLen = len(input) } else { ch = input[i] } } } } if inputLen > 4 && analysis.RunesEndsWith(input, "ie") { input = input[0 : inputLen-2] inputLen = len(input) } if inputLen > 4 { if input[inputLen-1] == 'r' { input = input[0 : inputLen-1] inputLen = len(input) } if input[inputLen-1] == 'e' { input = input[0 : inputLen-1] inputLen = len(input) } if input[inputLen-1] == 'e' { input = input[0 : inputLen-1] inputLen = len(input) } if input[inputLen-1] == input[inputLen-2] && unicode.IsLetter(input[inputLen-1]) { input = input[0 : inputLen-1] inputLen = len(input) } } return input }
func normalize(input []byte) []byte { runes := bytes.Runes(input) for i := 0; i < len(runes); i++ { switch runes[i] { case Yeh, DotlessYeh: runes[i] = FarsiYeh case Kaf: runes[i] = Keheh case Zwnj: if i > 0 && runes[i-1] == Heh { runes[i-1] = Ae } runes = analysis.DeleteRune(runes, i) i-- case Heh: if i == len(runes)-1 { runes[i] = Ae } case TehMarbuta: runes[i] = Ae case HehDoachashmee: runes[i] = Heh case Reh: if i == 0 { runes[i] = Rreh } case RrehAbove: runes[i] = Rreh case Tatweel, Kasratan, Dammatan, Fathatan, Fatha, Damma, Kasra, Shadda, Sukun: runes = analysis.DeleteRune(runes, i) i-- default: if unicode.In(runes[i], unicode.Cf) { runes = analysis.DeleteRune(runes, i) i-- } } } return analysis.BuildTermFromRunes(runes) }
func compose(ch0 rune, script0 *unicode.RangeTable, scriptData *ScriptData, input []rune, pos int, inputLen int) []rune { if pos+1 >= inputLen { return input // need at least 2 characters } ch1 := input[pos+1] - scriptData.base script1 := lookupScript(input[pos+1]) if script0 != script1 { return input // need to be same script } ch2 := rune(-1) if pos+2 < inputLen { ch2 = input[pos+2] - scriptData.base script2 := lookupScript(input[pos+2]) if input[pos+2] == '\u200D' { ch2 = 0xff // zero width joiner } else if script2 != script1 { ch2 = -1 // still allow 2 character match } } for _, decomposition := range decompositions { if decomposition[0] == ch0 && (decomposition[4]&scriptData.flag) != 0 { if decomposition[1] == ch1 && (decomposition[2] < 0 || decomposition[2] == ch2) { input[pos] = scriptData.base + decomposition[3] input = analysis.DeleteRune(input, pos+1) if decomposition[2] >= 0 { input = analysis.DeleteRune(input, pos+1) } return input } } } return input }
func normalize(input []byte) []byte { runes := bytes.Runes(input) for i := 0; i < len(runes); i++ { switch runes[i] { case AlefMadda, AlefHamzaAbove, AlefHamzaBelow: runes[i] = Alef case DotlessYeh: runes[i] = Yeh case TehMarbuta: runes[i] = Heh case Tatweel, Kasratan, Dammatan, Fathatan, Fatha, Damma, Kasra, Shadda, Sukun: runes = analysis.DeleteRune(runes, i) i-- } } return analysis.BuildTermFromRunes(runes) }
func normalize(input []byte) []byte { runes := bytes.Runes(input) for i := 0; i < len(runes); i++ { switch runes[i] { case FarsiYeh, YehBarree: runes[i] = Yeh case Keheh: runes[i] = Kaf case HehYeh, HehGoal: runes[i] = Heh case HamzaAbove: // necessary for HEH + HAMZA runes = analysis.DeleteRune(runes, i) i-- } } return analysis.BuildTermFromRunes(runes) }
func normalize(input []byte) []byte { runes := bytes.Runes(input) for i := 0; i < len(runes); i++ { switch runes[i] { // dead n -> bindu case '\u0928': if i+1 < len(runes) && runes[i+1] == '\u094D' { runes[i] = '\u0902' runes = analysis.DeleteRune(runes, i+1) } // candrabindu -> bindu case '\u0901': runes[i] = '\u0902' // nukta deletions case '\u093C': runes = analysis.DeleteRune(runes, i) i-- case '\u0929': runes[i] = '\u0928' case '\u0931': runes[i] = '\u0930' case '\u0934': runes[i] = '\u0933' case '\u0958': runes[i] = '\u0915' case '\u0959': runes[i] = '\u0916' case '\u095A': runes[i] = '\u0917' case '\u095B': runes[i] = '\u091C' case '\u095C': runes[i] = '\u0921' case '\u095D': runes[i] = '\u0922' case '\u095E': runes[i] = '\u092B' case '\u095F': runes[i] = '\u092F' // zwj/zwnj -> delete case '\u200D', '\u200C': runes = analysis.DeleteRune(runes, i) i-- // virama -> delete case '\u094D': runes = analysis.DeleteRune(runes, i) i-- // chandra/short -> replace case '\u0945', '\u0946': runes[i] = '\u0947' case '\u0949', '\u094A': runes[i] = '\u094B' case '\u090D', '\u090E': runes[i] = '\u090F' case '\u0911', '\u0912': runes[i] = '\u0913' case '\u0972': runes[i] = '\u0905' // long -> short ind. vowels case '\u0906': runes[i] = '\u0905' case '\u0908': runes[i] = '\u0907' case '\u090A': runes[i] = '\u0909' case '\u0960': runes[i] = '\u090B' case '\u0961': runes[i] = '\u090C' case '\u0910': runes[i] = '\u090F' case '\u0914': runes[i] = '\u0913' // long -> short dep. vowels case '\u0940': runes[i] = '\u093F' case '\u0942': runes[i] = '\u0941' case '\u0944': runes[i] = '\u0943' case '\u0963': runes[i] = '\u0962' case '\u0948': runes[i] = '\u0947' case '\u094C': runes[i] = '\u094B' } } return analysis.BuildTermFromRunes(runes) }