// Remove verb endings and return true if one was removed. // func removeVerbEnding(word *snowballword.SnowballWord) bool { suffix, suffixRunes := word.FirstSuffixIn(word.RVstart, len(word.RS), "уйте", "ейте", "ыть", "ыло", "ыли", "ыла", "уют", "ует", "нно", "йте", "ишь", "ить", "ите", "ило", "или", "ила", "ешь", "ете", "ены", "ено", "ена", "ят", "ют", "ыт", "ым", "ыл", "ую", "уй", "ть", "ны", "но", "на", "ло", "ли", "ла", "ит", "им", "ил", "ет", "ен", "ем", "ей", "ю", "н", "л", "й", ) switch suffix { case "ла", "на", "ете", "йте", "ли", "й", "л", "ем", "н", "ло", "но", "ет", "ют", "ны", "ть", "ешь", "нно": // These are "Group 1" verb endings. // Group 1 endings must follow а (a) or я (ia) in RV. if precededByARinRV(word, len(suffixRunes)) == false { suffix = "" } } if suffix != "" { word.RemoveLastNRunes(len(suffixRunes)) return true } return false }
// Find the starting point of the regions R1, R2, & RV // func findRegions(word *snowballword.SnowballWord) (r1start, r2start, rvstart int) { // R1 & R2 are defined in the standard manner. r1start = romance.VnvSuffix(word, isLowerVowel, 0) r2start = romance.VnvSuffix(word, isLowerVowel, r1start) // Set RV, by default, as empty. rvstart = len(word.RS) // Handle the three special cases: "par", "col", & "tap" // prefix, prefixRunes := word.FirstPrefix("par", "col", "tap") if prefix != "" { rvstart = len(prefixRunes) return } // If the word begins with two vowels, RV is the region after the third letter if len(word.RS) >= 3 && isLowerVowel(word.RS[0]) && isLowerVowel(word.RS[1]) { rvstart = 3 return } // Otherwise the region after the first vowel not at the beginning of the word. for i := 1; i < len(word.RS); i++ { if isLowerVowel(word.RS[i]) { rvstart = i + 1 return } } return }
// Step 2 is the removal of the "и" suffix. // func step2(word *snowballword.SnowballWord) bool { suffix, _ := word.RemoveFirstSuffixIn(word.RVstart, "и") if suffix != "" { return true } return false }
// Step 5 Undouble non-vowel endings // func step5(word *snowballword.SnowballWord) bool { suffix, _ := word.FirstSuffix("enn", "onn", "ett", "ell", "eill") if suffix != "" { word.RemoveLastNRunes(1) } return false }
func preprocess(word *snowballword.SnowballWord) { r1start, r2start, rvstart := findRegions(word) word.R1start = r1start word.R2start = r2start word.RVstart = rvstart }
// Step 0 is to strip off apostrophes and "s". // func step0(w *snowballword.SnowballWord) bool { suffix, suffixRunes := w.FirstSuffix("'s'", "'s", "'") if suffix == "" { return false } w.RemoveLastNRunes(len(suffixRunes)) return true }
// Step 2a is the removal of verb suffixes beginning y, // Search for the longest among the following suffixes // in RV, and if found, delete if preceded by u. // func step2a(word *snowballword.SnowballWord) bool { suffix, suffixRunes := word.FirstSuffixIn(word.RVstart, len(word.RS), "ya", "ye", "yan", "yen", "yeron", "yendo", "yo", "yó", "yas", "yes", "yais", "yamos") if suffix != "" { idx := len(word.RS) - len(suffixRunes) - 1 if idx >= 0 && word.RS[idx] == 117 { word.RemoveLastNRunes(len(suffixRunes)) return true } } return false }
// Step 3 is the removal of the derivational suffix. // func step3(word *snowballword.SnowballWord) bool { // Search for a DERIVATIONAL ending in R2 (i.e. the entire // ending must lie in R2), and if one is found, remove it. suffix, _ := word.RemoveFirstSuffixIn(word.R2start, "ост", "ость") if suffix != "" { return true } return false }
// Find the starting point of the two regions R1 & R2. // // R1 is the region after the first non-vowel following a vowel, // or is the null region at the end of the word if there is no // such non-vowel. // // R2 is the region after the first non-vowel following a vowel // in R1, or is the null region at the end of the word if there // is no such non-vowel. // // See http://snowball.tartarus.org/texts/r1r2.html // func r1r2(word *snowballword.SnowballWord) (r1start, r2start int) { specialPrefix, _ := word.FirstPrefix("gener", "commun", "arsen") if specialPrefix != "" { r1start = len(specialPrefix) } else { r1start = romance.VnvSuffix(word, isLowerVowel, 0) } r2start = romance.VnvSuffix(word, isLowerVowel, r1start) return }
// Step 3 is the removal of residual suffixes. // func step3(word *snowballword.SnowballWord) bool { suffix, suffixRunes := word.FirstSuffixIfIn(word.RVstart, len(word.RS), "os", "a", "o", "á", "í", "ó", "e", "é", ) // No suffix found, nothing to do. // if suffix == "" { return false } // Remove all these suffixes word.RemoveLastNRunes(len(suffixRunes)) if suffix == "e" || suffix == "é" { // If preceded by gu with the u in RV delete the u // guSuffix, _ := word.FirstSuffix("gu") if guSuffix != "" { word.RemoveLastNRunes(1) } } return true }
// Applies various transformations necessary for the // other, subsequent stemming steps. Most important // of which is defining the two regions R1 & R2. // func preprocess(word *snowballword.SnowballWord) { // Clean up apostrophes normalizeApostrophes(word) trimLeftApostrophes(word) // Capitalize Y's that are not behaving // as vowels. capitalizeYs(word) // Find the two regions, R1 & R2 r1start, r2start := r1r2(word) word.R1start = r1start word.R2start = r2start }
// Trim off leading apostropes. (Slight variation from // NLTK implementation here, in which only the first is removed.) // func trimLeftApostrophes(word *snowballword.SnowballWord) { var ( numApostrophes int r rune ) for numApostrophes, r = range word.RS { // Check for "'", which is unicode code point 39 if r != 39 { break } } if numApostrophes > 0 { word.RS = word.RS[numApostrophes:] word.R1start = word.R1start - numApostrophes word.R2start = word.R2start - numApostrophes } }
// Remove perfective gerund endings and return true if one was removed. // func removePerfectiveGerundEnding(word *snowballword.SnowballWord) bool { suffix, suffixRunes := word.FirstSuffixIn(word.RVstart, len(word.RS), "ившись", "ывшись", "вшись", "ивши", "ывши", "вши", "ив", "ыв", "в", ) switch suffix { case "в", "вши", "вшись": // These are "Group 1" perfective gerund endings. // Group 1 endings must follow а (a) or я (ia) in RV. if precededByARinRV(word, len(suffixRunes)) == false { suffix = "" } } if suffix != "" { word.RemoveLastNRunes(len(suffixRunes)) return true } return false }
// Remove adjectival endings and return true if one was removed. // func removeAdjectivalEnding(word *snowballword.SnowballWord) bool { // Remove adjectival endings. Start by looking for // an adjective ending. // suffix, _ := word.RemoveFirstSuffixIn(word.RVstart, "ими", "ыми", "его", "ого", "ему", "ому", "ее", "ие", "ые", "ое", "ей", "ий", "ый", "ой", "ем", "им", "ым", "ом", "их", "ых", "ую", "юю", "ая", "яя", "ою", "ею", ) if suffix != "" { // We found an adjective ending. Remove optional participle endings. // newSuffix, newSuffixRunes := word.FirstSuffixIn(word.RVstart, len(word.RS), "ивш", "ывш", "ующ", "ем", "нн", "вш", "ющ", "щ", ) switch newSuffix { case "ем", "нн", "вш", "ющ", "щ": // These are "Group 1" participle endings. // Group 1 endings must follow а (a) or я (ia) in RV. if precededByARinRV(word, len(newSuffixRunes)) == false { newSuffix = "" } } if newSuffix != "" { word.RemoveLastNRunes(len(newSuffixRunes)) } return true } return false }
// Step 1 is the removal of standard suffixes, all of which must // occur in RV. // // // Search for a PERFECTIVE GERUND ending. If one is found remove it, and // that is then the end of step 1. Otherwise try and remove a REFLEXIVE // ending, and then search in turn for (1) an ADJECTIVAL, (2) a VERB or // (3) a NOUN ending. As soon as one of the endings (1) to (3) is found // remove it, and terminate step 1. // func step1(word *snowballword.SnowballWord) bool { // `stop` will be used to signal early termination var stop bool // Search for a PERFECTIVE GERUND ending stop = removePerfectiveGerundEnding(word) if stop { return true } // Next remove reflexive endings word.RemoveFirstSuffixIn(word.RVstart, "ся", "сь") // Next remove adjectival endings stop = removeAdjectivalEnding(word) if stop { return true } // Next remove verb endings stop = removeVerbEnding(word) if stop { return true } // Next remove noun endings suffix, _ := word.RemoveFirstSuffixIn(word.RVstart, "иями", "ями", "иях", "иям", "ием", "ией", "ами", "ях", "ям", "ья", "ью", "ье", "ом", "ой", "ов", "ия", "ию", "ий", "ии", "ие", "ем", "ей", "еи", "ев", "ах", "ам", "я", "ю", "ь", "ы", "у", "о", "й", "и", "е", "а", ) if suffix != "" { return true } return false }
// Step 5 is the stemming of "e" and "l" sufficies // found in R2. // func step5(w *snowballword.SnowballWord) bool { // Last rune index = `lri` lri := len(w.RS) - 1 // If R1 is emtpy, R2 is also empty, and we // need not do anything in step 5. // if w.R1start > lri { return false } if w.RS[lri] == 101 { // The word ends with "e", which is unicode code point 101. // Delete "e" suffix if in R2, or in R1 and not preceded // by a short syllable. if w.R2start <= lri || !endsShortSyllable(w, lri) { w.ReplaceSuffix("e", "", true) return true } return false } else if w.R2start <= lri && w.RS[lri] == 108 && lri-1 >= 0 && w.RS[lri-1] == 108 { // The word ends in double "l", and the final "l" is // in R2. (Note, the unicode code point for "l" is 108.) // Delete the second "l". w.ReplaceSuffix("l", "", true) return true } return false }
// Step 2b is the removal of verb suffixes beginning y, // Search for the longest among the following suffixes // in RV, and if found, delete if preceded by u. // func step2b(word *snowballword.SnowballWord) bool { suffix, suffixRunes := word.FirstSuffixIn(word.RVstart, len(word.RS), "iésemos", "iéramos", "iríamos", "eríamos", "aríamos", "ásemos", "áramos", "ábamos", "isteis", "iríais", "iremos", "ieseis", "ierais", "eríais", "eremos", "asteis", "aríais", "aremos", "íamos", "irías", "irían", "iréis", "ieses", "iesen", "ieron", "ieras", "ieran", "iendo", "erías", "erían", "eréis", "aseis", "arías", "arían", "aréis", "arais", "abais", "íais", "iste", "iría", "irás", "irán", "imos", "iese", "iera", "idos", "idas", "ería", "erás", "erán", "aste", "ases", "asen", "aría", "arás", "arán", "aron", "aras", "aran", "ando", "amos", "ados", "adas", "abas", "aban", "ías", "ían", "éis", "áis", "iré", "irá", "ido", "ida", "eré", "erá", "emos", "ase", "aré", "ará", "ara", "ado", "ada", "aba", "ís", "ía", "ió", "ir", "id", "es", "er", "en", "ed", "as", "ar", "an", "ad", ) switch suffix { case "": return false case "en", "es", "éis", "emos": // Delete, and if preceded by gu delete the u (the gu need not be in RV) word.RemoveLastNRunes(len(suffixRunes)) guSuffix, _ := word.FirstSuffix("gu") if guSuffix != "" { word.RemoveLastNRunes(1) } default: // Delete word.RemoveLastNRunes(len(suffixRunes)) } return true }
// Step 1a is normalization of various special "s"-endings. // func step1a(w *snowballword.SnowballWord) bool { suffix, suffixRunes := w.FirstSuffix("sses", "ied", "ies", "us", "ss", "s") switch suffix { case "sses": // Replace by ss w.ReplaceSuffixRunes(suffixRunes, []rune("ss"), true) return true case "ies", "ied": // Replace by i if preceded by more than one letter, // otherwise by ie (so ties -> tie, cries -> cri). var repl string if len(w.RS) > 4 { repl = "i" } else { repl = "ie" } w.ReplaceSuffixRunes(suffixRunes, []rune(repl), true) return true case "us", "ss": // Do nothing return false case "s": // Delete if the preceding word part contains a vowel // not immediately before the s (so gas and this retain // the s, gaps and kiwis lose it) // for i := 0; i < len(w.RS)-2; i++ { if isLowerVowel(w.RS[i]) { w.RemoveLastNRunes(len(suffixRunes)) return true } } } return false }
// Step 3 is the stemming of various longer sufficies // found in R1. // func step3(w *snowballword.SnowballWord) bool { suffix, suffixRunes := w.FirstSuffix( "ational", "tional", "alize", "icate", "ative", "iciti", "ical", "ful", "ness", ) // If it is not in R1, do nothing if suffix == "" || len(suffixRunes) > len(w.RS)-w.R1start { return false } // Handle special cases where we're not just going to // replace the suffix with another suffix: there are // other things we need to do. // if suffix == "ative" { // If in R2, delete. // if len(w.RS)-w.R2start >= 5 { w.RemoveLastNRunes(len(suffixRunes)) return true } return false } // Handle a suffix that was found, which is going // to be replaced with a different suffix. // var repl string switch suffix { case "ational": repl = "ate" case "tional": repl = "tion" case "alize": repl = "al" case "icate", "iciti", "ical": repl = "ic" case "ful", "ness": repl = "" } w.ReplaceSuffixRunes(suffixRunes, []rune(repl), true) return true }
// al, ance, ence, er, ic, able, ible, ant, ement, ment, // ent, ism, ate, iti, ous, ive, ize // delete // // ion // delete if preceded by s or t func step4(w *snowballword.SnowballWord) bool { // Find all endings in R1 suffix, suffixRunes := w.FirstSuffix( "ement", "ance", "ence", "able", "ible", "ment", "ent", "ant", "ism", "ate", "iti", "ous", "ive", "ize", "ion", "al", "er", "ic", ) // If it does not fit in R2, do nothing. if len(suffixRunes) > len(w.RS)-w.R2start { return false } // Handle special cases switch suffix { case "": return false case "ion": // Replace by og if preceded by l // l = 108 rsLen := len(w.RS) if rsLen >= 4 { switch w.RS[rsLen-4] { case 115, 116: w.RemoveLastNRunes(len(suffixRunes)) return true } } return false } // Handle basic replacements w.RemoveLastNRunes(len(suffixRunes)) return true }
// Step 2a is the removal of Verb suffixes beginning // with "i" in the RV region. // func step2a(word *snowballword.SnowballWord) bool { // Search for the longest among the following suffixes // in RV and if found, delete if preceded by a non-vowel. suffix, suffixRunes := word.FirstSuffixIn(word.RVstart, len(word.RS), "issantes", "issaIent", "issions", "issants", "issante", "iraIent", "issons", "issiez", "issent", "issant", "issait", "issais", "irions", "issez", "isses", "iront", "irons", "iriez", "irent", "irait", "irais", "îtes", "îmes", "isse", "irez", "iras", "irai", "ira", "ies", "ît", "it", "is", "ir", "ie", "i", ) if suffix != "" { sLen := len(suffixRunes) idx := len(word.RS) - sLen - 1 if idx >= 0 && word.FitsInRV(sLen+1) && isLowerVowel(word.RS[idx]) == false { word.RemoveLastNRunes(len(suffixRunes)) return true } } return false }
// Step 1 is the removal of standard suffixes // func step1(word *snowballword.SnowballWord) bool { // Possible suffixes, longest first suffix, suffixRunes := word.FirstSuffix( "amientos", "imientos", "aciones", "amiento", "imiento", "uciones", "logías", "idades", "encias", "ancias", "amente", "adores", "adoras", "ución", "mente", "logía", "istas", "ismos", "ibles", "encia", "anzas", "antes", "ancia", "adora", "ación", "ables", "osos", "osas", "ivos", "ivas", "ista", "ismo", "idad", "icos", "icas", "ible", "anza", "ante", "ador", "able", "oso", "osa", "ivo", "iva", "ico", "ica", ) isInR1 := (word.R1start <= len(word.RS)-len(suffixRunes)) isInR2 := (word.R2start <= len(word.RS)-len(suffixRunes)) // Deal with special cases first. All of these will // return if they are hit. // switch suffix { case "": // Nothing to do return false case "amente": if isInR1 { // Delete if in R1 word.RemoveLastNRunes(len(suffixRunes)) // if preceded by iv, delete if in R2 (and if further preceded by at, // delete if in R2), otherwise, // if preceded by os, ic or ad, delete if in R2 newSuffix, _ := word.RemoveFirstSuffixIfIn(word.R2start, "iv", "os", "ic", "ad") if newSuffix == "iv" { word.RemoveFirstSuffixIfIn(word.R2start, "at") } return true } return false } // All the following cases require the found suffix // to be in R2. if isInR2 == false { return false } // Compound replacement cases. All these cases return // if they are hit. // compoundReplacement := func(otherSuffixes ...string) bool { word.RemoveLastNRunes(len(suffixRunes)) word.RemoveFirstSuffixIfIn(word.R2start, otherSuffixes...) 
return true } switch suffix { case "adora", "ador", "ación", "adoras", "adores", "aciones", "ante", "antes", "ancia", "ancias": return compoundReplacement("ic") case "mente": return compoundReplacement("ante", "able", "ible") case "idad", "idades": return compoundReplacement("abil", "ic", "iv") case "iva", "ivo", "ivas", "ivos": return compoundReplacement("at") } // Simple replacement & deletion cases are all that remain. // simpleReplacement := func(repl string) bool { word.ReplaceSuffixRunes(suffixRunes, []rune(repl), true) return true } switch suffix { case "logía", "logías": return simpleReplacement("log") case "ución", "uciones": return simpleReplacement("u") case "encia", "encias": return simpleReplacement("ente") case "anza", "anzas", "ico", "ica", "icos", "icas", "ismo", "ismos", "able", "ables", "ible", "ibles", "ista", "istas", "oso", "osa", "osos", "osas", "amiento", "amientos", "imiento", "imientos": word.RemoveLastNRunes(len(suffixRunes)) return true } log.Panicln("Unhandled suffix:", suffix) return false }
// Step 1b is the normalization of various "ly" and "ed" sufficies. // func step1b(w *snowballword.SnowballWord) bool { suffix, suffixRunes := w.FirstSuffix("eedly", "ingly", "edly", "ing", "eed", "ed") switch suffix { case "": // No suffix found return false case "eed", "eedly": // Replace by ee if in R1 if len(suffixRunes) <= len(w.RS)-w.R1start { w.ReplaceSuffixRunes(suffixRunes, []rune("ee"), true) } return true case "ed", "edly", "ing", "ingly": hasLowerVowel := false for i := 0; i < len(w.RS)-len(suffixRunes); i++ { if isLowerVowel(w.RS[i]) { hasLowerVowel = true break } } if hasLowerVowel { // This case requires a two-step transformation and, due // to the way we've implemented the `ReplaceSuffix` method // here, information about R1 and R2 would be lost between // the two. Therefore, we need to keep track of the // original R1 & R2, so that we may set them below, at the // end of this case. // originalR1start := w.R1start originalR2start := w.R2start // Delete if the preceding word part contains a vowel w.RemoveLastNRunes(len(suffixRunes)) // ...and after the deletion... newSuffix, newSuffixRunes := w.FirstSuffix("at", "bl", "iz", "bb", "dd", "ff", "gg", "mm", "nn", "pp", "rr", "tt") switch newSuffix { case "": // If the word is short, add "e" if isShortWord(w) { // By definition, r1 and r2 are the empty string for // short words. w.RS = append(w.RS, []rune("e")...) w.R1start = len(w.RS) w.R2start = len(w.RS) return true } case "at", "bl", "iz": // If the word ends "at", "bl" or "iz" add "e" w.ReplaceSuffixRunes(newSuffixRunes, []rune(newSuffix+"e"), true) case "bb", "dd", "ff", "gg", "mm", "nn", "pp", "rr", "tt": // If the word ends with a double remove the last letter. // Note that, "double" does not include all possible doubles, // just those shown above. // w.RemoveLastNRunes(1) } // Because we did a double replacement, we need to fix // R1 and R2 manually. This is just becase of how we've // implemented the `ReplaceSuffix` method. 
// rsLen := len(w.RS) if originalR1start < rsLen { w.R1start = originalR1start } else { w.R1start = rsLen } if originalR2start < rsLen { w.R2start = originalR2start } else { w.R2start = rsLen } return true } } return false }
// Step 4 is the undoubling of double non-vowel endings // and removal of superlative endings. // func step4(word *snowballword.SnowballWord) bool { // (1) Undouble "н", or, 2) if the word ends with a SUPERLATIVE ending, // (remove it and undouble н n), or 3) if the word ends ь (') (soft sign) // remove it. // Undouble "н" if word.HasSuffixRunes([]rune("нн")) { word.RemoveLastNRunes(1) return true } // Remove superlative endings suffix, _ := word.RemoveFirstSuffix("ейше", "ейш") if suffix != "" { // Undouble "н" if word.HasSuffixRunes([]rune("нн")) { word.RemoveLastNRunes(1) } return true } // Remove soft sign if rsLen := len(word.RS); rsLen > 0 && word.RS[rsLen-1] == 'ь' { word.RemoveLastNRunes(1) return true } return false }
// Step 2 is the stemming of various endings found in // R1 including "al", "ness", and "li". // func step2(w *snowballword.SnowballWord) bool { // Possible sufficies for this step, longest first. suffix, suffixRunes := w.FirstSuffix( "ational", "fulness", "iveness", "ization", "ousness", "biliti", "lessli", "tional", "alism", "aliti", "ation", "entli", "fulli", "iviti", "ousli", "anci", "abli", "alli", "ator", "enci", "izer", "bli", "ogi", "li", ) // If it is not in R1, do nothing if suffix == "" || len(suffixRunes) > len(w.RS)-w.R1start { return false } // Handle special cases where we're not just going to // replace the suffix with another suffix: there are // other things we need to do. // switch suffix { case "li": // Delete if preceded by a valid li-ending. Valid li-endings inlude the // following charaters: cdeghkmnrt. (Note, the unicode code points for // these characters are, respectively, as follows: // 99 100 101 103 104 107 109 110 114 116) // rsLen := len(w.RS) if rsLen >= 3 { switch w.RS[rsLen-3] { case 99, 100, 101, 103, 104, 107, 109, 110, 114, 116: w.RemoveLastNRunes(len(suffixRunes)) return true } } return false case "ogi": // Replace by og if preceded by l. // (Note, the unicode code point for l is 108) // rsLen := len(w.RS) if rsLen >= 4 && w.RS[rsLen-4] == 108 { w.ReplaceSuffixRunes(suffixRunes, []rune("og"), true) } return true } // Handle a suffix that was found, which is going // to be replaced with a different suffix. 
// var repl string switch suffix { case "tional": repl = "tion" case "enci": repl = "ence" case "anci": repl = "ance" case "abli": repl = "able" case "entli": repl = "ent" case "izer", "ization": repl = "ize" case "ational", "ation", "ator": repl = "ate" case "alism", "aliti", "alli": repl = "al" case "fulness": repl = "ful" case "ousli", "ousness": repl = "ous" case "iveness", "iviti": repl = "ive" case "biliti", "bli": repl = "ble" case "fulli": repl = "ful" case "lessli": repl = "less" } w.ReplaceSuffixRunes(suffixRunes, []rune(repl), true) return true }
// Step 2b is the removal of Verb suffixes in RV // that do not begin with "i". // func step2b(word *snowballword.SnowballWord) bool { // Search for the longest among the following suffixes in RV. // suffix, suffixRunes := word.FirstSuffixIn(word.RVstart, len(word.RS), "eraIent", "assions", "erions", "assiez", "assent", "èrent", "eront", "erons", "eriez", "erait", "erais", "asses", "antes", "aIent", "âtes", "âmes", "ions", "erez", "eras", "erai", "asse", "ants", "ante", "ées", "iez", "era", "ant", "ait", "ais", "és", "ée", "ât", "ez", "er", "as", "ai", "é", "a", ) switch suffix { case "ions": // Delete if in R2 suffixLen := len(suffixRunes) if word.FitsInR2(suffixLen) { word.RemoveLastNRunes(suffixLen) return true } return false case "é", "ée", "ées", "és", "èrent", "er", "era", "erai", "eraIent", "erais", "erait", "eras", "erez", "eriez", "erions", "erons", "eront", "ez", "iez": // Delete word.RemoveLastNRunes(len(suffixRunes)) return true case "âmes", "ât", "âtes", "a", "ai", "aIent", "ais", "ait", "ant", "ante", "antes", "ants", "as", "asse", "assent", "asses", "assiez", "assions": // Delete word.RemoveLastNRunes(len(suffixRunes)) // If preceded by e (unicode code point 101), delete // idx := len(word.RS) - 1 if idx >= 0 && word.RS[idx] == 101 && word.FitsInRV(1) { word.RemoveLastNRunes(1) } return true } return false }
// printDebug logs w.DebugString() through the standard logger when
// debug is true, and does nothing otherwise.
func printDebug(debug bool, w *snowballword.SnowballWord) {
	if !debug {
		return
	}
	log.Println(w.DebugString())
}
// Step 4 is the cleaning up of residual suffixes. // func step4(word *snowballword.SnowballWord) bool { hadChange := false if word.String() == "voudrion" { log.Println("...", word) } // If the word ends s (unicode code point 115), // not preceded by a, i, o, u, è or s, delete it. // if idx := len(word.RS) - 1; idx >= 1 && word.RS[idx] == 115 { switch word.RS[idx-1] { case 97, 105, 111, 117, 232, 115: // Do nothing, preceded by a, i, o, u, è or s return false default: word.RemoveLastNRunes(1) hadChange = true } } // Note: all the following are restricted to the RV region. // Search for the longest among the following suffixes in RV. // suffix, suffixRunes := word.FirstSuffixIn(word.RVstart, len(word.RS), "Ière", "ière", "Ier", "ier", "ion", "e", "ë", ) switch suffix { case "": return hadChange case "ion": // Delete if in R2 and preceded by s or t in RV const sLen int = 3 // equivalently, len(suffixRunes) idx := len(word.RS) - sLen - 1 if word.FitsInR2(sLen) && idx >= 0 && word.FitsInRV(sLen+1) { if word.RS[idx] == 115 || word.RS[idx] == 116 { word.RemoveLastNRunes(sLen) return true } } return hadChange case "ier", "ière", "Ier", "Ière": // Replace with i word.ReplaceSuffixRunes(suffixRunes, []rune("i"), true) return true case "e": word.RemoveLastNRunes(1) return true case "ë": // If preceded by gu (unicode code point 103 & 117), delete idx := len(word.RS) - 1 if idx >= 2 && word.RS[idx-2] == 103 && word.RS[idx-1] == 117 { word.RemoveLastNRunes(1) return true } return hadChange } return true }
// Step 1 is the removal of standard suffixes.
//
// The first suffix from the list below that the word ends with is
// selected, and what happens next depends on the suffix and on which of
// the regions R1, R2 and RV it fits in. The function returns true when
// it changed the word in a way that ends the step. For the "special"
// suffixes "amment", "emment", "ment" and "ments" it returns false even
// when the word was modified (see the note in the RV section below).
//
func step1(word *snowballword.SnowballWord) bool {
	suffix, suffixRunes := word.FirstSuffix(
		"issements", "issement", "atrices", "utions", "usions", "logies",
		"emment", "ements", "atrice", "ations", "ateurs", "amment", "ution",
		"usion", "ments", "logie", "istes", "ismes", "iqUes", "euses",
		"ences", "ement", "ation", "ateur", "ances", "ables", "ment",
		"ités", "iste", "isme", "iqUe", "euse", "ence", "eaux", "ance",
		"able", "ives", "ité", "eux", "aux", "ive", "ifs", "if",
	)

	// No suffix found, nothing to do.
	if suffix == "" {
		return false
	}

	// A suffix is "in" a region when the region starts at or before the
	// first rune of the suffix.
	isInR1 := (word.R1start <= len(word.RS)-len(suffixRunes))
	isInR2 := (word.R2start <= len(word.RS)-len(suffixRunes))
	isInRV := (word.RVstart <= len(word.RS)-len(suffixRunes))

	// Handle simple replacements & deletions in R2 first
	if isInR2 {

		// Handle simple replacements in R2
		repl := ""
		switch suffix {
		case "logie", "logies":
			repl = "log"
		case "usion", "ution", "usions", "utions":
			repl = "u"
		case "ence", "ences":
			repl = "ent"
		}
		if repl != "" {
			word.ReplaceSuffixRunes(suffixRunes, []rune(repl), true)
			return true
		}

		// Handle simple deletions in R2
		switch suffix {
		case "ance", "iqUe", "isme", "able", "iste", "eux", "ances", "iqUes", "ismes", "ables", "istes":
			word.RemoveLastNRunes(len(suffixRunes))
			return true
		}
	}

	// Handle simple replacements in RV
	if isInRV {

		// NOTE: these are "special" suffixes in that
		// we must still do steps 2a and 2b of the
		// French stemmer even when these suffixes are
		// found in step1. Therefore, we are returning
		// `false` here.

		repl := ""
		switch suffix {
		case "amment":
			repl = "ant"
		case "emment":
			repl = "ent"
		}
		if repl != "" {
			word.ReplaceSuffixRunes(suffixRunes, []rune(repl), true)
			return false
		}

		// Delete if preceded by a vowel that is also in RV
		if suffix == "ment" || suffix == "ments" {
			// idx is the rune immediately before the suffix.
			idx := len(word.RS) - len(suffixRunes) - 1
			if idx >= word.RVstart && isLowerVowel(word.RS[idx]) {
				word.RemoveLastNRunes(len(suffixRunes))
				return false
			}
			return false
		}
	}

	// Handle all the other "special" cases. All of these
	// return true immediately after changing the word.
	// A case whose region test fails falls through to the
	// final `return false`.
	//
	switch suffix {
	case "eaux":

		// Replace with eau
		word.ReplaceSuffixRunes(suffixRunes, []rune("eau"), true)
		return true

	case "aux":

		// Replace with al if in R1
		if isInR1 {
			word.ReplaceSuffixRunes(suffixRunes, []rune("al"), true)
			return true
		}

	case "euse", "euses":

		// Delete if in R2, else replace by eux if in R1
		if isInR2 {
			word.RemoveLastNRunes(len(suffixRunes))
			return true
		} else if isInR1 {
			word.ReplaceSuffixRunes(suffixRunes, []rune("eux"), true)
			return true
		}

	case "issement", "issements":

		// Delete if in R1 and preceded by a non-vowel
		if isInR1 {
			idx := len(word.RS) - len(suffixRunes) - 1
			if idx >= 0 && isLowerVowel(word.RS[idx]) == false {
				word.RemoveLastNRunes(len(suffixRunes))
				return true
			}
		}
		return false

	case "atrice", "ateur", "ation", "atrices", "ateurs", "ations":

		// Delete if in R2
		if isInR2 {
			word.RemoveLastNRunes(len(suffixRunes))

			// If preceded by "ic", delete if in R2, else replace by "iqU".
			newSuffix, newSuffixRunes := word.FirstSuffix("ic")
			if newSuffix != "" {
				if word.FitsInR2(len(newSuffixRunes)) {
					word.RemoveLastNRunes(len(newSuffixRunes))
				} else {
					word.ReplaceSuffixRunes(newSuffixRunes, []rune("iqU"), true)
				}
			}
			return true
		}

	case "ement", "ements":

		if isInRV {

			// Delete if in RV
			word.RemoveLastNRunes(len(suffixRunes))

			// If preceded by "iv", delete if in R2
			// (and if further preceded by "at", delete if in R2)
			newSuffix, newSuffixRunes := word.RemoveFirstSuffixIfIn(word.R2start, "iv")
			if newSuffix != "" {
				word.RemoveFirstSuffixIfIn(word.R2start, "at")
				return true
			}

			// If preceded by "eus", delete if in R2, else replace by "eux" if in R1
			newSuffix, newSuffixRunes = word.FirstSuffix("eus")
			if newSuffix != "" {
				newSuffixLen := len(newSuffixRunes)
				if word.FitsInR2(newSuffixLen) {
					word.RemoveLastNRunes(newSuffixLen)
				} else if word.FitsInR1(newSuffixLen) {
					word.ReplaceSuffixRunes(newSuffixRunes, []rune("eux"), true)
				}
				return true
			}

			// If preceded by abl or iqU, delete if in R2; otherwise
			// leave it in place.
			newSuffix, newSuffixRunes = word.FirstSuffix("abl", "iqU")
			if newSuffix != "" {
				newSuffixLen := len(newSuffixRunes)
				if word.FitsInR2(newSuffixLen) {
					word.RemoveLastNRunes(newSuffixLen)
				}
				return true
			}

			// If preceded by ièr or Ièr, replace by i if in RV
			newSuffix, newSuffixRunes = word.FirstSuffix("ièr", "Ièr")
			if newSuffix != "" {
				if word.FitsInRV(len(newSuffixRunes)) {
					word.ReplaceSuffixRunes(newSuffixRunes, []rune("i"), true)
				}
				return true
			}

			// The suffix itself was deleted, so the word changed.
			return true
		}

	case "ité", "ités":

		if isInR2 {

			// Delete if in R2
			word.RemoveLastNRunes(len(suffixRunes))

			// If preceded by "abil", delete if in R2, else replace by "abl"
			newSuffix, newSuffixRunes := word.FirstSuffix("abil")
			if newSuffix != "" {
				newSuffixLen := len(newSuffixRunes)
				if word.FitsInR2(newSuffixLen) {
					word.RemoveLastNRunes(newSuffixLen)
				} else {
					word.ReplaceSuffixRunes(newSuffixRunes, []rune("abl"), true)
				}
				return true
			}

			// If preceded by "ic", delete if in R2, else replace by "iqU"
			newSuffix, newSuffixRunes = word.FirstSuffix("ic")
			if newSuffix != "" {
				newSuffixLen := len(newSuffixRunes)
				if word.FitsInR2(newSuffixLen) {
					word.RemoveLastNRunes(newSuffixLen)
				} else {
					word.ReplaceSuffixRunes(newSuffixRunes, []rune("iqU"), true)
				}
				return true
			}

			// If preceded by "iv", delete if in R2. The results of
			// this call are not inspected; true is returned either way.
			newSuffix, newSuffixRunes = word.RemoveFirstSuffixIfIn(word.R2start, "iv")
			return true
		}

	case "if", "ive", "ifs", "ives":

		if isInR2 {

			// Delete if in R2
			word.RemoveLastNRunes(len(suffixRunes))

			// If preceded by at, delete if in R2
			newSuffix, newSuffixRunes := word.RemoveFirstSuffixIfIn(word.R2start, "at")
			if newSuffix != "" {

				// And if further preceded by ic, delete if in R2, else replace by iqU
				newSuffix, newSuffixRunes = word.FirstSuffix("ic")
				if newSuffix != "" {
					newSuffixLen := len(newSuffixRunes)
					if word.FitsInR2(newSuffixLen) {
						word.RemoveLastNRunes(newSuffixLen)
					} else {
						word.ReplaceSuffixRunes(newSuffixRunes, []rune("iqU"), true)
					}
				}
			}
			return true
		}
	}

	// The suffix was found but did not satisfy its region requirement.
	return false
}
// Step 0 is the removal of attached pronouns // func step0(word *snowballword.SnowballWord) bool { // Search for the longest among the following suffixes suffix1, suffix1Runes := word.FirstSuffixIn(word.RVstart, len(word.RS), "selas", "selos", "sela", "selo", "las", "les", "los", "nos", "me", "se", "la", "le", "lo", ) // If the suffix empty or not in RV, we have nothing to do. if suffix1 == "" { return false } // We'll remove suffix1, if comes after one of the following suffix2, suffix2Runes := word.FirstSuffixIn(word.RVstart, len(word.RS)-len(suffix1), "iéndo", "iendo", "yendo", "ando", "ándo", "ár", "ér", "ír", "ar", "er", "ir", ) switch suffix2 { case "": // Nothing to do return false case "iéndo", "ándo", "ár", "ér", "ír": // In these cases, deletion is followed by removing // the acute accent (e.g., haciéndola -> haciendo). var suffix2repl string switch suffix2 { case "": return false case "iéndo": suffix2repl = "iendo" case "ándo": suffix2repl = "ando" case "ár": suffix2repl = "ar" case "ír": suffix2repl = "ir" } word.RemoveLastNRunes(len(suffix1Runes)) word.ReplaceSuffixRunes(suffix2Runes, []rune(suffix2repl), true) return true case "ando", "iendo", "ar", "er", "ir": word.RemoveLastNRunes(len(suffix1Runes)) return true case "yendo": // In the case of "yendo", the "yendo" must lie in RV, // and be preceded by a "u" somewhere in the word. for i := 0; i < len(word.RS)-(len(suffix1)+len(suffix2)); i++ { // Note, the unicode code point for "u" is 117. if word.RS[i] == 117 { word.RemoveLastNRunes(len(suffix1Runes)) return true } } } return false }