// Step 3 is the stemming of various longer sufficies // found in R1. // func step3(w *snowballword.SnowballWord) bool { suffix, suffixRunes := w.FirstSuffix( "ational", "tional", "alize", "icate", "ative", "iciti", "ical", "ful", "ness", ) // If it is not in R1, do nothing if suffix == "" || len(suffixRunes) > len(w.RS)-w.R1start { return false } // Handle special cases where we're not just going to // replace the suffix with another suffix: there are // other things we need to do. // if suffix == "ative" { // If in R2, delete. // if len(w.RS)-w.R2start >= 5 { w.RemoveLastNRunes(len(suffixRunes)) return true } return false } // Handle a suffix that was found, which is going // to be replaced with a different suffix. // var repl string switch suffix { case "ational": repl = "ate" case "tional": repl = "tion" case "alize": repl = "al" case "icate", "iciti", "ical": repl = "ic" case "ful", "ness": repl = "" } w.ReplaceSuffixRunes(suffixRunes, []rune(repl), true) return true }
// Step 1a is normalization of various special "s"-endings. // func step1a(w *snowballword.SnowballWord) bool { suffix, suffixRunes := w.FirstSuffix("sses", "ied", "ies", "us", "ss", "s") switch suffix { case "sses": // Replace by ss w.ReplaceSuffixRunes(suffixRunes, []rune("ss"), true) return true case "ies", "ied": // Replace by i if preceded by more than one letter, // otherwise by ie (so ties -> tie, cries -> cri). var repl string if len(w.RS) > 4 { repl = "i" } else { repl = "ie" } w.ReplaceSuffixRunes(suffixRunes, []rune(repl), true) return true case "us", "ss": // Do nothing return false case "s": // Delete if the preceding word part contains a vowel // not immediately before the s (so gas and this retain // the s, gaps and kiwis lose it) // for i := 0; i < len(w.RS)-2; i++ { if isLowerVowel(w.RS[i]) { w.RemoveLastNRunes(len(suffixRunes)) return true } } } return false }
// Step 2 is the stemming of various endings found in // R1 including "al", "ness", and "li". // func step2(w *snowballword.SnowballWord) bool { // Possible sufficies for this step, longest first. suffix, suffixRunes := w.FirstSuffix( "ational", "fulness", "iveness", "ization", "ousness", "biliti", "lessli", "tional", "alism", "aliti", "ation", "entli", "fulli", "iviti", "ousli", "anci", "abli", "alli", "ator", "enci", "izer", "bli", "ogi", "li", ) // If it is not in R1, do nothing if suffix == "" || len(suffixRunes) > len(w.RS)-w.R1start { return false } // Handle special cases where we're not just going to // replace the suffix with another suffix: there are // other things we need to do. // switch suffix { case "li": // Delete if preceded by a valid li-ending. Valid li-endings inlude the // following charaters: cdeghkmnrt. (Note, the unicode code points for // these characters are, respectively, as follows: // 99 100 101 103 104 107 109 110 114 116) // rsLen := len(w.RS) if rsLen >= 3 { switch w.RS[rsLen-3] { case 99, 100, 101, 103, 104, 107, 109, 110, 114, 116: w.RemoveLastNRunes(len(suffixRunes)) return true } } return false case "ogi": // Replace by og if preceded by l. // (Note, the unicode code point for l is 108) // rsLen := len(w.RS) if rsLen >= 4 && w.RS[rsLen-4] == 108 { w.ReplaceSuffixRunes(suffixRunes, []rune("og"), true) } return true } // Handle a suffix that was found, which is going // to be replaced with a different suffix. // var repl string switch suffix { case "tional": repl = "tion" case "enci": repl = "ence" case "anci": repl = "ance" case "abli": repl = "able" case "entli": repl = "ent" case "izer", "ization": repl = "ize" case "ational", "ation", "ator": repl = "ate" case "alism", "aliti", "alli": repl = "al" case "fulness": repl = "ful" case "ousli", "ousness": repl = "ous" case "iveness", "iviti": repl = "ive" case "biliti", "bli": repl = "ble" case "fulli": repl = "ful" case "lessli": repl = "less" } w.ReplaceSuffixRunes(suffixRunes, []rune(repl), true) return true }
// Step 4 is the cleaning up of residual suffixes. // func step4(word *snowballword.SnowballWord) bool { hadChange := false if word.String() == "voudrion" { log.Println("...", word) } // If the word ends s (unicode code point 115), // not preceded by a, i, o, u, è or s, delete it. // if idx := len(word.RS) - 1; idx >= 1 && word.RS[idx] == 115 { switch word.RS[idx-1] { case 97, 105, 111, 117, 232, 115: // Do nothing, preceded by a, i, o, u, è or s return false default: word.RemoveLastNRunes(1) hadChange = true } } // Note: all the following are restricted to the RV region. // Search for the longest among the following suffixes in RV. // suffix, suffixRunes := word.FirstSuffixIn(word.RVstart, len(word.RS), "Ière", "ière", "Ier", "ier", "ion", "e", "ë", ) switch suffix { case "": return hadChange case "ion": // Delete if in R2 and preceded by s or t in RV const sLen int = 3 // equivalently, len(suffixRunes) idx := len(word.RS) - sLen - 1 if word.FitsInR2(sLen) && idx >= 0 && word.FitsInRV(sLen+1) { if word.RS[idx] == 115 || word.RS[idx] == 116 { word.RemoveLastNRunes(sLen) return true } } return hadChange case "ier", "ière", "Ier", "Ière": // Replace with i word.ReplaceSuffixRunes(suffixRunes, []rune("i"), true) return true case "e": word.RemoveLastNRunes(1) return true case "ë": // If preceded by gu (unicode code point 103 & 117), delete idx := len(word.RS) - 1 if idx >= 2 && word.RS[idx-2] == 103 && word.RS[idx-1] == 117 { word.RemoveLastNRunes(1) return true } return hadChange } return true }
// Step 1 is the removal of standard suffixes // func step1(word *snowballword.SnowballWord) bool { suffix, suffixRunes := word.FirstSuffix( "issements", "issement", "atrices", "utions", "usions", "logies", "emment", "ements", "atrice", "ations", "ateurs", "amment", "ution", "usion", "ments", "logie", "istes", "ismes", "iqUes", "euses", "ences", "ement", "ation", "ateur", "ances", "ables", "ment", "ités", "iste", "isme", "iqUe", "euse", "ence", "eaux", "ance", "able", "ives", "ité", "eux", "aux", "ive", "ifs", "if", ) if suffix == "" { return false } isInR1 := (word.R1start <= len(word.RS)-len(suffixRunes)) isInR2 := (word.R2start <= len(word.RS)-len(suffixRunes)) isInRV := (word.RVstart <= len(word.RS)-len(suffixRunes)) // Handle simple replacements & deletions in R2 first if isInR2 { // Handle simple replacements in R2 repl := "" switch suffix { case "logie", "logies": repl = "log" case "usion", "ution", "usions", "utions": repl = "u" case "ence", "ences": repl = "ent" } if repl != "" { word.ReplaceSuffixRunes(suffixRunes, []rune(repl), true) return true } // Handle simple deletions in R2 switch suffix { case "ance", "iqUe", "isme", "able", "iste", "eux", "ances", "iqUes", "ismes", "ables", "istes": word.RemoveLastNRunes(len(suffixRunes)) return true } } // Handle simple replacements in RV if isInRV { // NOTE: these are "special" suffixes in that // we must still do steps 2a and 2b of the // French stemmer even when these suffixes are // found in step1. Therefore, we are returning // `false` here. repl := "" switch suffix { case "amment": repl = "ant" case "emment": repl = "ent" } if repl != "" { word.ReplaceSuffixRunes(suffixRunes, []rune(repl), true) return false } // Delete if preceded by a vowel that is also in RV if suffix == "ment" || suffix == "ments" { idx := len(word.RS) - len(suffixRunes) - 1 if idx >= word.RVstart && isLowerVowel(word.RS[idx]) { word.RemoveLastNRunes(len(suffixRunes)) return false } return false } } // Handle all the other "special" cases. All of these // return true immediately after changing the word. // switch suffix { case "eaux": // Replace with eau word.ReplaceSuffixRunes(suffixRunes, []rune("eau"), true) return true case "aux": // Replace with al if in R1 if isInR1 { word.ReplaceSuffixRunes(suffixRunes, []rune("al"), true) return true } case "euse", "euses": // Delete if in R2, else replace by eux if in R1 if isInR2 { word.RemoveLastNRunes(len(suffixRunes)) return true } else if isInR1 { word.ReplaceSuffixRunes(suffixRunes, []rune("eux"), true) return true } case "issement", "issements": // Delete if in R1 and preceded by a non-vowel if isInR1 { idx := len(word.RS) - len(suffixRunes) - 1 if idx >= 0 && isLowerVowel(word.RS[idx]) == false { word.RemoveLastNRunes(len(suffixRunes)) return true } } return false case "atrice", "ateur", "ation", "atrices", "ateurs", "ations": // Delete if in R2 if isInR2 { word.RemoveLastNRunes(len(suffixRunes)) // If preceded by "ic", delete if in R2, else replace by "iqU". newSuffix, newSuffixRunes := word.FirstSuffix("ic") if newSuffix != "" { if word.FitsInR2(len(newSuffixRunes)) { word.RemoveLastNRunes(len(newSuffixRunes)) } else { word.ReplaceSuffixRunes(newSuffixRunes, []rune("iqU"), true) } } return true } case "ement", "ements": if isInRV { // Delete if in RV word.RemoveLastNRunes(len(suffixRunes)) // If preceded by "iv", delete if in R2 // (and if further preceded by "at", delete if in R2) newSuffix, newSuffixRunes := word.RemoveFirstSuffixIfIn(word.R2start, "iv") if newSuffix != "" { word.RemoveFirstSuffixIfIn(word.R2start, "at") return true } // If preceded by "eus", delete if in R2, else replace by "eux" if in R1 newSuffix, newSuffixRunes = word.FirstSuffix("eus") if newSuffix != "" { newSuffixLen := len(newSuffixRunes) if word.FitsInR2(newSuffixLen) { word.RemoveLastNRunes(newSuffixLen) } else if word.FitsInR1(newSuffixLen) { word.ReplaceSuffixRunes(newSuffixRunes, []rune("eux"), true) } return true } // If preceded by abl or iqU, delete if in R2, otherwise, newSuffix, newSuffixRunes = word.FirstSuffix("abl", "iqU") if newSuffix != "" { newSuffixLen := len(newSuffixRunes) if word.FitsInR2(newSuffixLen) { word.RemoveLastNRunes(newSuffixLen) } return true } // If preceded by ièr or Ièr, replace by i if in RV newSuffix, newSuffixRunes = word.FirstSuffix("ièr", "Ièr") if newSuffix != "" { if word.FitsInRV(len(newSuffixRunes)) { word.ReplaceSuffixRunes(newSuffixRunes, []rune("i"), true) } return true } return true } case "ité", "ités": if isInR2 { // Delete if in R2 word.RemoveLastNRunes(len(suffixRunes)) // If preceded by "abil", delete if in R2, else replace by "abl" newSuffix, newSuffixRunes := word.FirstSuffix("abil") if newSuffix != "" { newSuffixLen := len(newSuffixRunes) if word.FitsInR2(newSuffixLen) { word.RemoveLastNRunes(newSuffixLen) } else { word.ReplaceSuffixRunes(newSuffixRunes, []rune("abl"), true) } return true } // If preceded by "ic", delete if in R2, else replace by "iqU" newSuffix, newSuffixRunes = word.FirstSuffix("ic") if newSuffix != "" { newSuffixLen := len(newSuffixRunes) if word.FitsInR2(newSuffixLen) { word.RemoveLastNRunes(newSuffixLen) } else { word.ReplaceSuffixRunes(newSuffixRunes, []rune("iqU"), true) } return true } // If preceded by "iv", delete if in R2 newSuffix, newSuffixRunes = word.RemoveFirstSuffixIfIn(word.R2start, "iv") return true } case "if", "ive", "ifs", "ives": if isInR2 { // Delete if in R2 word.RemoveLastNRunes(len(suffixRunes)) // If preceded by at, delete if in R2 newSuffix, newSuffixRunes := word.RemoveFirstSuffixIfIn(word.R2start, "at") if newSuffix != "" { // And if further preceded by ic, delete if in R2, else replace by iqU newSuffix, newSuffixRunes = word.FirstSuffix("ic") if newSuffix != "" { newSuffixLen := len(newSuffixRunes) if word.FitsInR2(newSuffixLen) { word.RemoveLastNRunes(newSuffixLen) } else { word.ReplaceSuffixRunes(newSuffixRunes, []rune("iqU"), true) } } } return true } } return false }
// Step 0 is the removal of attached pronouns // func step0(word *snowballword.SnowballWord) bool { // Search for the longest among the following suffixes suffix1, suffix1Runes := word.FirstSuffixIn(word.RVstart, len(word.RS), "selas", "selos", "sela", "selo", "las", "les", "los", "nos", "me", "se", "la", "le", "lo", ) // If the suffix empty or not in RV, we have nothing to do. if suffix1 == "" { return false } // We'll remove suffix1, if comes after one of the following suffix2, suffix2Runes := word.FirstSuffixIn(word.RVstart, len(word.RS)-len(suffix1), "iéndo", "iendo", "yendo", "ando", "ándo", "ár", "ér", "ír", "ar", "er", "ir", ) switch suffix2 { case "": // Nothing to do return false case "iéndo", "ándo", "ár", "ér", "ír": // In these cases, deletion is followed by removing // the acute accent (e.g., haciéndola -> haciendo). var suffix2repl string switch suffix2 { case "": return false case "iéndo": suffix2repl = "iendo" case "ándo": suffix2repl = "ando" case "ár": suffix2repl = "ar" case "ír": suffix2repl = "ir" } word.RemoveLastNRunes(len(suffix1Runes)) word.ReplaceSuffixRunes(suffix2Runes, []rune(suffix2repl), true) return true case "ando", "iendo", "ar", "er", "ir": word.RemoveLastNRunes(len(suffix1Runes)) return true case "yendo": // In the case of "yendo", the "yendo" must lie in RV, // and be preceded by a "u" somewhere in the word. for i := 0; i < len(word.RS)-(len(suffix1)+len(suffix2)); i++ { // Note, the unicode code point for "u" is 117. if word.RS[i] == 117 { word.RemoveLastNRunes(len(suffix1Runes)) return true } } } return false }
// Step 1 is the removal of standard suffixes // func step1(word *snowballword.SnowballWord) bool { // Possible suffixes, longest first suffix, suffixRunes := word.FirstSuffix( "amientos", "imientos", "aciones", "amiento", "imiento", "uciones", "logías", "idades", "encias", "ancias", "amente", "adores", "adoras", "ución", "mente", "logía", "istas", "ismos", "ibles", "encia", "anzas", "antes", "ancia", "adora", "ación", "ables", "osos", "osas", "ivos", "ivas", "ista", "ismo", "idad", "icos", "icas", "ible", "anza", "ante", "ador", "able", "oso", "osa", "ivo", "iva", "ico", "ica", ) isInR1 := (word.R1start <= len(word.RS)-len(suffixRunes)) isInR2 := (word.R2start <= len(word.RS)-len(suffixRunes)) // Deal with special cases first. All of these will // return if they are hit. // switch suffix { case "": // Nothing to do return false case "amente": if isInR1 { // Delete if in R1 word.RemoveLastNRunes(len(suffixRunes)) // if preceded by iv, delete if in R2 (and if further preceded by at, // delete if in R2), otherwise, // if preceded by os, ic or ad, delete if in R2 newSuffix, _ := word.RemoveFirstSuffixIfIn(word.R2start, "iv", "os", "ic", "ad") if newSuffix == "iv" { word.RemoveFirstSuffixIfIn(word.R2start, "at") } return true } return false } // All the following cases require the found suffix // to be in R2. if isInR2 == false { return false } // Compound replacement cases. All these cases return // if they are hit. // compoundReplacement := func(otherSuffixes ...string) bool { word.RemoveLastNRunes(len(suffixRunes)) word.RemoveFirstSuffixIfIn(word.R2start, otherSuffixes...) return true } switch suffix { case "adora", "ador", "ación", "adoras", "adores", "aciones", "ante", "antes", "ancia", "ancias": return compoundReplacement("ic") case "mente": return compoundReplacement("ante", "able", "ible") case "idad", "idades": return compoundReplacement("abil", "ic", "iv") case "iva", "ivo", "ivas", "ivos": return compoundReplacement("at") } // Simple replacement & deletion cases are all that remain. // simpleReplacement := func(repl string) bool { word.ReplaceSuffixRunes(suffixRunes, []rune(repl), true) return true } switch suffix { case "logía", "logías": return simpleReplacement("log") case "ución", "uciones": return simpleReplacement("u") case "encia", "encias": return simpleReplacement("ente") case "anza", "anzas", "ico", "ica", "icos", "icas", "ismo", "ismos", "able", "ables", "ible", "ibles", "ista", "istas", "oso", "osa", "osos", "osas", "amiento", "amientos", "imiento", "imientos": word.RemoveLastNRunes(len(suffixRunes)) return true } log.Panicln("Unhandled suffix:", suffix) return false }
// Step 1b is the normalization of various "ly" and "ed" sufficies. // func step1b(w *snowballword.SnowballWord) bool { suffix, suffixRunes := w.FirstSuffix("eedly", "ingly", "edly", "ing", "eed", "ed") switch suffix { case "": // No suffix found return false case "eed", "eedly": // Replace by ee if in R1 if len(suffixRunes) <= len(w.RS)-w.R1start { w.ReplaceSuffixRunes(suffixRunes, []rune("ee"), true) } return true case "ed", "edly", "ing", "ingly": hasLowerVowel := false for i := 0; i < len(w.RS)-len(suffixRunes); i++ { if isLowerVowel(w.RS[i]) { hasLowerVowel = true break } } if hasLowerVowel { // This case requires a two-step transformation and, due // to the way we've implemented the `ReplaceSuffix` method // here, information about R1 and R2 would be lost between // the two. Therefore, we need to keep track of the // original R1 & R2, so that we may set them below, at the // end of this case. // originalR1start := w.R1start originalR2start := w.R2start // Delete if the preceding word part contains a vowel w.RemoveLastNRunes(len(suffixRunes)) // ...and after the deletion... newSuffix, newSuffixRunes := w.FirstSuffix("at", "bl", "iz", "bb", "dd", "ff", "gg", "mm", "nn", "pp", "rr", "tt") switch newSuffix { case "": // If the word is short, add "e" if isShortWord(w) { // By definition, r1 and r2 are the empty string for // short words. w.RS = append(w.RS, []rune("e")...) w.R1start = len(w.RS) w.R2start = len(w.RS) return true } case "at", "bl", "iz": // If the word ends "at", "bl" or "iz" add "e" w.ReplaceSuffixRunes(newSuffixRunes, []rune(newSuffix+"e"), true) case "bb", "dd", "ff", "gg", "mm", "nn", "pp", "rr", "tt": // If the word ends with a double remove the last letter. // Note that, "double" does not include all possible doubles, // just those shown above. // w.RemoveLastNRunes(1) } // Because we did a double replacement, we need to fix // R1 and R2 manually. This is just becase of how we've // implemented the `ReplaceSuffix` method. // rsLen := len(w.RS) if originalR1start < rsLen { w.R1start = originalR1start } else { w.R1start = rsLen } if originalR2start < rsLen { w.R2start = originalR2start } else { w.R2start = rsLen } return true } } return false }