Exemple #1
0
// removeVerbEnding looks for one of the listed verb endings inside RV and
// strips it. It reports whether an ending was removed.
//
// The "Group 1" endings are only removed when precededByARinRV accepts the
// position in front of them (they must follow а or я in RV); every other
// ending in the list is removed as soon as it is found.
func removeVerbEnding(word *snowballword.SnowballWord) bool {
	ending, endingRunes := word.FirstSuffixIn(word.RVstart, len(word.RS),
		"уйте", "ейте", "ыть", "ыло", "ыли", "ыла", "уют", "ует",
		"нно", "йте", "ишь", "ить", "ите", "ило", "или", "ила",
		"ешь", "ете", "ены", "ено", "ена", "ят", "ют", "ыт", "ым",
		"ыл", "ую", "уй", "ть", "ны", "но", "на", "ло", "ли", "ла",
		"ит", "им", "ил", "ет", "ен", "ем", "ей", "ю", "н", "л", "й",
	)
	if ending == "" {
		return false
	}
	n := len(endingRunes)

	switch ending {
	case "ла", "на", "ете", "йте", "ли", "й", "л", "ем", "н",
		"ло", "но", "ет", "ют", "ны", "ть", "ешь", "нно":
		// Group 1 ending: keep the word untouched unless the
		// preceding-letter condition holds.
		if !precededByARinRV(word, n) {
			return false
		}
	}

	word.RemoveLastNRunes(n)
	return true
}
Exemple #2
0
// findRegions returns the start offsets (in runes) of the regions R1, R2
// and RV of word.
//
// R1 and R2 come from romance.VnvSuffix. RV is chosen by the first rule
// that applies:
//
//  1. the word starts with "par", "col" or "tap": RV starts after the prefix;
//  2. the word has at least three letters and starts with two vowels:
//     RV starts after the third letter;
//  3. RV starts after the first vowel that is not the first letter;
//  4. otherwise RV is empty (it starts at the end of the word).
func findRegions(word *snowballword.SnowballWord) (r1start, r2start, rvstart int) {
	r1start = romance.VnvSuffix(word, isLowerVowel, 0)
	r2start = romance.VnvSuffix(word, isLowerVowel, r1start)

	n := len(word.RS)

	// Rule 1: the three special prefixes.
	if prefix, prefixRunes := word.FirstPrefix("par", "col", "tap"); prefix != "" {
		return r1start, r2start, len(prefixRunes)
	}

	// Rule 2: two leading vowels.
	if n >= 3 && isLowerVowel(word.RS[0]) && isLowerVowel(word.RS[1]) {
		return r1start, r2start, 3
	}

	// Rule 3: first vowel past position 0.
	for i, r := range word.RS {
		if i > 0 && isLowerVowel(r) {
			return r1start, r2start, i + 1
		}
	}

	// Rule 4: no suitable position, so RV is the empty region.
	return r1start, r2start, n
}
Exemple #3
0
// step2 removes a trailing "и" found in RV and reports whether the word
// was changed.
func step2(word *snowballword.SnowballWord) bool {
	removed, _ := word.RemoveFirstSuffixIn(word.RVstart, "и")
	return removed != ""
}
Exemple #4
0
// step5 undoubles a non-vowel ending: when the word ends in "enn", "onn",
// "ett", "ell" or "eill", the final letter is dropped.
//
// Note that the function returns false in every case, including when the
// word was shortened.
func step5(word *snowballword.SnowballWord) bool {
	if ending, _ := word.FirstSuffix("enn", "onn", "ett", "ell", "eill"); ending != "" {
		word.RemoveLastNRunes(1)
	}
	return false
}
Exemple #5
0
// preprocess computes the R1, R2 and RV region starts with findRegions
// and stores them on word.
func preprocess(word *snowballword.SnowballWord) {
	word.R1start, word.R2start, word.RVstart = findRegions(word)
}
Exemple #6
0
// step0 strips a trailing "'s'", "'s" or "'" (tried in that order) and
// reports whether anything was removed.
func step0(w *snowballword.SnowballWord) bool {
	if found, foundRunes := w.FirstSuffix("'s'", "'s", "'"); found != "" {
		w.RemoveLastNRunes(len(foundRunes))
		return true
	}
	return false
}
Exemple #7
0
// step2a handles verb suffixes beginning with "y": it looks for one of
// the listed suffixes in RV and deletes it only when the letter directly
// in front of it is "u". It reports whether the suffix was deleted.
func step2a(word *snowballword.SnowballWord) bool {
	ending, endingRunes := word.FirstSuffixIn(word.RVstart, len(word.RS), "ya", "ye", "yan", "yen", "yeron", "yendo", "yo", "yó", "yas", "yes", "yais", "yamos")
	if ending == "" {
		return false
	}

	// Position of the rune immediately before the suffix.
	before := len(word.RS) - len(endingRunes) - 1
	if before < 0 || word.RS[before] != 'u' {
		return false
	}

	word.RemoveLastNRunes(len(endingRunes))
	return true
}
Exemple #8
0
// step3 removes a derivational ending ("ост" or "ость") when it is found
// in R2, and reports whether one was removed.
func step3(word *snowballword.SnowballWord) bool {
	removed, _ := word.RemoveFirstSuffixIn(word.R2start, "ост", "ость")
	return removed != ""
}
Exemple #9
0
// r1r2 returns the start offsets of the regions R1 and R2.
//
// R1 is the region after the first non-vowel following a vowel, or the
// null region at the end of the word if there is no such non-vowel. R2 is
// defined the same way, but searching inside R1.
//
// Words beginning with "gener", "commun" or "arsen" are exceptions: for
// them R1 starts directly after that prefix.
//
// See http://snowball.tartarus.org/texts/r1r2.html
func r1r2(word *snowballword.SnowballWord) (r1start, r2start int) {
	switch prefix, _ := word.FirstPrefix("gener", "commun", "arsen"); prefix {
	case "":
		// Regular case: locate R1 from the start of the word.
		r1start = romance.VnvSuffix(word, isLowerVowel, 0)
	default:
		r1start = len(prefix)
	}
	return r1start, romance.VnvSuffix(word, isLowerVowel, r1start)
}
Exemple #10
0
// step3 removes residual suffixes.
//
// The longest of "os", "a", "o", "á", "í", "ó", "e", "é" at the end of the
// word is deleted, provided it lies in RV. When the deleted suffix was "e"
// or "é" and the word now ends in "gu", the "u" is deleted as well, but
// only if that "u" itself lies in RV. This RV condition is stated in the
// residual-suffix rule of the Snowball Spanish algorithm; earlier versions
// of this function deleted the "u" unconditionally.
//
// It returns true when a suffix was removed, false otherwise.
func step3(word *snowballword.SnowballWord) bool {
	suffix, suffixRunes := word.FirstSuffixIfIn(word.RVstart, len(word.RS),
		"os", "a", "o", "á", "í", "ó", "e", "é",
	)

	// No suffix found (or not in RV), nothing to do.
	if suffix == "" {
		return false
	}

	// Every suffix in the list is deleted.
	word.RemoveLastNRunes(len(suffixRunes))

	if suffix == "e" || suffix == "é" {
		// The "u" of a trailing "gu" is the last rune of the word, so it
		// is in RV exactly when RV starts at or before that last index.
		guSuffix, _ := word.FirstSuffix("gu")
		if guSuffix != "" && word.RVstart <= len(word.RS)-1 {
			word.RemoveLastNRunes(1)
		}
	}
	return true
}
Exemple #11
0
// preprocess prepares word for the subsequent stemming steps. In order it:
//
//   - normalizes apostrophes and trims leading ones,
//   - capitalizes every "y" that is not behaving as a vowel,
//   - computes the regions R1 and R2 and stores their starts on word.
func preprocess(word *snowballword.SnowballWord) {
	normalizeApostrophes(word)
	trimLeftApostrophes(word)
	capitalizeYs(word)

	// The regions must be computed last, on the cleaned-up word.
	word.R1start, word.R2start = r1r2(word)
}
Exemple #12
0
// trimLeftApostrophes removes every leading apostrophe from word and
// shifts R1start and R2start left by the number of runes removed.
// (Slight variation from the NLTK implementation, in which only the
// first apostrophe is removed.)
//
// A word made up entirely of apostrophes is emptied. The previous
// range-based loop left its counter on the last index in that case, so one
// apostrophe always survived and a lone "'" was not trimmed at all.
// Region starts are clamped at zero so they never become negative.
func trimLeftApostrophes(word *snowballword.SnowballWord) {
	// Count the leading apostrophes; the count equals the index of the
	// first other rune, or len(word.RS) if there is none.
	numApostrophes := 0
	for numApostrophes < len(word.RS) && word.RS[numApostrophes] == '\'' {
		numApostrophes++
	}
	if numApostrophes == 0 {
		return
	}

	word.RS = word.RS[numApostrophes:]

	word.R1start -= numApostrophes
	if word.R1start < 0 {
		word.R1start = 0
	}
	word.R2start -= numApostrophes
	if word.R2start < 0 {
		word.R2start = 0
	}
}
Exemple #13
0
// removePerfectiveGerundEnding strips a perfective gerund ending found in
// RV and reports whether one was removed.
//
// The "Group 1" endings ("в", "вши", "вшись") are only removed when
// precededByARinRV accepts the position in front of them (they must follow
// а or я in RV); the remaining endings are removed unconditionally.
func removePerfectiveGerundEnding(word *snowballword.SnowballWord) bool {
	ending, endingRunes := word.FirstSuffixIn(word.RVstart, len(word.RS),
		"ившись", "ывшись", "вшись", "ивши", "ывши", "вши", "ив", "ыв", "в",
	)
	if ending == "" {
		return false
	}
	n := len(endingRunes)

	switch ending {
	case "в", "вши", "вшись":
		// Group 1 ending: leave the word alone unless the
		// preceding-letter condition holds.
		if !precededByARinRV(word, n) {
			return false
		}
	}

	word.RemoveLastNRunes(n)
	return true
}
Exemple #14
0
// removeAdjectivalEnding removes an adjective ending found in RV and then,
// optionally, a participle ending in front of it. It reports whether an
// adjective ending was removed; the participle part alone never changes
// the result.
func removeAdjectivalEnding(word *snowballword.SnowballWord) bool {

	// An adjective ending is mandatory; without one nothing happens.
	adjective, _ := word.RemoveFirstSuffixIn(word.RVstart,
		"ими", "ыми", "его", "ого", "ему", "ому", "ее", "ие",
		"ые", "ое", "ей", "ий", "ый", "ой", "ем", "им", "ым",
		"ом", "их", "ых", "ую", "юю", "ая", "яя", "ою", "ею",
	)
	if adjective == "" {
		return false
	}

	// Look for an optional participle ending on what is left.
	participle, participleRunes := word.FirstSuffixIn(word.RVstart, len(word.RS),
		"ивш", "ывш", "ующ",
		"ем", "нн", "вш", "ющ", "щ",
	)
	switch participle {
	case "":
		return true
	case "ем", "нн", "вш", "ющ", "щ":
		// "Group 1" participle endings must follow а or я in RV,
		// otherwise they are kept.
		if !precededByARinRV(word, len(participleRunes)) {
			return true
		}
	}

	word.RemoveLastNRunes(len(participleRunes))
	return true
}
Exemple #15
0
// step1 removes standard suffixes, all of which must occur in RV.
//
// A PERFECTIVE GERUND ending is tried first; if one is removed the step is
// over. Otherwise a REFLEXIVE ending ("ся" / "сь") is removed if present,
// and then an ADJECTIVAL, a VERB and finally a NOUN ending are tried in
// that order, stopping at the first one that is removed.
//
// The result is true when a perfective gerund, adjectival, verb or noun
// ending was removed. Removing only a reflexive ending yields false.
func step1(word *snowballword.SnowballWord) bool {
	if removePerfectiveGerundEnding(word) {
		return true
	}

	// The reflexive ending is dropped unconditionally; its result does
	// not influence the return value.
	word.RemoveFirstSuffixIn(word.RVstart, "ся", "сь")

	// Short-circuit evaluation keeps the adjectival-before-verb order.
	if removeAdjectivalEnding(word) || removeVerbEnding(word) {
		return true
	}

	noun, _ := word.RemoveFirstSuffixIn(word.RVstart,
		"иями", "ями", "иях", "иям", "ием", "ией", "ами", "ях",
		"ям", "ья", "ью", "ье", "ом", "ой", "ов", "ия", "ию",
		"ий", "ии", "ие", "ем", "ей", "еи", "ев", "ах", "ам",
		"я", "ю", "ь", "ы", "у", "о", "й", "и", "е", "а",
	)
	return noun != ""
}
Exemple #16
0
// step5 stems a final "e" or a final double "l".
//
//   - A trailing "e" is deleted if it is in R2, or if it is in R1 and
//     endsShortSyllable reports false for it.
//   - A trailing "l" is deleted if it is in R2 and preceded by another "l".
//
// It reports whether the word was changed.
func step5(w *snowballword.SnowballWord) bool {
	last := len(w.RS) - 1

	// An empty R1 implies an empty R2, so there is nothing to do.
	if w.R1start > last {
		return false
	}

	switch w.RS[last] {
	case 'e':
		if w.R2start <= last || !endsShortSyllable(w, last) {
			w.ReplaceSuffix("e", "", true)
			return true
		}
	case 'l':
		if w.R2start <= last && last >= 1 && w.RS[last-1] == 'l' {
			// Drop the second "l" of the pair.
			w.ReplaceSuffix("l", "", true)
			return true
		}
	}
	return false
}
Exemple #17
0
// step2b removes the verb suffixes listed below when one of them is found
// in RV. The matched suffix is always deleted; for "en", "es", "éis" and
// "emos" a "u" left at the end as part of "gu" is deleted too (the "gu"
// need not be in RV).
//
// It reports whether a suffix was found and removed.
func step2b(word *snowballword.SnowballWord) bool {
	ending, endingRunes := word.FirstSuffixIn(word.RVstart, len(word.RS),
		"iésemos", "iéramos", "iríamos", "eríamos", "aríamos", "ásemos",
		"áramos", "ábamos", "isteis", "iríais", "iremos", "ieseis",
		"ierais", "eríais", "eremos", "asteis", "aríais", "aremos",
		"íamos", "irías", "irían", "iréis", "ieses", "iesen", "ieron",
		"ieras", "ieran", "iendo", "erías", "erían", "eréis", "aseis",
		"arías", "arían", "aréis", "arais", "abais", "íais", "iste",
		"iría", "irás", "irán", "imos", "iese", "iera", "idos", "idas",
		"ería", "erás", "erán", "aste", "ases", "asen", "aría", "arás",
		"arán", "aron", "aras", "aran", "ando", "amos", "ados", "adas",
		"abas", "aban", "ías", "ían", "éis", "áis", "iré", "irá", "ido",
		"ida", "eré", "erá", "emos", "ase", "aré", "ará", "ara", "ado",
		"ada", "aba", "ís", "ía", "ió", "ir", "id", "es", "er", "en",
		"ed", "as", "ar", "an", "ad",
	)
	if ending == "" {
		return false
	}

	// Every listed suffix is deleted.
	word.RemoveLastNRunes(len(endingRunes))

	switch ending {
	case "en", "es", "éis", "emos":
		// After the deletion, a trailing "gu" loses its "u".
		if gu, _ := word.FirstSuffix("gu"); gu != "" {
			word.RemoveLastNRunes(1)
		}
	}
	return true
}
Exemple #18
0
// step1a normalizes various "s"-endings:
//
//   - "sses" becomes "ss";
//   - "ies" / "ied" become "i" when preceded by more than one letter,
//     otherwise "ie" (so ties -> tie, cries -> cri);
//   - "us" and "ss" are left alone;
//   - a plain "s" is deleted when the preceding part of the word contains
//     a vowel that is not immediately before the "s" (gas and this keep
//     the s, gaps and kiwis lose it).
//
// It reports whether the word was changed.
func step1a(w *snowballword.SnowballWord) bool {
	ending, endingRunes := w.FirstSuffix("sses", "ied", "ies", "us", "ss", "s")

	switch ending {
	case "sses":
		w.ReplaceSuffixRunes(endingRunes, []rune("ss"), true)
		return true

	case "ies", "ied":
		// The suffix is three runes long, so a word longer than four
		// runes has more than one letter in front of it.
		replacement := "ie"
		if len(w.RS) > 4 {
			replacement = "i"
		}
		w.ReplaceSuffixRunes(endingRunes, []rune(replacement), true)
		return true

	case "us", "ss":
		// Deliberately untouched.

	case "s":
		// Only runes before the one directly preceding the "s" count.
		limit := len(w.RS) - 2
		for i, r := range w.RS {
			if i >= limit {
				break
			}
			if isLowerVowel(r) {
				w.RemoveLastNRunes(len(endingRunes))
				return true
			}
		}
	}
	return false
}
Exemple #19
0
// step3 stems various longer suffixes found in R1.
//
// The first listed suffix that ends the word is considered; if it does not
// fit inside R1 nothing happens. "ative" is deleted, but only when it is
// also inside R2. Every other suffix is replaced:
//
//	ational -> ate    tional -> tion    alize -> al
//	icate, iciti, ical -> ic            ful, ness -> (deleted)
//
// It reports whether the word was changed.
func step3(w *snowballword.SnowballWord) bool {
	ending, endingRunes := w.FirstSuffix(
		"ational", "tional", "alize", "icate", "ative",
		"iciti", "ical", "ful", "ness",
	)

	// Nothing found, or the suffix is not entirely within R1.
	if ending == "" || len(endingRunes) > len(w.RS)-w.R1start {
		return false
	}

	var replacement string
	switch ending {
	case "ative":
		// "ative" is five runes long; it must fit inside R2.
		if len(w.RS)-w.R2start < 5 {
			return false
		}
		w.RemoveLastNRunes(len(endingRunes))
		return true
	case "ational":
		replacement = "ate"
	case "tional":
		replacement = "tion"
	case "alize":
		replacement = "al"
	case "icate", "iciti", "ical":
		replacement = "ic"
	case "ful", "ness":
		replacement = ""
	}

	w.ReplaceSuffixRunes(endingRunes, []rune(replacement), true)
	return true
}
Exemple #20
0
// step4 deletes one of the following suffixes when it lies in R2:
//
//	al, ance, ence, er, ic, able, ible, ant, ement, ment,
//	ent, ism, ate, iti, ous, ive, ize
//
// The suffix "ion" is deleted only if, in addition, it is preceded by
// "s" or "t".
//
// It reports whether a suffix was deleted.
func step4(w *snowballword.SnowballWord) bool {
	ending, endingRunes := w.FirstSuffix(
		"ement", "ance", "ence", "able", "ible", "ment",
		"ent", "ant", "ism", "ate", "iti", "ous", "ive",
		"ize", "ion", "al", "er", "ic",
	)

	// Nothing found, or the suffix does not fit in R2.
	if ending == "" || len(endingRunes) > len(w.RS)-w.R2start {
		return false
	}

	if ending == "ion" {
		// Index of the rune directly in front of "ion".
		before := len(w.RS) - 4
		if before < 0 || (w.RS[before] != 's' && w.RS[before] != 't') {
			return false
		}
	}

	w.RemoveLastNRunes(len(endingRunes))
	return true
}
Exemple #21
0
// step2a removes verb suffixes beginning with "i" in the RV region.
//
// The first listed suffix found in RV is deleted only when the rune in
// front of it is a non-vowel and that rune is itself inside RV. It
// reports whether the suffix was deleted.
func step2a(word *snowballword.SnowballWord) bool {
	ending, endingRunes := word.FirstSuffixIn(word.RVstart, len(word.RS),
		"issantes", "issaIent", "issions", "issants", "issante",
		"iraIent", "issons", "issiez", "issent", "issant", "issait",
		"issais", "irions", "issez", "isses", "iront", "irons", "iriez",
		"irent", "irait", "irais", "îtes", "îmes", "isse", "irez",
		"iras", "irai", "ira", "ies", "ît", "it", "is", "ir", "ie", "i",
	)
	if ending == "" {
		return false
	}

	n := len(endingRunes)
	before := len(word.RS) - n - 1

	// The preceding rune must exist, lie in RV, and not be a vowel.
	if before < 0 || !word.FitsInRV(n+1) || isLowerVowel(word.RS[before]) {
		return false
	}

	word.RemoveLastNRunes(n)
	return true
}
Exemple #22
0
// step1 removes standard suffixes.
//
// The first listed suffix (longest first) that ends the word is handled
// as follows:
//
//   - "amente" is deleted if in R1; then a preceding "iv", "os", "ic" or
//     "ad" is deleted if in R2, and after "iv" a further "at" if in R2.
//   - All remaining suffixes must lie in R2, otherwise nothing happens.
//   - Some are deleted and then followed by the deletion of one more
//     suffix in R2 ("ic", "ante"/"able"/"ible", "abil"/"ic"/"iv", "at").
//   - "logía(s)" -> "log", "ución"/"uciones" -> "u", "encia(s)" -> "ente".
//   - The rest are simply deleted.
//
// It reports whether the word was changed.
func step1(word *snowballword.SnowballWord) bool {

	// Possible suffixes, longest first
	suffix, suffixRunes := word.FirstSuffix(
		"amientos", "imientos", "aciones", "amiento", "imiento",
		"uciones", "logías", "idades", "encias", "ancias", "amente",
		"adores", "adoras", "ución", "mente", "logía", "istas",
		"ismos", "ibles", "encia", "anzas", "antes", "ancia",
		"adora", "ación", "ables", "osos", "osas", "ivos", "ivas",
		"ista", "ismo", "idad", "icos", "icas", "ible", "anza",
		"ante", "ador", "able", "oso", "osa", "ivo", "iva",
		"ico", "ica",
	)
	if suffix == "" {
		return false
	}

	n := len(suffixRunes)

	// Index at which the matched suffix begins.
	suffixStart := len(word.RS) - n

	if suffix == "amente" {
		if word.R1start > suffixStart {
			return false
		}
		word.RemoveLastNRunes(n)
		if inner, _ := word.RemoveFirstSuffixIfIn(word.R2start, "iv", "os", "ic", "ad"); inner == "iv" {
			word.RemoveFirstSuffixIfIn(word.R2start, "at")
		}
		return true
	}

	// Everything else requires the suffix to be in R2.
	if word.R2start > suffixStart {
		return false
	}

	switch suffix {

	// Deletion followed by the deletion of one more suffix in R2.
	case "adora", "ador", "ación", "adoras", "adores", "aciones", "ante", "antes", "ancia", "ancias":
		word.RemoveLastNRunes(n)
		word.RemoveFirstSuffixIfIn(word.R2start, "ic")
	case "mente":
		word.RemoveLastNRunes(n)
		word.RemoveFirstSuffixIfIn(word.R2start, "ante", "able", "ible")
	case "idad", "idades":
		word.RemoveLastNRunes(n)
		word.RemoveFirstSuffixIfIn(word.R2start, "abil", "ic", "iv")
	case "iva", "ivo", "ivas", "ivos":
		word.RemoveLastNRunes(n)
		word.RemoveFirstSuffixIfIn(word.R2start, "at")

	// Plain replacements.
	case "logía", "logías":
		word.ReplaceSuffixRunes(suffixRunes, []rune("log"), true)
	case "ución", "uciones":
		word.ReplaceSuffixRunes(suffixRunes, []rune("u"), true)
	case "encia", "encias":
		word.ReplaceSuffixRunes(suffixRunes, []rune("ente"), true)

	// Plain deletions.
	case "anza", "anzas", "ico", "ica", "icos", "icas",
		"ismo", "ismos", "able", "ables", "ible", "ibles",
		"ista", "istas", "oso", "osa", "osos", "osas",
		"amiento", "amientos", "imiento", "imientos":
		word.RemoveLastNRunes(n)

	default:
		// Every suffix in the search list is covered above, so this
		// indicates the two lists have drifted apart.
		log.Panicln("Unhandled suffix:", suffix)
		return false
	}
	return true
}
Exemple #23
0
// Step 1b is the normalization of various "ly" and "ed" suffixes.
//
// The first of "eedly", "ingly", "edly", "ing", "eed", "ed" that ends the
// word is handled:
//
//   - "eed" / "eedly": replaced by "ee" if the suffix is in R1. This branch
//     returns true whether or not the replacement happened.
//   - "ed" / "edly" / "ing" / "ingly": deleted if the part of the word in
//     front of the suffix contains a vowel, followed by a clean-up of the
//     new word ending (see below). Returns false if there is no such vowel.
//
// Returns false when no suffix is found.
//
func step1b(w *snowballword.SnowballWord) bool {

	suffix, suffixRunes := w.FirstSuffix("eedly", "ingly", "edly", "ing", "eed", "ed")

	switch suffix {

	case "":
		// No suffix found
		return false

	case "eed", "eedly":

		// Replace by ee if in R1
		if len(suffixRunes) <= len(w.RS)-w.R1start {
			w.ReplaceSuffixRunes(suffixRunes, []rune("ee"), true)
		}
		return true

	case "ed", "edly", "ing", "ingly":
		// Look for a vowel anywhere in front of the suffix.
		hasLowerVowel := false
		for i := 0; i < len(w.RS)-len(suffixRunes); i++ {
			if isLowerVowel(w.RS[i]) {
				hasLowerVowel = true
				break
			}
		}
		if hasLowerVowel {

			// This case requires a two-step transformation and, due
			// to the way we've implemented the `ReplaceSuffix` method
			// here, information about R1 and R2 would be lost between
			// the two.  Therefore, we need to keep track of the
			// original R1 & R2, so that we may set them below, at the
			// end of this case.
			//
			originalR1start := w.R1start
			originalR2start := w.R2start

			// Delete if the preceding word part contains a vowel
			w.RemoveLastNRunes(len(suffixRunes))

			// ...and after the deletion, inspect the new word ending.

			newSuffix, newSuffixRunes := w.FirstSuffix("at", "bl", "iz", "bb", "dd", "ff", "gg", "mm", "nn", "pp", "rr", "tt")
			switch newSuffix {

			case "":

				// If the word is short, add "e"
				if isShortWord(w) {

					// By definition, r1 and r2 are the empty string for
					// short words, so both regions are set to start at
					// the end of the word and we return right away
					// (skipping the R1/R2 restoration below).
					w.RS = append(w.RS, []rune("e")...)
					w.R1start = len(w.RS)
					w.R2start = len(w.RS)
					return true
				}

			case "at", "bl", "iz":

				// If the word ends "at", "bl" or "iz" add "e"
				w.ReplaceSuffixRunes(newSuffixRunes, []rune(newSuffix+"e"), true)

			case "bb", "dd", "ff", "gg", "mm", "nn", "pp", "rr", "tt":

				// If the word ends with a double remove the last letter.
				// Note that, "double" does not include all possible doubles,
				// just those shown above.
				//
				w.RemoveLastNRunes(1)
			}

			// Because we did a double replacement, we need to fix
			// R1 and R2 manually. This is just because of how we've
			// implemented the `ReplaceSuffix` method. Each region start
			// is restored to its original value, capped at the new
			// word length.
			//
			rsLen := len(w.RS)
			if originalR1start < rsLen {
				w.R1start = originalR1start
			} else {
				w.R1start = rsLen
			}
			if originalR2start < rsLen {
				w.R2start = originalR2start
			} else {
				w.R2start = rsLen
			}

			return true
		}

	}

	return false
}
Exemple #24
0
// step4 undoubles a double "н" ending and removes superlative endings.
// Exactly one of the following is applied, in this order of precedence:
//
//  1. the word ends in "нн": drop the last letter;
//  2. the word ends with a superlative ending ("ейше" or "ейш"): remove it
//     and, if the result ends in "нн", drop the last letter;
//  3. the word ends with the soft sign "ь": remove it.
//
// It reports whether the word was changed.
func step4(word *snowballword.SnowballWord) bool {
	endsWithDoubleN := func() bool {
		return word.HasSuffixRunes([]rune("нн"))
	}

	// (1) Undouble "н".
	if endsWithDoubleN() {
		word.RemoveLastNRunes(1)
		return true
	}

	// (2) Superlative ending, then undouble "н" if needed.
	if superlative, _ := word.RemoveFirstSuffix("ейше", "ейш"); superlative != "" {
		if endsWithDoubleN() {
			word.RemoveLastNRunes(1)
		}
		return true
	}

	// (3) Trailing soft sign.
	if last := len(word.RS) - 1; last >= 0 && word.RS[last] == 'ь' {
		word.RemoveLastNRunes(1)
		return true
	}
	return false
}
Exemple #25
0
// step2 stems various endings found in R1, including "al", "ness" and
// "li".
//
// The first listed suffix (longest first) that ends the word is used; if
// it is not entirely within R1 nothing happens. Two suffixes get special
// treatment:
//
//   - "li" is deleted only when preceded by one of c d e g h k m n r t;
//   - "ogi" is replaced by "og" only when preceded by "l". This branch
//     returns true even when no replacement was made.
//
// Every other suffix is replaced by a fixed, shorter ending. It reports
// whether the step applied.
func step2(w *snowballword.SnowballWord) bool {

	// Possible suffixes for this step, longest first.
	ending, endingRunes := w.FirstSuffix(
		"ational", "fulness", "iveness", "ization", "ousness",
		"biliti", "lessli", "tional", "alism", "aliti", "ation",
		"entli", "fulli", "iviti", "ousli", "anci", "abli",
		"alli", "ator", "enci", "izer", "bli", "ogi", "li",
	)

	// Nothing found, or the suffix is not entirely within R1.
	if ending == "" || len(endingRunes) > len(w.RS)-w.R1start {
		return false
	}

	var replacement string
	switch ending {

	case "li":
		// Index of the rune in front of "li".
		if before := len(w.RS) - 3; before >= 0 {
			switch w.RS[before] {
			case 'c', 'd', 'e', 'g', 'h', 'k', 'm', 'n', 'r', 't':
				w.RemoveLastNRunes(len(endingRunes))
				return true
			}
		}
		return false

	case "ogi":
		// Index of the rune in front of "ogi".
		if before := len(w.RS) - 4; before >= 0 && w.RS[before] == 'l' {
			w.ReplaceSuffixRunes(endingRunes, []rune("og"), true)
		}
		return true

	// Plain suffix-for-suffix replacements.
	case "ational", "ation", "ator":
		replacement = "ate"
	case "tional":
		replacement = "tion"
	case "alism", "aliti", "alli":
		replacement = "al"
	case "fulness", "fulli":
		replacement = "ful"
	case "ousness", "ousli":
		replacement = "ous"
	case "iveness", "iviti":
		replacement = "ive"
	case "ization", "izer":
		replacement = "ize"
	case "biliti", "bli":
		replacement = "ble"
	case "lessli":
		replacement = "less"
	case "entli":
		replacement = "ent"
	case "enci":
		replacement = "ence"
	case "anci":
		replacement = "ance"
	case "abli":
		replacement = "able"
	}

	w.ReplaceSuffixRunes(endingRunes, []rune(replacement), true)
	return true
}
Exemple #26
0
// step2b removes verb suffixes in RV that do not begin with "i".
//
// The first listed suffix found in RV falls into one of three groups:
//
//   - "ions": deleted only if it fits in R2;
//   - the é/e-initial group ("é", "er", "ez", ...): deleted;
//   - the a/â-initial group ("a", "ant", "âmes", ...): deleted, and if the
//     word then ends in an "e" that lies in RV, that "e" is deleted too.
//
// It reports whether a suffix was deleted.
func step2b(word *snowballword.SnowballWord) bool {
	ending, endingRunes := word.FirstSuffixIn(word.RVstart, len(word.RS),
		"eraIent", "assions", "erions", "assiez", "assent",
		"èrent", "eront", "erons", "eriez", "erait", "erais",
		"asses", "antes", "aIent", "âtes", "âmes", "ions",
		"erez", "eras", "erai", "asse", "ants", "ante", "ées",
		"iez", "era", "ant", "ait", "ais", "és", "ée", "ât",
		"ez", "er", "as", "ai", "é", "a",
	)
	n := len(endingRunes)

	switch ending {
	case "ions":
		if !word.FitsInR2(n) {
			return false
		}
		word.RemoveLastNRunes(n)
		return true

	case "é", "ée", "ées", "és", "èrent", "er", "era",
		"erai", "eraIent", "erais", "erait", "eras", "erez",
		"eriez", "erions", "erons", "eront", "ez", "iez":
		word.RemoveLastNRunes(n)
		return true

	case "âmes", "ât", "âtes", "a", "ai", "aIent",
		"ais", "ait", "ant", "ante", "antes", "ants", "as",
		"asse", "assent", "asses", "assiez", "assions":
		word.RemoveLastNRunes(n)

		// A now-final "e" is removed as well when it lies in RV.
		if last := len(word.RS) - 1; last >= 0 && word.RS[last] == 'e' && word.FitsInRV(1) {
			word.RemoveLastNRunes(1)
		}
		return true
	}

	// No suffix from the list was found in RV.
	return false
}
Exemple #27
0
// printDebug logs w.DebugString() through the standard logger when debug
// is true, and does nothing otherwise.
func printDebug(debug bool, w *snowballword.SnowballWord) {
	if !debug {
		return
	}
	log.Println(w.DebugString())
}
Exemple #28
0
// step4 cleans up residual suffixes.
//
// First, a final "s" is deleted unless it is preceded by a, i, o, u, è or
// s (in which case the step ends and false is returned; no suffix below
// could match a word that still ends in "s").
//
// Then the first of "Ière", "ière", "Ier", "ier", "ion", "e", "ë" found in
// RV is handled:
//
//   - "ion": deleted if in R2 and preceded by "s" or "t" in RV;
//   - "ier", "ière", "Ier", "Ière": replaced by "i";
//   - "e": deleted;
//   - "ë": deleted if preceded by "gu".
//
// It reports whether the word was changed by either part.
//
// This version no longer writes a log line for the literal word
// "voudrion"; that was leftover debugging output.
func step4(word *snowballword.SnowballWord) bool {

	hadChange := false

	// If the word ends s, not preceded by a, i, o, u, è or s, delete it.
	if idx := len(word.RS) - 1; idx >= 1 && word.RS[idx] == 's' {
		switch word.RS[idx-1] {
		case 'a', 'i', 'o', 'u', 'è', 's':
			// The "s" stays, and nothing below can apply.
			return false
		default:
			word.RemoveLastNRunes(1)
			hadChange = true
		}
	}

	// Note: all the following are restricted to the RV region.
	suffix, suffixRunes := word.FirstSuffixIn(word.RVstart, len(word.RS),
		"Ière", "ière", "Ier", "ier", "ion", "e", "ë",
	)

	switch suffix {
	case "ion":

		// Delete if in R2 and preceded by s or t in RV.
		const sLen int = 3 // equivalently, len(suffixRunes)
		idx := len(word.RS) - sLen - 1
		if word.FitsInR2(sLen) && idx >= 0 && word.FitsInRV(sLen+1) {
			if word.RS[idx] == 's' || word.RS[idx] == 't' {
				word.RemoveLastNRunes(sLen)
				return true
			}
		}

	case "ier", "ière", "Ier", "Ière":
		word.ReplaceSuffixRunes(suffixRunes, []rune("i"), true)
		return true

	case "e":
		word.RemoveLastNRunes(1)
		return true

	case "ë":

		// Delete if preceded by "gu".
		idx := len(word.RS) - 1
		if idx >= 2 && word.RS[idx-2] == 'g' && word.RS[idx-1] == 'u' {
			word.RemoveLastNRunes(1)
			return true
		}
	}

	// No suffix matched, or its condition was not met: the result
	// depends only on whether the final "s" was removed above.
	return hadChange
}
Exemple #29
0
// Step 1 is the removal of standard suffixes.
//
// The first listed suffix (longest first) that ends the word is looked up
// and handled according to the region (R1, R2 or RV) it lies in.
//
// The return value is true when the word was changed by a regular suffix
// rule. The "amment", "emment", "ment" and "ments" rules deliberately
// return false even after changing the word, because steps 2a and 2b must
// still run for them (see the note inside the function).
//
func step1(word *snowballword.SnowballWord) bool {
	suffix, suffixRunes := word.FirstSuffix(
		"issements", "issement", "atrices", "utions", "usions", "logies",
		"emment", "ements", "atrice", "ations", "ateurs", "amment", "ution",
		"usion", "ments", "logie", "istes", "ismes", "iqUes", "euses",
		"ences", "ement", "ation", "ateur", "ances", "ables", "ment",
		"ités", "iste", "isme", "iqUe", "euse", "ence", "eaux", "ance",
		"able", "ives", "ité", "eux", "aux", "ive", "ifs", "if",
	)

	// No suffix found, nothing to do.
	if suffix == "" {
		return false
	}

	// Whether the whole suffix lies inside each region: the region must
	// start at or before the first rune of the suffix.
	isInR1 := (word.R1start <= len(word.RS)-len(suffixRunes))
	isInR2 := (word.R2start <= len(word.RS)-len(suffixRunes))
	isInRV := (word.RVstart <= len(word.RS)-len(suffixRunes))

	// Handle simple replacements & deletions in R2 first
	if isInR2 {

		// Handle simple replacements in R2
		repl := ""
		switch suffix {
		case "logie", "logies":
			repl = "log"
		case "usion", "ution", "usions", "utions":
			repl = "u"
		case "ence", "ences":
			repl = "ent"
		}
		if repl != "" {
			word.ReplaceSuffixRunes(suffixRunes, []rune(repl), true)
			return true
		}

		// Handle simple deletions in R2
		switch suffix {
		case "ance", "iqUe", "isme", "able", "iste", "eux", "ances", "iqUes", "ismes", "ables", "istes":
			word.RemoveLastNRunes(len(suffixRunes))
			return true
		}
	}

	// Handle simple replacements in RV
	if isInRV {

		// NOTE: these are "special" suffixes in that
		// we must still do steps 2a and 2b of the
		// French stemmer even when these suffixes are
		// found in step1.  Therefore, we are returning
		// `false` here.

		repl := ""
		switch suffix {
		case "amment":
			repl = "ant"
		case "emment":
			repl = "ent"
		}
		if repl != "" {
			word.ReplaceSuffixRunes(suffixRunes, []rune(repl), true)
			return false
		}

		// Delete if preceded by a vowel that is also in RV
		if suffix == "ment" || suffix == "ments" {
			idx := len(word.RS) - len(suffixRunes) - 1
			if idx >= word.RVstart && isLowerVowel(word.RS[idx]) {
				word.RemoveLastNRunes(len(suffixRunes))
				return false
			}
			return false
		}
	}

	// Handle all the other "special" cases.  All of these
	// return true immediately after changing the word.
	//
	switch suffix {
	case "eaux":

		// Replace with eau (no region requirement)
		word.ReplaceSuffixRunes(suffixRunes, []rune("eau"), true)
		return true

	case "aux":

		// Replace with al if in R1
		if isInR1 {
			word.ReplaceSuffixRunes(suffixRunes, []rune("al"), true)
			return true
		}

	case "euse", "euses":

		// Delete if in R2, else replace by eux if in R1
		if isInR2 {
			word.RemoveLastNRunes(len(suffixRunes))
			return true
		} else if isInR1 {
			word.ReplaceSuffixRunes(suffixRunes, []rune("eux"), true)
			return true
		}

	case "issement", "issements":

		// Delete if in R1 and preceded by a non-vowel
		if isInR1 {
			idx := len(word.RS) - len(suffixRunes) - 1
			if idx >= 0 && isLowerVowel(word.RS[idx]) == false {
				word.RemoveLastNRunes(len(suffixRunes))
				return true
			}
		}
		return false

	case "atrice", "ateur", "ation", "atrices", "ateurs", "ations":

		// Delete if in R2
		if isInR2 {
			word.RemoveLastNRunes(len(suffixRunes))

			// If preceded by "ic", delete if in R2, else replace by "iqU".
			newSuffix, newSuffixRunes := word.FirstSuffix("ic")
			if newSuffix != "" {
				if word.FitsInR2(len(newSuffixRunes)) {
					word.RemoveLastNRunes(len(newSuffixRunes))
				} else {
					word.ReplaceSuffixRunes(newSuffixRunes, []rune("iqU"), true)
				}
			}
			return true
		}

	case "ement", "ements":

		if isInRV {

			// Delete if in RV
			word.RemoveLastNRunes(len(suffixRunes))

			// The checks below are alternatives tried in order; the
			// first one whose preceding suffix is present ends the step.

			// If preceded by "iv", delete if in R2
			// (and if further preceded by "at", delete if in R2)
			newSuffix, newSuffixRunes := word.RemoveFirstSuffixIfIn(word.R2start, "iv")
			if newSuffix != "" {
				word.RemoveFirstSuffixIfIn(word.R2start, "at")
				return true
			}

			// If preceded by "eus", delete if in R2, else replace by "eux" if in R1
			newSuffix, newSuffixRunes = word.FirstSuffix("eus")
			if newSuffix != "" {
				newSuffixLen := len(newSuffixRunes)
				if word.FitsInR2(newSuffixLen) {
					word.RemoveLastNRunes(newSuffixLen)
				} else if word.FitsInR1(newSuffixLen) {
					word.ReplaceSuffixRunes(newSuffixRunes, []rune("eux"), true)
				}
				return true
			}

			// If preceded by abl or iqU, delete if in R2 (otherwise it
			// is left in place).
			newSuffix, newSuffixRunes = word.FirstSuffix("abl", "iqU")
			if newSuffix != "" {
				newSuffixLen := len(newSuffixRunes)
				if word.FitsInR2(newSuffixLen) {
					word.RemoveLastNRunes(newSuffixLen)
				}
				return true
			}

			// If preceded by ièr or Ièr, replace by i if in RV
			newSuffix, newSuffixRunes = word.FirstSuffix("ièr", "Ièr")
			if newSuffix != "" {
				if word.FitsInRV(len(newSuffixRunes)) {
					word.ReplaceSuffixRunes(newSuffixRunes, []rune("i"), true)
				}
				return true
			}

			// None of the follow-up suffixes was present; the word was
			// still changed by the deletion above.
			return true
		}

	case "ité", "ités":

		if isInR2 {

			// Delete if in R2
			word.RemoveLastNRunes(len(suffixRunes))

			// If preceded by "abil", delete if in R2, else replace by "abl"
			newSuffix, newSuffixRunes := word.FirstSuffix("abil")
			if newSuffix != "" {
				newSuffixLen := len(newSuffixRunes)
				if word.FitsInR2(newSuffixLen) {
					word.RemoveLastNRunes(newSuffixLen)
				} else {
					word.ReplaceSuffixRunes(newSuffixRunes, []rune("abl"), true)
				}
				return true
			}

			// If preceded by "ic", delete if in R2, else replace by "iqU"
			newSuffix, newSuffixRunes = word.FirstSuffix("ic")
			if newSuffix != "" {
				newSuffixLen := len(newSuffixRunes)
				if word.FitsInR2(newSuffixLen) {
					word.RemoveLastNRunes(newSuffixLen)
				} else {
					word.ReplaceSuffixRunes(newSuffixRunes, []rune("iqU"), true)
				}
				return true
			}

			// If preceded by "iv", delete if in R2. The returned values
			// are not inspected: the step reports a change either way.
			newSuffix, newSuffixRunes = word.RemoveFirstSuffixIfIn(word.R2start, "iv")
			return true
		}
	case "if", "ive", "ifs", "ives":

		if isInR2 {

			// Delete if in R2
			word.RemoveLastNRunes(len(suffixRunes))

			// If preceded by at, delete if in R2
			newSuffix, newSuffixRunes := word.RemoveFirstSuffixIfIn(word.R2start, "at")
			if newSuffix != "" {

				// And if further preceded by ic, delete if in R2, else replace by iqU
				newSuffix, newSuffixRunes = word.FirstSuffix("ic")
				if newSuffix != "" {
					newSuffixLen := len(newSuffixRunes)
					if word.FitsInR2(newSuffixLen) {
						word.RemoveLastNRunes(newSuffixLen)
					} else {
						word.ReplaceSuffixRunes(newSuffixRunes, []rune("iqU"), true)
					}
				}
			}
			return true

		}
	}

	// The suffix was found but its region condition was not met.
	return false
}
Exemple #30
0
// step0 removes an attached pronoun.
//
// The first of the listed pronoun suffixes ("selas", "selos", ..., "lo")
// found in RV is removed, but only when it comes directly after one of the
// following endings, which must themselves lie in RV:
//
//	(a) iéndo, ándo, ár, ér, ír: the pronoun is removed and the acute
//	    accent is dropped as well (e.g. haciéndola -> haciendo);
//	(b) ando, iendo, ar, er, ir: the pronoun is removed;
//	(c) yendo: the pronoun is removed only if "yendo" is immediately
//	    preceded by "u" (that "u" need not be in RV).
//
// It reports whether the word was changed.
//
// Fixes relative to the earlier version of this function:
//   - "ér" now becomes "er"; it used to be replaced by the empty string
//     because its case was missing from the replacement table.
//   - For "yendo" the "u" must directly precede it, as in the Snowball
//     Spanish algorithm, rather than appear anywhere earlier in the word.
//   - Region arithmetic uses rune counts instead of byte lengths.
func step0(word *snowballword.SnowballWord) bool {

	// Search for the pronoun suffix, longest first.
	suffix1, suffix1Runes := word.FirstSuffixIn(word.RVstart, len(word.RS),
		"selas", "selos", "sela", "selo", "las", "les",
		"los", "nos", "me", "se", "la", "le", "lo",
	)

	// If the suffix is empty or not in RV, we have nothing to do.
	if suffix1 == "" {
		return false
	}
	pronounLen := len(suffix1Runes)

	// Look at what precedes the pronoun, still restricted to RV.
	suffix2, suffix2Runes := word.FirstSuffixIn(word.RVstart, len(word.RS)-pronounLen,
		"iéndo", "iendo", "yendo", "ando", "ándo",
		"ár", "ér", "ír", "ar", "er", "ir",
	)

	// Unaccented replacement for the accented endings of group (a).
	var suffix2repl string
	switch suffix2 {
	case "iéndo":
		suffix2repl = "iendo"
	case "ándo":
		suffix2repl = "ando"
	case "ár":
		suffix2repl = "ar"
	case "ér":
		suffix2repl = "er"
	case "ír":
		suffix2repl = "ir"

	case "ando", "iendo", "ar", "er", "ir":
		word.RemoveLastNRunes(pronounLen)
		return true

	case "yendo":
		// Index of the rune directly in front of "yendo".
		before := len(word.RS) - pronounLen - len(suffix2Runes) - 1
		if before >= 0 && word.RS[before] == 'u' {
			word.RemoveLastNRunes(pronounLen)
			return true
		}
		return false

	default:
		// The pronoun does not follow any of the required endings.
		return false
	}

	// Group (a): remove the pronoun, then drop the acute accent.
	word.RemoveLastNRunes(pronounLen)
	word.ReplaceSuffixRunes(suffix2Runes, []rune(suffix2repl), true)
	return true
}