Esempio n. 1
0
// Step 1 is the removal of standard suffixes
//
func step1(word *snowballword.SnowballWord) bool {
	suffix, suffixRunes := word.FirstSuffix(
		"issements", "issement", "atrices", "utions", "usions", "logies",
		"emment", "ements", "atrice", "ations", "ateurs", "amment", "ution",
		"usion", "ments", "logie", "istes", "ismes", "iqUes", "euses",
		"ences", "ement", "ation", "ateur", "ances", "ables", "ment",
		"ités", "iste", "isme", "iqUe", "euse", "ence", "eaux", "ance",
		"able", "ives", "ité", "eux", "aux", "ive", "ifs", "if",
	)

	if suffix == "" {
		return false
	}

	isInR1 := (word.R1start <= len(word.RS)-len(suffixRunes))
	isInR2 := (word.R2start <= len(word.RS)-len(suffixRunes))
	isInRV := (word.RVstart <= len(word.RS)-len(suffixRunes))

	// Handle simple replacements & deletions in R2 first
	if isInR2 {

		// Handle simple replacements in R2
		repl := ""
		switch suffix {
		case "logie", "logies":
			repl = "log"
		case "usion", "ution", "usions", "utions":
			repl = "u"
		case "ence", "ences":
			repl = "ent"
		}
		if repl != "" {
			word.ReplaceSuffixRunes(suffixRunes, []rune(repl), true)
			return true
		}

		// Handle simple deletions in R2
		switch suffix {
		case "ance", "iqUe", "isme", "able", "iste", "eux", "ances", "iqUes", "ismes", "ables", "istes":
			word.RemoveLastNRunes(len(suffixRunes))
			return true
		}
	}

	// Handle simple replacements in RV
	if isInRV {

		// NOTE: these are "special" suffixes in that
		// we must still do steps 2a and 2b of the
		// French stemmer even when these suffixes are
		// found in step1.  Therefore, we are returning
		// `false` here.

		repl := ""
		switch suffix {
		case "amment":
			repl = "ant"
		case "emment":
			repl = "ent"
		}
		if repl != "" {
			word.ReplaceSuffixRunes(suffixRunes, []rune(repl), true)
			return false
		}

		// Delete if preceded by a vowel that is also in RV
		if suffix == "ment" || suffix == "ments" {
			idx := len(word.RS) - len(suffixRunes) - 1
			if idx >= word.RVstart && isLowerVowel(word.RS[idx]) {
				word.RemoveLastNRunes(len(suffixRunes))
				return false
			}
			return false
		}
	}

	// Handle all the other "special" cases.  All of these
	// return true immediately after changing the word.
	//
	switch suffix {
	case "eaux":

		// Replace with eau
		word.ReplaceSuffixRunes(suffixRunes, []rune("eau"), true)
		return true

	case "aux":

		// Replace with al if in R1
		if isInR1 {
			word.ReplaceSuffixRunes(suffixRunes, []rune("al"), true)
			return true
		}

	case "euse", "euses":

		// Delete if in R2, else replace by eux if in R1
		if isInR2 {
			word.RemoveLastNRunes(len(suffixRunes))
			return true
		} else if isInR1 {
			word.ReplaceSuffixRunes(suffixRunes, []rune("eux"), true)
			return true
		}

	case "issement", "issements":

		// Delete if in R1 and preceded by a non-vowel
		if isInR1 {
			idx := len(word.RS) - len(suffixRunes) - 1
			if idx >= 0 && isLowerVowel(word.RS[idx]) == false {
				word.RemoveLastNRunes(len(suffixRunes))
				return true
			}
		}
		return false

	case "atrice", "ateur", "ation", "atrices", "ateurs", "ations":

		// Delete if in R2
		if isInR2 {
			word.RemoveLastNRunes(len(suffixRunes))

			// If preceded by "ic", delete if in R2, else replace by "iqU".
			newSuffix, newSuffixRunes := word.FirstSuffix("ic")
			if newSuffix != "" {
				if word.FitsInR2(len(newSuffixRunes)) {
					word.RemoveLastNRunes(len(newSuffixRunes))
				} else {
					word.ReplaceSuffixRunes(newSuffixRunes, []rune("iqU"), true)
				}
			}
			return true
		}

	case "ement", "ements":

		if isInRV {

			// Delete if in RV
			word.RemoveLastNRunes(len(suffixRunes))

			// If preceded by "iv", delete if in R2
			// (and if further preceded by "at", delete if in R2)
			newSuffix, newSuffixRunes := word.RemoveFirstSuffixIfIn(word.R2start, "iv")
			if newSuffix != "" {
				word.RemoveFirstSuffixIfIn(word.R2start, "at")
				return true
			}

			// If preceded by "eus", delete if in R2, else replace by "eux" if in R1
			newSuffix, newSuffixRunes = word.FirstSuffix("eus")
			if newSuffix != "" {
				newSuffixLen := len(newSuffixRunes)
				if word.FitsInR2(newSuffixLen) {
					word.RemoveLastNRunes(newSuffixLen)
				} else if word.FitsInR1(newSuffixLen) {
					word.ReplaceSuffixRunes(newSuffixRunes, []rune("eux"), true)
				}
				return true
			}

			// If preceded by abl or iqU, delete if in R2, otherwise,
			newSuffix, newSuffixRunes = word.FirstSuffix("abl", "iqU")
			if newSuffix != "" {
				newSuffixLen := len(newSuffixRunes)
				if word.FitsInR2(newSuffixLen) {
					word.RemoveLastNRunes(newSuffixLen)
				}
				return true
			}

			// If preceded by ièr or Ièr, replace by i if in RV
			newSuffix, newSuffixRunes = word.FirstSuffix("ièr", "Ièr")
			if newSuffix != "" {
				if word.FitsInRV(len(newSuffixRunes)) {
					word.ReplaceSuffixRunes(newSuffixRunes, []rune("i"), true)
				}
				return true
			}

			return true
		}

	case "ité", "ités":

		if isInR2 {

			// Delete if in R2
			word.RemoveLastNRunes(len(suffixRunes))

			// If preceded by "abil", delete if in R2, else replace by "abl"
			newSuffix, newSuffixRunes := word.FirstSuffix("abil")
			if newSuffix != "" {
				newSuffixLen := len(newSuffixRunes)
				if word.FitsInR2(newSuffixLen) {
					word.RemoveLastNRunes(newSuffixLen)
				} else {
					word.ReplaceSuffixRunes(newSuffixRunes, []rune("abl"), true)
				}
				return true
			}

			// If preceded by "ic", delete if in R2, else replace by "iqU"
			newSuffix, newSuffixRunes = word.FirstSuffix("ic")
			if newSuffix != "" {
				newSuffixLen := len(newSuffixRunes)
				if word.FitsInR2(newSuffixLen) {
					word.RemoveLastNRunes(newSuffixLen)
				} else {
					word.ReplaceSuffixRunes(newSuffixRunes, []rune("iqU"), true)
				}
				return true
			}

			// If preceded by "iv", delete if in R2
			newSuffix, newSuffixRunes = word.RemoveFirstSuffixIfIn(word.R2start, "iv")
			return true
		}
	case "if", "ive", "ifs", "ives":

		if isInR2 {

			// Delete if in R2
			word.RemoveLastNRunes(len(suffixRunes))

			// If preceded by at, delete if in R2
			newSuffix, newSuffixRunes := word.RemoveFirstSuffixIfIn(word.R2start, "at")
			if newSuffix != "" {

				// And if further preceded by ic, delete if in R2, else replace by iqU
				newSuffix, newSuffixRunes = word.FirstSuffix("ic")
				if newSuffix != "" {
					newSuffixLen := len(newSuffixRunes)
					if word.FitsInR2(newSuffixLen) {
						word.RemoveLastNRunes(newSuffixLen)
					} else {
						word.ReplaceSuffixRunes(newSuffixRunes, []rune("iqU"), true)
					}
				}
			}
			return true

		}
	}
	return false
}
Esempio n. 2
0
// Step 1 is the removal of standard suffixes
//
func step1(word *snowballword.SnowballWord) bool {

	// Possible suffixes, longest first
	suffix, suffixRunes := word.FirstSuffix(
		"amientos", "imientos", "aciones", "amiento", "imiento",
		"uciones", "logías", "idades", "encias", "ancias", "amente",
		"adores", "adoras", "ución", "mente", "logía", "istas",
		"ismos", "ibles", "encia", "anzas", "antes", "ancia",
		"adora", "ación", "ables", "osos", "osas", "ivos", "ivas",
		"ista", "ismo", "idad", "icos", "icas", "ible", "anza",
		"ante", "ador", "able", "oso", "osa", "ivo", "iva",
		"ico", "ica",
	)

	isInR1 := (word.R1start <= len(word.RS)-len(suffixRunes))
	isInR2 := (word.R2start <= len(word.RS)-len(suffixRunes))

	// Deal with special cases first.  All of these will
	// return if they are hit.
	//
	switch suffix {
	case "":

		// Nothing to do
		return false

	case "amente":

		if isInR1 {
			// Delete if in R1
			word.RemoveLastNRunes(len(suffixRunes))

			// if preceded by iv, delete if in R2 (and if further preceded by at,
			// delete if in R2), otherwise,
			// if preceded by os, ic or ad, delete if in R2
			newSuffix, _ := word.RemoveFirstSuffixIfIn(word.R2start, "iv", "os", "ic", "ad")
			if newSuffix == "iv" {
				word.RemoveFirstSuffixIfIn(word.R2start, "at")
			}
			return true
		}
		return false
	}

	// All the following cases require the found suffix
	// to be in R2.
	if isInR2 == false {
		return false
	}

	// Compound replacement cases.  All these cases return
	// if they are hit.
	//
	compoundReplacement := func(otherSuffixes ...string) bool {
		word.RemoveLastNRunes(len(suffixRunes))
		word.RemoveFirstSuffixIfIn(word.R2start, otherSuffixes...)
		return true
	}

	switch suffix {
	case "adora", "ador", "ación", "adoras", "adores", "aciones", "ante", "antes", "ancia", "ancias":
		return compoundReplacement("ic")
	case "mente":
		return compoundReplacement("ante", "able", "ible")
	case "idad", "idades":
		return compoundReplacement("abil", "ic", "iv")
	case "iva", "ivo", "ivas", "ivos":
		return compoundReplacement("at")
	}

	// Simple replacement & deletion cases are all that remain.
	//
	simpleReplacement := func(repl string) bool {
		word.ReplaceSuffixRunes(suffixRunes, []rune(repl), true)
		return true
	}
	switch suffix {
	case "logía", "logías":
		return simpleReplacement("log")
	case "ución", "uciones":
		return simpleReplacement("u")
	case "encia", "encias":
		return simpleReplacement("ente")
	case "anza", "anzas", "ico", "ica", "icos", "icas",
		"ismo", "ismos", "able", "ables", "ible", "ibles",
		"ista", "istas", "oso", "osa", "osos", "osas",
		"amiento", "amientos", "imiento", "imientos":
		word.RemoveLastNRunes(len(suffixRunes))
		return true
	}

	log.Panicln("Unhandled suffix:", suffix)
	return false
}