예제 #1
0
파일: step1.go 프로젝트: kljensen/snowball
// Step 1 is the removal of standard suffixes
//
func step1(word *snowballword.SnowballWord) bool {
	suffix, suffixRunes := word.FirstSuffix(
		"issements", "issement", "atrices", "utions", "usions", "logies",
		"emment", "ements", "atrice", "ations", "ateurs", "amment", "ution",
		"usion", "ments", "logie", "istes", "ismes", "iqUes", "euses",
		"ences", "ement", "ation", "ateur", "ances", "ables", "ment",
		"ités", "iste", "isme", "iqUe", "euse", "ence", "eaux", "ance",
		"able", "ives", "ité", "eux", "aux", "ive", "ifs", "if",
	)

	if suffix == "" {
		return false
	}

	isInR1 := (word.R1start <= len(word.RS)-len(suffixRunes))
	isInR2 := (word.R2start <= len(word.RS)-len(suffixRunes))
	isInRV := (word.RVstart <= len(word.RS)-len(suffixRunes))

	// Handle simple replacements & deletions in R2 first
	if isInR2 {

		// Handle simple replacements in R2
		repl := ""
		switch suffix {
		case "logie", "logies":
			repl = "log"
		case "usion", "ution", "usions", "utions":
			repl = "u"
		case "ence", "ences":
			repl = "ent"
		}
		if repl != "" {
			word.ReplaceSuffixRunes(suffixRunes, []rune(repl), true)
			return true
		}

		// Handle simple deletions in R2
		switch suffix {
		case "ance", "iqUe", "isme", "able", "iste", "eux", "ances", "iqUes", "ismes", "ables", "istes":
			word.RemoveLastNRunes(len(suffixRunes))
			return true
		}
	}

	// Handle simple replacements in RV
	if isInRV {

		// NOTE: these are "special" suffixes in that
		// we must still do steps 2a and 2b of the
		// French stemmer even when these suffixes are
		// found in step1.  Therefore, we are returning
		// `false` here.

		repl := ""
		switch suffix {
		case "amment":
			repl = "ant"
		case "emment":
			repl = "ent"
		}
		if repl != "" {
			word.ReplaceSuffixRunes(suffixRunes, []rune(repl), true)
			return false
		}

		// Delete if preceded by a vowel that is also in RV
		if suffix == "ment" || suffix == "ments" {
			idx := len(word.RS) - len(suffixRunes) - 1
			if idx >= word.RVstart && isLowerVowel(word.RS[idx]) {
				word.RemoveLastNRunes(len(suffixRunes))
				return false
			}
			return false
		}
	}

	// Handle all the other "special" cases.  All of these
	// return true immediately after changing the word.
	//
	switch suffix {
	case "eaux":

		// Replace with eau
		word.ReplaceSuffixRunes(suffixRunes, []rune("eau"), true)
		return true

	case "aux":

		// Replace with al if in R1
		if isInR1 {
			word.ReplaceSuffixRunes(suffixRunes, []rune("al"), true)
			return true
		}

	case "euse", "euses":

		// Delete if in R2, else replace by eux if in R1
		if isInR2 {
			word.RemoveLastNRunes(len(suffixRunes))
			return true
		} else if isInR1 {
			word.ReplaceSuffixRunes(suffixRunes, []rune("eux"), true)
			return true
		}

	case "issement", "issements":

		// Delete if in R1 and preceded by a non-vowel
		if isInR1 {
			idx := len(word.RS) - len(suffixRunes) - 1
			if idx >= 0 && isLowerVowel(word.RS[idx]) == false {
				word.RemoveLastNRunes(len(suffixRunes))
				return true
			}
		}
		return false

	case "atrice", "ateur", "ation", "atrices", "ateurs", "ations":

		// Delete if in R2
		if isInR2 {
			word.RemoveLastNRunes(len(suffixRunes))

			// If preceded by "ic", delete if in R2, else replace by "iqU".
			newSuffix, newSuffixRunes := word.FirstSuffix("ic")
			if newSuffix != "" {
				if word.FitsInR2(len(newSuffixRunes)) {
					word.RemoveLastNRunes(len(newSuffixRunes))
				} else {
					word.ReplaceSuffixRunes(newSuffixRunes, []rune("iqU"), true)
				}
			}
			return true
		}

	case "ement", "ements":

		if isInRV {

			// Delete if in RV
			word.RemoveLastNRunes(len(suffixRunes))

			// If preceded by "iv", delete if in R2
			// (and if further preceded by "at", delete if in R2)
			newSuffix, newSuffixRunes := word.RemoveFirstSuffixIfIn(word.R2start, "iv")
			if newSuffix != "" {
				word.RemoveFirstSuffixIfIn(word.R2start, "at")
				return true
			}

			// If preceded by "eus", delete if in R2, else replace by "eux" if in R1
			newSuffix, newSuffixRunes = word.FirstSuffix("eus")
			if newSuffix != "" {
				newSuffixLen := len(newSuffixRunes)
				if word.FitsInR2(newSuffixLen) {
					word.RemoveLastNRunes(newSuffixLen)
				} else if word.FitsInR1(newSuffixLen) {
					word.ReplaceSuffixRunes(newSuffixRunes, []rune("eux"), true)
				}
				return true
			}

			// If preceded by abl or iqU, delete if in R2, otherwise,
			newSuffix, newSuffixRunes = word.FirstSuffix("abl", "iqU")
			if newSuffix != "" {
				newSuffixLen := len(newSuffixRunes)
				if word.FitsInR2(newSuffixLen) {
					word.RemoveLastNRunes(newSuffixLen)
				}
				return true
			}

			// If preceded by ièr or Ièr, replace by i if in RV
			newSuffix, newSuffixRunes = word.FirstSuffix("ièr", "Ièr")
			if newSuffix != "" {
				if word.FitsInRV(len(newSuffixRunes)) {
					word.ReplaceSuffixRunes(newSuffixRunes, []rune("i"), true)
				}
				return true
			}

			return true
		}

	case "ité", "ités":

		if isInR2 {

			// Delete if in R2
			word.RemoveLastNRunes(len(suffixRunes))

			// If preceded by "abil", delete if in R2, else replace by "abl"
			newSuffix, newSuffixRunes := word.FirstSuffix("abil")
			if newSuffix != "" {
				newSuffixLen := len(newSuffixRunes)
				if word.FitsInR2(newSuffixLen) {
					word.RemoveLastNRunes(newSuffixLen)
				} else {
					word.ReplaceSuffixRunes(newSuffixRunes, []rune("abl"), true)
				}
				return true
			}

			// If preceded by "ic", delete if in R2, else replace by "iqU"
			newSuffix, newSuffixRunes = word.FirstSuffix("ic")
			if newSuffix != "" {
				newSuffixLen := len(newSuffixRunes)
				if word.FitsInR2(newSuffixLen) {
					word.RemoveLastNRunes(newSuffixLen)
				} else {
					word.ReplaceSuffixRunes(newSuffixRunes, []rune("iqU"), true)
				}
				return true
			}

			// If preceded by "iv", delete if in R2
			newSuffix, newSuffixRunes = word.RemoveFirstSuffixIfIn(word.R2start, "iv")
			return true
		}
	case "if", "ive", "ifs", "ives":

		if isInR2 {

			// Delete if in R2
			word.RemoveLastNRunes(len(suffixRunes))

			// If preceded by at, delete if in R2
			newSuffix, newSuffixRunes := word.RemoveFirstSuffixIfIn(word.R2start, "at")
			if newSuffix != "" {

				// And if further preceded by ic, delete if in R2, else replace by iqU
				newSuffix, newSuffixRunes = word.FirstSuffix("ic")
				if newSuffix != "" {
					newSuffixLen := len(newSuffixRunes)
					if word.FitsInR2(newSuffixLen) {
						word.RemoveLastNRunes(newSuffixLen)
					} else {
						word.ReplaceSuffixRunes(newSuffixRunes, []rune("iqU"), true)
					}
				}
			}
			return true

		}
	}
	return false
}