예제 #1
0
파일: step2b.go 프로젝트: kljensen/snowball
// Step 2b is the removal of Verb suffixes in RV
// that do not begin with "i".
//
func step2b(word *snowballword.SnowballWord) bool {

	// Search for the longest among the following suffixes in RV.
	//
	suffix, suffixRunes := word.FirstSuffixIn(word.RVstart, len(word.RS),
		"eraIent", "assions", "erions", "assiez", "assent",
		"èrent", "eront", "erons", "eriez", "erait", "erais",
		"asses", "antes", "aIent", "âtes", "âmes", "ions",
		"erez", "eras", "erai", "asse", "ants", "ante", "ées",
		"iez", "era", "ant", "ait", "ais", "és", "ée", "ât",
		"ez", "er", "as", "ai", "é", "a",
	)

	switch suffix {
	case "ions":

		// Delete if in R2
		suffixLen := len(suffixRunes)
		if word.FitsInR2(suffixLen) {
			word.RemoveLastNRunes(suffixLen)
			return true
		}
		return false

	case "é", "ée", "ées", "és", "èrent", "er", "era",
		"erai", "eraIent", "erais", "erait", "eras", "erez",
		"eriez", "erions", "erons", "eront", "ez", "iez":

		// Delete
		word.RemoveLastNRunes(len(suffixRunes))
		return true

	case "âmes", "ât", "âtes", "a", "ai", "aIent",
		"ais", "ait", "ant", "ante", "antes", "ants", "as",
		"asse", "assent", "asses", "assiez", "assions":

		// Delete
		word.RemoveLastNRunes(len(suffixRunes))

		// If preceded by e (unicode code point 101), delete
		//
		idx := len(word.RS) - 1
		if idx >= 0 && word.RS[idx] == 101 && word.FitsInRV(1) {
			word.RemoveLastNRunes(1)
		}
		return true

	}
	return false
}
예제 #2
0
파일: step2a.go 프로젝트: kljensen/snowball
// Step 2a is the removal of Verb suffixes beginning
// with "i" in the RV region.
//
func step2a(word *snowballword.SnowballWord) bool {

	// Search for the longest among the following suffixes
	// in RV and if found, delete if preceded by a non-vowel.

	suffix, suffixRunes := word.FirstSuffixIn(word.RVstart, len(word.RS),
		"issantes", "issaIent", "issions", "issants", "issante",
		"iraIent", "issons", "issiez", "issent", "issant", "issait",
		"issais", "irions", "issez", "isses", "iront", "irons", "iriez",
		"irent", "irait", "irais", "îtes", "îmes", "isse", "irez",
		"iras", "irai", "ira", "ies", "ît", "it", "is", "ir", "ie", "i",
	)
	if suffix != "" {
		sLen := len(suffixRunes)
		idx := len(word.RS) - sLen - 1
		if idx >= 0 && word.FitsInRV(sLen+1) && isLowerVowel(word.RS[idx]) == false {
			word.RemoveLastNRunes(len(suffixRunes))
			return true
		}
	}
	return false
}
예제 #3
0
파일: step4.go 프로젝트: kljensen/snowball
// Step 4 is the cleaning up of residual suffixes.
//
func step4(word *snowballword.SnowballWord) bool {

	hadChange := false

	if word.String() == "voudrion" {
		log.Println("...", word)
	}

	// If the word ends s (unicode code point 115),
	// not preceded by a, i, o, u, è or s, delete it.
	//
	if idx := len(word.RS) - 1; idx >= 1 && word.RS[idx] == 115 {
		switch word.RS[idx-1] {

		case 97, 105, 111, 117, 232, 115:

			// Do nothing, preceded by a, i, o, u, è or s
			return false

		default:
			word.RemoveLastNRunes(1)
			hadChange = true

		}
	}

	// Note: all the following are restricted to the RV region.

	// Search for the longest among the following suffixes in RV.
	//
	suffix, suffixRunes := word.FirstSuffixIn(word.RVstart, len(word.RS),
		"Ière", "ière", "Ier", "ier", "ion", "e", "ë",
	)

	switch suffix {
	case "":
		return hadChange
	case "ion":

		// Delete if in R2 and preceded by s or t in RV

		const sLen int = 3 // equivalently, len(suffixRunes)
		idx := len(word.RS) - sLen - 1
		if word.FitsInR2(sLen) && idx >= 0 && word.FitsInRV(sLen+1) {
			if word.RS[idx] == 115 || word.RS[idx] == 116 {
				word.RemoveLastNRunes(sLen)
				return true
			}
		}
		return hadChange

	case "ier", "ière", "Ier", "Ière":
		// Replace with i
		word.ReplaceSuffixRunes(suffixRunes, []rune("i"), true)
		return true

	case "e":
		word.RemoveLastNRunes(1)
		return true

	case "ë":

		// If preceded by gu (unicode code point 103 & 117), delete
		idx := len(word.RS) - 1
		if idx >= 2 && word.RS[idx-2] == 103 && word.RS[idx-1] == 117 {
			word.RemoveLastNRunes(1)
			return true
		}
		return hadChange
	}

	return true
}
예제 #4
0
파일: step1.go 프로젝트: kljensen/snowball
// Step 1 is the removal of standard suffixes
//
func step1(word *snowballword.SnowballWord) bool {
	suffix, suffixRunes := word.FirstSuffix(
		"issements", "issement", "atrices", "utions", "usions", "logies",
		"emment", "ements", "atrice", "ations", "ateurs", "amment", "ution",
		"usion", "ments", "logie", "istes", "ismes", "iqUes", "euses",
		"ences", "ement", "ation", "ateur", "ances", "ables", "ment",
		"ités", "iste", "isme", "iqUe", "euse", "ence", "eaux", "ance",
		"able", "ives", "ité", "eux", "aux", "ive", "ifs", "if",
	)

	if suffix == "" {
		return false
	}

	isInR1 := (word.R1start <= len(word.RS)-len(suffixRunes))
	isInR2 := (word.R2start <= len(word.RS)-len(suffixRunes))
	isInRV := (word.RVstart <= len(word.RS)-len(suffixRunes))

	// Handle simple replacements & deletions in R2 first
	if isInR2 {

		// Handle simple replacements in R2
		repl := ""
		switch suffix {
		case "logie", "logies":
			repl = "log"
		case "usion", "ution", "usions", "utions":
			repl = "u"
		case "ence", "ences":
			repl = "ent"
		}
		if repl != "" {
			word.ReplaceSuffixRunes(suffixRunes, []rune(repl), true)
			return true
		}

		// Handle simple deletions in R2
		switch suffix {
		case "ance", "iqUe", "isme", "able", "iste", "eux", "ances", "iqUes", "ismes", "ables", "istes":
			word.RemoveLastNRunes(len(suffixRunes))
			return true
		}
	}

	// Handle simple replacements in RV
	if isInRV {

		// NOTE: these are "special" suffixes in that
		// we must still do steps 2a and 2b of the
		// French stemmer even when these suffixes are
		// found in step1.  Therefore, we are returning
		// `false` here.

		repl := ""
		switch suffix {
		case "amment":
			repl = "ant"
		case "emment":
			repl = "ent"
		}
		if repl != "" {
			word.ReplaceSuffixRunes(suffixRunes, []rune(repl), true)
			return false
		}

		// Delete if preceded by a vowel that is also in RV
		if suffix == "ment" || suffix == "ments" {
			idx := len(word.RS) - len(suffixRunes) - 1
			if idx >= word.RVstart && isLowerVowel(word.RS[idx]) {
				word.RemoveLastNRunes(len(suffixRunes))
				return false
			}
			return false
		}
	}

	// Handle all the other "special" cases.  All of these
	// return true immediately after changing the word.
	//
	switch suffix {
	case "eaux":

		// Replace with eau
		word.ReplaceSuffixRunes(suffixRunes, []rune("eau"), true)
		return true

	case "aux":

		// Replace with al if in R1
		if isInR1 {
			word.ReplaceSuffixRunes(suffixRunes, []rune("al"), true)
			return true
		}

	case "euse", "euses":

		// Delete if in R2, else replace by eux if in R1
		if isInR2 {
			word.RemoveLastNRunes(len(suffixRunes))
			return true
		} else if isInR1 {
			word.ReplaceSuffixRunes(suffixRunes, []rune("eux"), true)
			return true
		}

	case "issement", "issements":

		// Delete if in R1 and preceded by a non-vowel
		if isInR1 {
			idx := len(word.RS) - len(suffixRunes) - 1
			if idx >= 0 && isLowerVowel(word.RS[idx]) == false {
				word.RemoveLastNRunes(len(suffixRunes))
				return true
			}
		}
		return false

	case "atrice", "ateur", "ation", "atrices", "ateurs", "ations":

		// Delete if in R2
		if isInR2 {
			word.RemoveLastNRunes(len(suffixRunes))

			// If preceded by "ic", delete if in R2, else replace by "iqU".
			newSuffix, newSuffixRunes := word.FirstSuffix("ic")
			if newSuffix != "" {
				if word.FitsInR2(len(newSuffixRunes)) {
					word.RemoveLastNRunes(len(newSuffixRunes))
				} else {
					word.ReplaceSuffixRunes(newSuffixRunes, []rune("iqU"), true)
				}
			}
			return true
		}

	case "ement", "ements":

		if isInRV {

			// Delete if in RV
			word.RemoveLastNRunes(len(suffixRunes))

			// If preceded by "iv", delete if in R2
			// (and if further preceded by "at", delete if in R2)
			newSuffix, newSuffixRunes := word.RemoveFirstSuffixIfIn(word.R2start, "iv")
			if newSuffix != "" {
				word.RemoveFirstSuffixIfIn(word.R2start, "at")
				return true
			}

			// If preceded by "eus", delete if in R2, else replace by "eux" if in R1
			newSuffix, newSuffixRunes = word.FirstSuffix("eus")
			if newSuffix != "" {
				newSuffixLen := len(newSuffixRunes)
				if word.FitsInR2(newSuffixLen) {
					word.RemoveLastNRunes(newSuffixLen)
				} else if word.FitsInR1(newSuffixLen) {
					word.ReplaceSuffixRunes(newSuffixRunes, []rune("eux"), true)
				}
				return true
			}

			// If preceded by abl or iqU, delete if in R2, otherwise,
			newSuffix, newSuffixRunes = word.FirstSuffix("abl", "iqU")
			if newSuffix != "" {
				newSuffixLen := len(newSuffixRunes)
				if word.FitsInR2(newSuffixLen) {
					word.RemoveLastNRunes(newSuffixLen)
				}
				return true
			}

			// If preceded by ièr or Ièr, replace by i if in RV
			newSuffix, newSuffixRunes = word.FirstSuffix("ièr", "Ièr")
			if newSuffix != "" {
				if word.FitsInRV(len(newSuffixRunes)) {
					word.ReplaceSuffixRunes(newSuffixRunes, []rune("i"), true)
				}
				return true
			}

			return true
		}

	case "ité", "ités":

		if isInR2 {

			// Delete if in R2
			word.RemoveLastNRunes(len(suffixRunes))

			// If preceded by "abil", delete if in R2, else replace by "abl"
			newSuffix, newSuffixRunes := word.FirstSuffix("abil")
			if newSuffix != "" {
				newSuffixLen := len(newSuffixRunes)
				if word.FitsInR2(newSuffixLen) {
					word.RemoveLastNRunes(newSuffixLen)
				} else {
					word.ReplaceSuffixRunes(newSuffixRunes, []rune("abl"), true)
				}
				return true
			}

			// If preceded by "ic", delete if in R2, else replace by "iqU"
			newSuffix, newSuffixRunes = word.FirstSuffix("ic")
			if newSuffix != "" {
				newSuffixLen := len(newSuffixRunes)
				if word.FitsInR2(newSuffixLen) {
					word.RemoveLastNRunes(newSuffixLen)
				} else {
					word.ReplaceSuffixRunes(newSuffixRunes, []rune("iqU"), true)
				}
				return true
			}

			// If preceded by "iv", delete if in R2
			newSuffix, newSuffixRunes = word.RemoveFirstSuffixIfIn(word.R2start, "iv")
			return true
		}
	case "if", "ive", "ifs", "ives":

		if isInR2 {

			// Delete if in R2
			word.RemoveLastNRunes(len(suffixRunes))

			// If preceded by at, delete if in R2
			newSuffix, newSuffixRunes := word.RemoveFirstSuffixIfIn(word.R2start, "at")
			if newSuffix != "" {

				// And if further preceded by ic, delete if in R2, else replace by iqU
				newSuffix, newSuffixRunes = word.FirstSuffix("ic")
				if newSuffix != "" {
					newSuffixLen := len(newSuffixRunes)
					if word.FitsInR2(newSuffixLen) {
						word.RemoveLastNRunes(newSuffixLen)
					} else {
						word.ReplaceSuffixRunes(newSuffixRunes, []rune("iqU"), true)
					}
				}
			}
			return true

		}
	}
	return false
}