Пример #1
0
// SplitIntoWords breaks input into whitespace-separated fields and converts
// them into Word values.
//
// A field containing a dash is split in two at its first dash (counted after
// trimming) when the part before the dash is not a regular word but is
// classified as a number by ClassifyNumberString, and the part after the dash
// is a regular word; e.g. "2-point" becomes "2" and "point". The dash itself
// is dropped. Fields that end up empty after trimming and cleanup are omitted
// from the result.
func SplitIntoWords(input string) []Word {
	fields := strings.Fields(input)

	// First pass: expand number-dash-word fields into two raw pieces.
	pieces := make([]string, 0, len(fields))
	for _, raw := range fields {
		core := strings.TrimFunc(raw, split.IsTrimmableRune)

		dashAt := strings.IndexRune(core, '-')
		if dashAt < 0 {
			pieces = append(pieces, raw)
			continue
		}

		left, right := core[:dashAt], core[dashAt+1:]
		numberThenWord := !split.IsRegularWord(left) && split.IsRegularWord(right) && ClassifyNumberString(left) != NumClassNone
		if !numberThenWord {
			pieces = append(pieces, raw)
			continue
		}

		// Cut the untrimmed field rather than the trimmed one, so the first
		// piece keeps any leading trimmable runes and the second keeps any
		// trailing ones.
		start := strings.Index(raw, left)
		if start < 0 {
			panic(fmt.Sprintf("Cannot find %#v in %#v", left, raw))
		}
		cut := start + len(left)
		pieces = append(pieces, raw[:cut], raw[cut+1:])
	}

	// Second pass: trim, clean up and annotate every piece.
	result := make([]Word, 0, len(pieces))
	for _, raw := range pieces {
		core := cleanupWord(strings.TrimFunc(raw, split.IsTrimmableRune))
		if core == "" {
			continue
		}

		normalized := Normalize(core)
		stemmed := Stem(core)
		result = append(result, Word{
			Raw:        raw,
			Trimmed:    core,
			Stem:       stemmed,
			Normalized: normalized,
		})
	}
	return result
}
Пример #2
0
// countIrregulars returns how many entries of words are rejected by
// split.IsRegularWord.
func countIrregulars(words []string) int {
	total := 0
	for i := range words {
		if split.IsRegularWord(words[i]) {
			continue
		}
		total++
	}
	return total
}
Пример #3
0
// Process splits input into words with SplitIntoWords and returns a Result
// holding the tags assigned to those words.
//
// Tagging happens in two passes. The first pass adds built-in tags derived
// from each word's trimmed text: "@cap"; at most one of "@twitter", "@url"
// and "@email"; and, depending on the result of ClassifyNumber, one of
// "@integer", "@float", "@fraction", "@currency-number", or "@s" for
// non-numbers that are either not regular words or are stop words. The second
// pass hands every scheme of every category to matchScheme, together with the
// skip options computed for that category.
func (c *Classifier) Process(input string) *Result {
	r := new(Result)
	r.multiVariantTags = c.multiVariantTags

	r.Words = SplitIntoWords(input)

	// Initial capacity hint for the tag maps; only affects allocation.
	tagCount := 10 //len(c.categories)
	r.TagsByName = make(map[string][]Range, tagCount)

	r.TagsByPos = make([]map[string][]Range, len(r.Words))
	for pos := range r.Words {
		r.TagsByPos[pos] = make(map[string][]Range, tagCount)
	}

	r.TagDefs = c.TagDefs
	r.TagDefsByName = c.TagDefsByName

	// handle built-in tags
	for pos, word := range r.Words {
		s := word.Trimmed
		runes := []rune(s)
		if len(runes) == 0 {
			continue
		}

		// Capitalized: starts with an upper-case rune and has a lower-case
		// rune somewhere after the first byte.
		if unicode.IsUpper(runes[0]) && strings.IndexFunc(s, unicode.IsLower) >= 1 {
			r.AddTag("@cap", pos, 1)
		}

		// These three are mutually exclusive; the first match wins. The
		// ">= 1" comparisons require the marker not to be the first byte.
		switch {
		case runes[0] == '@':
			r.AddTag("@twitter", pos, 1)
		case strings.IndexRune(s, '.') >= 1 && strings.IndexRune(s, '/') >= 1:
			r.AddTag("@url", pos, 1)
		case strings.IndexRune(s, '@') >= 1 && strings.IndexRune(s, '.') >= 1:
			r.AddTag("@email", pos, 1)
		}

		switch ClassifyNumber(runes) {
		case NumClassInteger:
			r.AddTag("@integer", pos, 1)
		case NumClassFloat:
			r.AddTag("@float", pos, 1)
		case NumClassFraction:
			r.AddTag("@fraction", pos, 1)
		case NumClassCurrency:
			r.AddTag("@currency-number", pos, 1)
		case NumClassNone:
			if !split.IsRegularWord(word.Trimmed) || IsStopWord(word.Stem) {
				r.AddTag("@s", pos, 1)
			}
		}
	}

	// the core: add tags for matched categories
	for _, category := range c.categories {
		// make a list of skip options (each value in skippable is the
		// number of words that can be skipped, ending at this position)
		skippable := make([][]int, len(r.Words))
		for wi := range r.Words {
			for _, tag := range category.skippableTags {
				for _, tagging := range r.TagsByPos[wi][tag] {
					// The range is treated as starting at wi, so it is
					// recorded at the position of its last word. Duplicates
					// must be checked against that same slot, not against
					// skippable[wi]: the two differ whenever Len > 1.
					last := wi + tagging.Len - 1
					if !intSliceContainsValue(skippable[last], tagging.Len) {
						skippable[last] = append(skippable[last], tagging.Len)
					}
				}
			}
		}

		for _, scheme := range category.schemes {
			matchScheme(r, category.tag, scheme.requirements, skippable, category.skipBefore, category.skipAfter)
		}
	}

	return r
}