func SplitIntoWords(input string) []Word { fields := strings.Fields(input) // split words like '2-point', '1-inch' into two words ('2', 'pound') mapped := make([]string, 0, len(fields)) for _, word := range fields { trimmed := strings.TrimFunc(word, split.IsTrimmableRune) if dash := strings.IndexRune(trimmed, '-'); dash >= 0 { left := trimmed[:dash] right := trimmed[dash+1:] // fmt.Printf("SplitIntoWords dashed %#v, left = %#v, right = %#v, reg(left) = %#v, reg(right) = %#v, num(left) = %#v\n", word, left, right, split.IsRegularWord(left), split.IsRegularWord(right), ClassifyNumberString(left)) if !split.IsRegularWord(left) && split.IsRegularWord(right) && ClassifyNumberString(left) != NumClassNone { idx := strings.Index(word, left) if idx < 0 { panic(fmt.Sprintf("Cannot find %#v in %#v", left, word)) } dash := idx + len(left) fullLeft := word[:dash] fullRight := word[dash+1:] mapped = append(mapped, fullLeft) mapped = append(mapped, fullRight) continue } } mapped = append(mapped, word) } words := make([]Word, 0, len(mapped)) for _, word := range mapped { trimmed := strings.TrimFunc(word, split.IsTrimmableRune) trimmed = cleanupWord(trimmed) if len(trimmed) == 0 { continue } norm := Normalize(trimmed) stem := Stem(trimmed) // println(trimmed, "=>", stem) words = append(words, Word{Raw: word, Trimmed: trimmed, Stem: stem, Normalized: norm}) } return words }
func countIrregulars(words []string) int { c := 0 for _, word := range words { if !split.IsRegularWord(word) { c++ } } return c }
func (c *Classifier) Process(input string) *Result { r := new(Result) r.multiVariantTags = c.multiVariantTags r.Words = SplitIntoWords(input) tagCount := 10 //len(c.categories) r.TagsByName = make(map[string][]Range, tagCount) r.TagsByPos = make([]map[string][]Range, len(r.Words)) for pos, _ := range r.Words { r.TagsByPos[pos] = make(map[string][]Range, tagCount) } r.TagDefs = c.TagDefs r.TagDefsByName = c.TagDefsByName // handle built-in tags for pos, word := range r.Words { s := word.Trimmed runes := []rune(s) if len(runes) == 0 { continue } if unicode.IsUpper(runes[0]) && strings.IndexFunc(s, unicode.IsLower) >= 1 { r.AddTag("@cap", pos, 1) } if runes[0] == '@' { r.AddTag("@twitter", pos, 1) } else if strings.IndexRune(s, '.') >= 1 && strings.IndexRune(s, '/') >= 1 { r.AddTag("@url", pos, 1) } else if strings.IndexRune(s, '@') >= 1 && strings.IndexRune(s, '.') >= 1 { r.AddTag("@email", pos, 1) } nc := ClassifyNumber(runes) switch nc { case NumClassInteger: r.AddTag("@integer", pos, 1) case NumClassFloat: r.AddTag("@float", pos, 1) case NumClassFraction: r.AddTag("@fraction", pos, 1) case NumClassCurrency: r.AddTag("@currency-number", pos, 1) case NumClassNone: if !split.IsRegularWord(word.Trimmed) || IsStopWord(word.Stem) { r.AddTag("@s", pos, 1) } } } // the core: add tags for matched categories for _, category := range c.categories { // make a list of skip options (each value in skippable is the // number of words that can be skipped, ending at this position) skippable := make([][]int, len(r.Words)) for wi, _ := range r.Words { for _, tag := range category.skippableTags { for _, tagging := range r.TagsByPos[wi][tag] { if !intSliceContainsValue(skippable[wi], tagging.Len) { last := wi + tagging.Len - 1 skippable[last] = append(skippable[last], tagging.Len) } } } } for _, scheme := range category.schemes { matchScheme(r, category.tag, scheme.requirements, skippable, category.skipBefore, category.skipAfter) } } return r }