func NewKagomeMorphTokenizerWithUserDic(userdic tokenizer.UserDic) *KagomeMorphTokenizer {
    k := tokenizer.New()
    k.SetUserDic(userdic)
    return &KagomeMorphTokenizer{
        tok: k,
    }
}
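// A minimal usage sketch for the constructor above: load a user dictionary
// from a file and build the tokenizer with it. The file name is a placeholder.
func newTokenizerFromFile() (*KagomeMorphTokenizer, error) {
    udic, err := tokenizer.NewUserDic("userdic.txt") // placeholder path
    if err != nil {
        return nil, err
    }
    return NewKagomeMorphTokenizerWithUserDic(udic), nil
}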
// Match returns true when the text matches the rule(s).
func Match(text string, rule []int) bool {
    t := tokenizer.New()
    text = reIgnoreText.ReplaceAllString(text, "")
    tokens := t.Tokenize(text)
    pos := 0
    r := make([]int, len(rule))
    copy(r, rule)
    for i := 0; i < len(tokens); i++ {
        tok := tokens[i]
        c := tok.Features()
        if len(c) == 0 {
            continue
        }
        y := c[len(c)-1]
        if !reWord.MatchString(y) {
            if y == "、" {
                continue
            }
            return false
        }
        if r[pos] == rule[pos] && !isWord(c) {
            return false
        }
        n := countChars(y)
        r[pos] -= n
        if r[pos] == 0 {
            pos++
            if pos == len(r) && i == len(tokens)-2 {
                return true
            }
        }
    }
    return false
}
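// Illustrative call, assuming the rule slice gives the mora count of each
// segment (as the countChars bookkeeping above suggests): check whether a
// line scans as 5-7-5.
func isHaiku(line string) bool {
    return Match(line, []int{5, 7, 5})
}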
func main() {
    t := tokenizer.New()
    tokens := t.Tokenize("寿司が食べたい。") // t.Analyze("寿司が食べたい。", tokenizer.Normal)
    for _, token := range tokens {
        if token.Class == tokenizer.DUMMY {
            // BOS: Begin Of Sentence, EOS: End Of Sentence.
            fmt.Printf("%s\n", token.Surface)
            continue
        }
        features := strings.Join(token.Features(), ",")
        fmt.Printf("%s\t%v\n", token.Surface, features)
    }
}
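// With the bundled IPA dictionary, the demo above prints output along these
// lines (surface form, then the comma-joined features; the exact feature
// strings depend on the dictionary build):
//
//  BOS
//  寿司	名詞,一般,*,*,*,*,寿司,スシ,スシ
//  が	助詞,格助詞,一般,*,*,*,が,ガ,ガ
//  食べ	動詞,自立,*,*,一段,連用形,食べる,タベ,タベ
//  たい	助動詞,*,*,*,特殊・タイ,基本形,たい,タイ,タイ
//  。	記号,句点,*,*,*,*,。,。,。
//  EOS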
func tokenize(sen string) string {
    t := tokenizer.New()
    tokens := t.Tokenize(sen)
    var buf bytes.Buffer
    fmt.Fprintln(&buf, "```")
    for i := 1; i < len(tokens); i++ {
        if tokens[i].Class == tokenizer.DUMMY {
            fmt.Fprintf(&buf, "%s\n", tokens[i].Surface)
            continue
        }
        features := strings.Join(tokens[i].Features(), ",")
        fmt.Fprintf(&buf, "%s\t%v\n", tokens[i].Surface, features)
    }
    fmt.Fprintln(&buf, "```")
    return buf.String()
}
// command main
func command(opt *option) error {
    t := tokenizer.New()
    var out = os.Stdout
    if opt.output != "" {
        var err error
        out, err = os.OpenFile(opt.output, os.O_RDWR|os.O_TRUNC|os.O_CREATE, 0666)
        if err != nil {
            return err
        }
        defer out.Close()
    }
    if opt.udic != "" {
        udic, err := tokenizer.NewUserDic(opt.udic)
        if err != nil {
            return err
        }
        t.SetUserDic(udic)
    }
    tokens := t.Dot(opt.input, out)
    if opt.verbose {
        for i, size := 1, len(tokens); i < size; i++ {
            tok := tokens[i]
            f := tok.Features()
            if tok.Class == tokenizer.DUMMY {
                fmt.Fprintf(ErrorWriter, "%s\n", tok.Surface)
            } else {
                fmt.Fprintf(ErrorWriter, "%s\t%v\n", tok.Surface, strings.Join(f, ","))
            }
        }
    }
    return nil
}
// command main
func command(opt *option) error {
    t := tokenizer.New()
    if opt.udic != "" {
        udic, err := tokenizer.NewUserDic(opt.udic)
        if err != nil {
            return err
        }
        t.SetUserDic(udic)
    }
    mux := http.NewServeMux()
    mux.Handle("/", &TokenizeDemoHandler{tokenizer: t})
    mux.Handle("/a", &TokenizeHandler{tokenizer: t})
    log.Fatal(http.ListenAndServe(opt.http, mux))
    return nil
}
// Tokenize tokenizes the string and returns its tokens.
func (m *MultilangTokenizer) Tokenize(s string) []string {
    switch m.lang {
    case language.Japanese:
        t := kagomeTokenizer.New()
        tokens := t.Tokenize(s)
        // tokens contains BOS and EOS as tokens,
        // so the capacity is calculated without them.
        ret := make([]string, 0, len(tokens)-2)
        for _, token := range tokens {
            if token.Class != kagomeTokenizer.DUMMY {
                ret = append(ret, token.Surface)
            }
        }
        return ret
    default: // space-separated language
        return strings.Split(s, " ")
    }
}
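// A quick illustration of both paths. No constructor is shown in this
// excerpt, so the struct literals below are hypothetical and only work
// inside the package (lang is unexported).
func exampleMultilang() {
    jp := &MultilangTokenizer{lang: language.Japanese}
    fmt.Println(jp.Tokenize("寿司が食べたい")) // e.g. [寿司 が 食べ たい]
    en := &MultilangTokenizer{lang: language.English}
    fmt.Println(en.Tokenize("I want sushi")) // [I want sushi]
}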
// AnalizeTargets analyzes the target strings.
func (a *Analyzer) AnalizeTargets() error {
    // wakati-gaki (word segmentation)
    if len(a.targets) == 0 {
        return errors.New("there are no targets")
    }
    t := tokenizer.New()
    for _, v := range a.targets {
        tokens := t.Tokenize(v)
        for _, token := range tokens {
            if token.Class != tokenizer.DUMMY {
                a.analyzedTargets = append(a.analyzedTargets, token.Surface)
            }
        }
    }
    return nil
}
func (ctype *countType) countWordJp() int {
    tkn := tokenizer.New()
    morphs := tkn.Tokenize(ctype.text)
    var wc = 0
    re1 := regexp.MustCompile("^[" + regexp.QuoteMeta("-[]{}()|") + ",.*+=_:;~!@#$%^&?`'/]+$")
    re2 := regexp.MustCompile("^[" + regexp.QuoteMeta("(){}「」『』[]、。") + "]+$")
    for _, m := range morphs {
        word := m.Surface
        if word == "EOS" || word == "BOS" || re1.MatchString(word) || re2.MatchString(word) {
            continue
        }
        //fmt.Printf("%s %v\n", m.Surface, m.Features())
        f := m.Features()
        class := f[0]
        // keep only nouns, adjectives, verbs, and adverbs
        if !strings.Contains(class, "名詞") &&
            !strings.Contains(class, "形容詞") &&
            !strings.Contains(class, "動詞") &&
            !strings.Contains(class, "副詞") {
            continue
        }
        if any(word, ctype.stopWords) {
            continue
        }
        ctype.MapWordFreq[word]++ // the zero value makes the first increment yield 1
        wc++
    }
    return wc
}
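// The receiver's fields are used but never declared in this excerpt; a
// minimal declaration consistent with the usage above might look like this
// (names inferred, purely illustrative), together with the assumed helper:
type countType struct {
    text        string         // input text to analyze
    stopWords   []string       // words to skip
    MapWordFreq map[string]int // word -> occurrence count
}

// any reports whether w is in list (helper assumed by countWordJp).
func any(w string, list []string) bool {
    for _, s := range list {
        if s == w {
            return true
        }
    }
    return false
}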
func (b Bot) Tokenize(m Message) {
    sen := m.TextBody()
    t := tokenizer.New()
    tokens := t.Tokenize(sen)
    var buf bytes.Buffer
    fmt.Fprintln(&buf, "```")
    for i := 1; i < len(tokens); i++ {
        if tokens[i].Class == tokenizer.DUMMY {
            fmt.Fprintf(&buf, "%s\n", tokens[i].Surface)
            continue
        }
        features := strings.Join(tokens[i].Features(), ",")
        fmt.Fprintf(&buf, "%s\t%v\n", tokens[i].Surface, features)
    }
    fmt.Fprintln(&buf, "```")
    m.Text = buf.String()
    if e := b.PostMessage(m); e != nil {
        log.Printf("tokenize, post error, %v", e)
    }
}
func main() {
    app := cli.NewApp()
    app.Name = Name
    app.Version = Version
    app.Author = "Sorami Hisamoto"
    app.Email = ""
    app.Usage = "Create Golang pun in Japanese."
    app.Flags = GlobalFlags
    app.Commands = Commands
    app.CommandNotFound = CommandNotFound
    app.Action = func(c *cli.Context) {
        scanner := bufio.NewScanner(os.Stdin)
        t := tokenizer.New()
        for scanner.Scan() {
            line := scanner.Text()
            tokens := t.Tokenize(line)
            for _, token := range tokens {
                if token.Class == tokenizer.DUMMY {
                    continue
                }
                features := token.Features()
                if len(features) < 8 { // no reading available (e.g. unknown words)
                    fmt.Printf("%s", token.Surface)
                    continue
                }
                yomi := features[7]
                if yomi == "ゴラン" {
                    fmt.Printf("Golang")
                } else if yomi == "ゴ" {
                    fmt.Printf("Go")
                } else {
                    fmt.Printf("%s", token.Surface)
                }
            }
            fmt.Printf("\n")
        }
    }
    app.Run(os.Args)
}
func main() {
    var mongodbCredential string = os.Getenv("KINDLIZED_MONGODB")
    session, err := mgo.Dial(mongodbCredential)
    if err != nil {
        log.Fatal(err)
    }
    defer session.Close()
    database := session.DB("kindlized")
    booksCollection := database.C("books")
    query := booksCollection.Find(bson.M{})
    var b Book
    if err := query.One(&b); err != nil {
        log.Fatal(err)
    }
    title := b.Title[0]
    t := tokenizer.New()
    tokens := t.Tokenize(title)
    for _, token := range tokens {
        if token.Class == tokenizer.KNOWN {
            log.Println(token.Surface)
            newWord := &Word{
                ID:        bson.NewObjectId(),
                Word:      token.Surface,
                LastQuery: time.Now(),
            }
            q := database.C("words").Find(bson.M{
                "word": token.Surface,
            })
            count, err := q.Count()
            if err != nil {
                log.Fatal(err)
            }
            if count == 0 {
                database.C("words").Insert(newWord)
            }
        }
    }
    // var books []Book
    // query.All(&books)
    // log.Println(books)
}
/*
 * GetNounMapForMA morphologically analyzes text, counts occurrences of
 * nouns (excluding numerals), and returns them as map[word string]count int.
 * Note: only words that appear at least twice are kept, and some stop
 * words are removed.
 */
func GetNounMapForMA(text string) map[string]int {
    res := make(map[string]int)
    t := tokenizer.New()
    tokens := t.Tokenize(text)
    for _, token := range tokens {
        features := strings.Join(token.Features(), ",")
        if strings.HasPrefix(features, "名詞") && !strings.Contains(features, "数") {
            res[token.Surface]++
        }
    }
    stopWord := []string{
        "http", "htm", "com", "co", "jp", "://", "/", "&#", ",", ".", "-", "www",
        "&", ";", ":", ":&", ",'", "(", ")", "+", ";&", ";-",
    }
    set := mapset.NewSet()
    for _, v := range stopWord {
        set.Add(v)
    }
    for key, value := range res {
        if value == 1 || set.Contains(key) {
            delete(res, key)
        }
    }
    return res
}
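// Sketch: words seen only once are dropped, so only repeated nouns survive.
func exampleNounMap() {
    counts := GetNounMapForMA("東京の寿司。東京のラーメン。")
    for w, n := range counts {
        fmt.Printf("%s: %d\n", w, n) // e.g. 東京: 2 (寿司 and ラーメン appear once and are dropped)
    }
}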
// getSentences builds sentence objects from text.
func getSentences(text string) []sentence {
    var sentences []sentence
    t := tokenizer.New()
    text = strings.Replace(text, "。", "\n", -1)
    text = strings.Replace(text, ".", "\n", -1)
    text = strings.Replace(text, "?", "?\n", -1)
    text = strings.Replace(text, "!", "!\n", -1)
    text = strings.Replace(text, "？", "？\n", -1)
    text = strings.Replace(text, "！", "！\n", -1)
    senstr := strings.Split(text, "\n")
    for i := 0; i < len(senstr); i++ {
        tokens := t.Tokenize(senstr[i])
        var words []word
        var kana string
        for j := 0; j < len(tokens); j++ {
            tk := tokens[j]
            ft := tk.Features()
            if len(ft) > 7 {
                w := word{
                    str:   ft[6],
                    kana:  ft[7],
                    wtype: ft[0],
                }
                words = append(words, w)
                kana += ft[7]
            }
        }
        sentences = append(sentences, sentence{
            str:   senstr[i],
            words: words,
            kana:  kana,
        })
    }
    return sentences
}
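// The sentence and word types are not declared in this excerpt; minimal
// declarations inferred from the fields used above (the real definitions
// may differ):
type word struct {
    str   string // base form (feature 6)
    kana  string // reading (feature 7)
    wtype string // part of speech (feature 0)
}

type sentence struct {
    str   string // original sentence text
    words []word
    kana  string // concatenated reading of the whole sentence
}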
// CrossParents crosses the parent cells' tweets.
func (f *Field) CrossParents(x, y int, newTweet string) (string, error) {
    // cross the parents
    if x == 0 || x == f.SizeX-1 || y == 0 || y == f.SizeY-1 {
        return "", errors.New("invalid cell")
    }
    var tweets []string
    for dy := -1; dy <= 1; dy++ {
        for dx := -1; dx <= 1; dx++ {
            if dy == 0 && dx == 0 {
                continue
            }
            if p := f.Points[y+dy][x+dx]; p.IsAlive {
                tweets = append(tweets, p.Str)
            }
        }
    }
    re1, err := regexp.Compile(`(^|\s)(@|https?://)\S+`)
    if err != nil {
        return "", err
    }
    re2, err := regexp.Compile(`^\s*|\s*$`)
    if err != nil {
        return "", err
    }
    t := tokenizer.New()
    newTweet = re2.ReplaceAllString(re1.ReplaceAllString(newTweet, ""), "")
    originTweetTokens := t.Tokenize(newTweet)
    var parentTweetsTokens [][]tokenizer.Token
    for i, v := range tweets {
        tweets[i] = re2.ReplaceAllString(re1.ReplaceAllString(v, ""), "")
        parentTweetsTokens = append(parentTweetsTokens, t.Tokenize(tweets[i]))
    }
    // inherit one part of speech at a time from the parents
    if len(parentTweetsTokens) > 0 {
        parentPtr := 0
        for i, ot := range originTweetTokens {
            if ot.Class == tokenizer.DUMMY {
                continue
            }
            for _, t := range parentTweetsTokens[parentPtr] {
                if t.Class != tokenizer.DUMMY &&
                    ot.Features()[0] == t.Features()[0] &&
                    ot.Features()[1] == t.Features()[1] {
                    originTweetTokens[i] = t
                    parentPtr++
                    break
                }
            }
            if parentPtr >= len(parentTweetsTokens) { // wrap around instead of indexing out of range
                parentPtr = 0
            }
        }
    }
    generatedTweet := ""
    for _, t := range originTweetTokens {
        if t.Class != tokenizer.DUMMY {
            generatedTweet += t.Surface
        }
    }
    if generatedTweet == "" {
        // if nothing was produced, fall back to "からの"
        generatedTweet = "からの"
    }
    return generatedTweet, nil
}
// anony.go
package anony

import (
    "regexp"
    "unicode/utf8"

    "github.com/ikawaha/kagome/tokenizer"
)

var (
    t = tokenizer.New()
)

func init() {
    tokenizer.SysDic()
}

func Anony(text string, single bool) string {
    tokens := t.Tokenize(text)
    var rText string
    var IniCount int
    for j := 0; j < len(tokens); j++ {
        tk := tokens[j]
        if tk.Class == tokenizer.DUMMY { // skip BOS/EOS
            continue
        }
        ft := tk.Features()
        if len(ft) > 7 && ft[2] == "人名" && ft[1] == "固有名詞" {
            if IniCount == 0 {
                rText += Word2initial(ft[7])
            } else if IniCount == 1 && single == false {
                rText += "・"
                rText += Word2initial(ft[7])
            }
            // NOTE: the original snippet is truncated here; the rest is a
            // minimal reconstruction that keeps counting name tokens and
            // passes everything else through unchanged.
            IniCount++
            continue
        }
        rText += tk.Surface
        IniCount = 0
    }
    return rText
}
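// Illustrative use of Anony. Word2initial presumably maps a katakana
// reading to an initial, so a personal name would be masked to something
// like "Y・T" when single is false; the exact output is an assumption.
func exampleAnony() string {
    return Anony("山田太郎と会った", false)
}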
// NewZenrizer returns a Zenrizer instance.
func NewZenrizer() *Zenrizer {
    return &Zenrizer{
        tokenizer: tokenizer.New(),
    }
}
// Find returns the sentences in text that match the rule(s).
func Find(text string, rule []int) []string {
    if len(rule) == 0 {
        return nil
    }
    t := tokenizer.New()
    text = reIgnoreText.ReplaceAllString(text, "")
    tokens := t.Tokenize(text)
    pos := 0
    r := make([]int, len(rule))
    copy(r, rule)
    sentence := ""
    start := 0
    ambiguous := 0
    ret := []string{}
    for i := 0; i < len(tokens); i++ {
        tok := tokens[i]
        c := tok.Features()
        if len(c) == 0 {
            continue
        }
        y := c[len(c)-1]
        if !reWord.MatchString(y) {
            if y == "、" {
                continue
            }
            pos = 0
            ambiguous = 0
            sentence = ""
            copy(r, rule)
            continue
        }
        if r[pos] == rule[pos] && !isWord(c) {
            pos = 0
            ambiguous = 0
            sentence = ""
            copy(r, rule)
            continue
        }
        ambiguous += strings.Count(y, "ッ") + strings.Count(y, "ー")
        n := countChars(y)
        r[pos] -= n
        sentence += tok.Surface
        if r[pos] == 0 || r[pos]+ambiguous == 0 {
            pos++
            if pos >= len(r) {
                ret = append(ret, sentence)
                start = i + 1
                pos = 0
                ambiguous = 0
                sentence = ""
                copy(r, rule)
                continue
            }
            sentence += " "
        } else if r[pos] < 0 {
            i = start + 1
            start++
            pos = 0
            ambiguous = 0
            sentence = ""
            copy(r, rule)
        }
    }
    return ret
}
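// Illustrative: pull 5-7-5 candidates out of running text. Matched phrases
// come back with a space between rule segments, as built above.
func findHaiku(text string) []string {
    return Find(text, []int{5, 7, 5})
}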
func NewKagomeMorphTokenizer() *KagomeMorphTokenizer {
    return &KagomeMorphTokenizer{
        tok: tokenizer.New(),
    }
}