Example No. 1
func NewKagomeMorphTokenizerWithUserDic(userdic tokenizer.UserDic) *KagomeMorphTokenizer {
	k := tokenizer.New()
	k.SetUserDic(userdic)
	return &KagomeMorphTokenizer{
		tok: k,
	}
}
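A minimal call-site sketch for this constructor, assuming a user-dictionary file on disk and the tokenizer.NewUserDic loader used in the later examples (the path is illustrative):
// Hypothetical caller: load a user dictionary, then build the wrapper.
udic, err := tokenizer.NewUserDic("userdic.txt") // illustrative path
if err != nil {
	panic(err) // or handle the error as appropriate
}
kt := NewKagomeMorphTokenizerWithUserDic(udic)
_ = kt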
Example No. 2
// Match returns true when the text matches the rule(s).
func Match(text string, rule []int) bool {
	t := tokenizer.New()
	text = reIgnoreText.ReplaceAllString(text, "")
	tokens := t.Tokenize(text)
	pos := 0
	r := make([]int, len(rule))
	copy(r, rule)

	for i := 0; i < len(tokens); i++ {
		tok := tokens[i]
		c := tok.Features()
		if len(c) == 0 {
			continue
		}
		y := c[len(c)-1]
		if !reWord.MatchString(y) {
			if y == "、" {
				continue
			}
			return false
		}
		if r[pos] == rule[pos] && !isWord(c) {
			return false
		}
		n := countChars(y)
		r[pos] -= n
		if r[pos] == 0 {
			pos++
			if pos == len(r) && i == len(tokens)-2 {
				return true
			}
		}
	}
	return false
}
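A hedged call sketch for Match: the rule slice holds per-phrase counts that the loop above decrements, so a caller could pass something like a 5-7-5 pattern; whether a particular sentence matches depends on the dictionary and on countChars.
// Hypothetical usage; the sentence and the 5-7-5 rule are only examples.
if Match("古池や蛙飛び込む水の音", []int{5, 7, 5}) {
	fmt.Println("matched the 5-7-5 rule")
}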
Example No. 3
func main() {
	t := tokenizer.New()
	tokens := t.Tokenize("寿司が食べたい。") // t.Analyze("寿司が食べたい。", tokenizer.Normal)
	for _, token := range tokens {
		if token.Class == tokenizer.DUMMY {
			// BOS: Begin Of Sentence, EOS: End Of Sentence.
			fmt.Printf("%s\n", token.Surface)
			continue
		}
		features := strings.Join(token.Features(), ",")
		fmt.Printf("%s\t%v\n", token.Surface, features)
	}
}
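The commented-out call above points at kagome's mode-aware API; assuming the v1 Analyze signature and the Search tokenize mode, only the tokenization line of the program changes:
// Variant of the tokenize line (assumed v1 API); the printing loop stays the same.
tokens := t.Analyze("寿司が食べたい。", tokenizer.Search)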
Example No. 4
func tokenize(sen string) string {
	t := tokenizer.New()
	tokens := t.Tokenize(sen)
	var buf bytes.Buffer
	fmt.Fprintln(&buf, "```")
	for i := 1; i < len(tokens); i++ {
		if tokens[i].Class == tokenizer.DUMMY {
			fmt.Fprintf(&buf, "%s\n", tokens[i].Surface)
			continue
		}
		features := strings.Join(tokens[i].Features(), ",")
		fmt.Fprintf(&buf, "%s\t%v\n", tokens[i].Surface, features)
	}
	fmt.Fprintln(&buf, "```")
	return buf.String()
}
Example No. 5
File: cmd.go Project: ghyde/kagome
// command main
func command(opt *option) error {
	t := tokenizer.New()
	var out = os.Stdout
	if opt.output != "" {
		var err error
		out, err = os.OpenFile(opt.output, os.O_RDWR|os.O_TRUNC|os.O_CREATE, 0666)
		if err != nil {
			fmt.Fprintln(ErrorWriter, err)
			os.Exit(1)
		}
		defer out.Close()
	}
	if opt.udic != "" {
		udic, err := tokenizer.NewUserDic(opt.udic)
		if err != nil {
			return err
		}
		t.SetUserDic(udic)
	}

	tokens := t.Dot(opt.input, out)
	if opt.verbose {
		for i, size := 1, len(tokens); i < size; i++ {
			tok := tokens[i]
			f := tok.Features()
			if tok.Class == tokenizer.DUMMY {
				fmt.Fprintf(ErrorWriter, "%s\n", tok.Surface)
			} else {
				fmt.Fprintf(ErrorWriter, "%s\t%v\n", tok.Surface, strings.Join(f, ","))
			}
		}
	}
	return nil
}
Example No. 6
File: cmd.go Project: ghyde/kagome
// command main
func command(opt *option) error {
	var udic tokenizer.UserDic
	if opt.udic != "" {
		var err error
		if udic, err = tokenizer.NewUserDic(opt.udic); err != nil {
			return err
		}
	}
	t := tokenizer.New()
	t.SetUserDic(udic)

	mux := http.NewServeMux()
	mux.Handle("/", &TokenizeDemoHandler{tokenizer: t})
	mux.Handle("/a", &TokenizeHandler{tokenizer: t})
	log.Fatal(http.ListenAndServe(opt.http, mux))

	return nil
}
Example No. 7
// Tokenize tokenizes the string and returns its tokens.
func (m *MultilangTokenizer) Tokenize(s string) []string {
	switch m.lang {
	case language.Japanese:
		t := kagomeTokenizer.New()
		tokens := t.Tokenize(s)
		// tokens contains BOS and EOS tokens,
		// so the capacity is calculated without them.
		ret := make([]string, 0, len(tokens)-2)
		for _, token := range tokens {
			if token.Class != kagomeTokenizer.DUMMY {
				ret = append(ret, token.Surface)
			}
		}
		return ret
	default:
		// space separated language
		return strings.Split(s, " ")
	}
}
Example No. 8
// AnalizeTargets analyzes the target strings.
func (a *Analyzer) AnalizeTargets() error {

	// word segmentation (wakachi-gaki)

	if len(a.targets) == 0 {
		return errors.New("There are no targets")
	}

	t := tokenizer.New()
	for _, v := range a.targets {
		tokens := t.Tokenize(v)
		for _, token := range tokens {
			if token.Class != tokenizer.DUMMY {
				a.analyzedTargets = append(a.analyzedTargets, token.Surface)
			}
		}
	}
	return nil
}
Example No. 9
func (ctype *countType) countWordJp() int {

	tkn := tokenizer.New()
	morphs := tkn.Tokenize(ctype.text)

	var wc = 0

	re1 := regexp.MustCompile("^[" + regexp.QuoteMeta("-[]{}()|") + ",.*+=_:;~!@#$%^&?`'/]+$")
	// [ and ] are escaped so they are matched literally inside the character class
	re2 := regexp.MustCompile(`^[(){}「」『』\[\]、。]+$`)

	for _, m := range morphs {

		word := m.Surface
		if word == "EOS" || word == "BOS" || re1.MatchString(word) || re2.MatchString(word) {
			continue
		}

		//fmt.Printf("%s %v\n", m.Surface, m.Features())
		f := m.Features()
		class := f[0]
		// keep only nouns, adjectives, verbs and adverbs
		if !strings.Contains(class, "名詞") &&
			!strings.Contains(class, "形容詞") &&
			!strings.Contains(class, "動詞") &&
			!strings.Contains(class, "副詞") {
			continue
		}

		if any(word, ctype.stopWords) {
			continue
		}

		if _, ok := ctype.MapWordFreq[word]; ok {
			ctype.MapWordFreq[word]++
		} else {
			ctype.MapWordFreq[word] = 1
		}

		wc++
	}

	return wc
}
Example No. 10
func (b Bot) Tokenize(m Message) {
	sen := m.TextBody()
	t := tokenizer.New()
	tokens := t.Tokenize(sen)
	var buf bytes.Buffer
	fmt.Fprintln(&buf, "```")
	for i := 1; i < len(tokens); i++ {
		if tokens[i].Class == tokenizer.DUMMY {
			fmt.Fprintf(&buf, "%s\n", tokens[i].Surface)
			continue
		}
		features := strings.Join(tokens[i].Features(), ",")
		fmt.Fprintf(&buf, "%s\t%v\n", tokens[i].Surface, features)
	}
	fmt.Fprintln(&buf, "```")
	m.Text = buf.String()
	if e := b.PostMessage(m); e != nil {
		log.Printf("tokenize, post error, %v", e)
	}
}
Example No. 11
File: main.go Project: sorami/gopun
func main() {

	app := cli.NewApp()
	app.Name = Name
	app.Version = Version
	app.Author = "Sorami Hisamoto"
	app.Email = ""
	app.Usage = "Create Golang pun in Japanese."

	app.Flags = GlobalFlags
	app.Commands = Commands
	app.CommandNotFound = CommandNotFound

	app.Action = func(c *cli.Context) {
		scanner := bufio.NewScanner(os.Stdin)
		t := tokenizer.New()

		for scanner.Scan() {
			line := scanner.Text()

			tokens := t.Tokenize(line)
			for _, token := range tokens {
				if token.Class == tokenizer.DUMMY {
					continue
				}
				yomi := token.Features()[7] // reading in katakana (IPA dictionary feature index 7)
				if yomi == "ゴラン" {
					fmt.Printf("Golang")
				} else if yomi == "ゴ" {
					fmt.Printf("Go")
				} else {
					fmt.Printf("%s", token.Surface)
				}
			}
			fmt.Printf("\n")
		}
	}

	app.Run(os.Args)
}
Example No. 12
func main() {
	mongodbCredential := os.Getenv("KINDLIZED_MONGODB")
	session, err := mgo.Dial(mongodbCredential)
	if err != nil {
		log.Fatal(err)
	}
	defer session.Close()

	database := session.DB("kindlized")

	booksCollection := database.C("books")
	query := booksCollection.Find(bson.M{})

	b := new(Book)
	query.One(&b)

	title := b.Title[0]

	t := tokenizer.New()
	tokens := t.Tokenize(title)
	for _, token := range tokens {
		if token.Class == tokenizer.KNOWN {
			log.Println(token.Surface)
			newWord := &Word{
				ID:        bson.NewObjectId(),
				Word:      token.Surface,
				LastQuery: time.Now(),
			}

			q := database.C("words").Find(bson.M{
				"word": token.Surface,
			})
			count, _ := q.Count()
			if count == 0 {
				database.C("words").Insert(newWord)
			}
		}
	}
	// var books []Book
	// query.All(&books)
	// log.Println(books)
}
Example No. 13
File: ma.go Project: lanevok/myPro
/*
 * GetNounMapForMA morphologically analyzes the text, counts occurrences of
 * nouns (excluding numerals), and returns them as map[word string]count int.
 * Note: only words that appear at least twice are kept in the map, and some
 * stop words are removed.
 */
func GetNounMapForMA(text string) map[string]int {
	res := make(map[string]int)

	t := tokenizer.New()
	tokens := t.Tokenize(text)
	for _, token := range tokens {
		features := strings.Join(token.Features(), ",")

		if strings.HasPrefix(features, "名詞") &&
			!strings.Contains(features, "数") {

			res[token.Surface]++

		}
	}

	stopWord := []string{"http", "htm", "com", "co", "jp",
		"://", "/", "&#", ",", ".", "-", "www",
		"&", ";", ":", ":&", ",'", "(", ")", "+", ";&", ";-"}

	set := mapset.NewSet()
	for _, v := range stopWord {
		set.Add(v)
	}

	for key, value := range res {
		if value == 1 || set.Contains(key) {
			delete(res, key)
		}
	}

	return res
}
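A small call sketch for GetNounMapForMA; someJapaneseText is a placeholder for any input string, and only words that survive the frequency and stop-word filters are printed:
// Hypothetical caller: print each retained noun with its count.
for w, n := range GetNounMapForMA(someJapaneseText) {
	fmt.Printf("%s\t%d\n", w, n)
}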
Example No. 14
// getSentences builds sentence objects from the text.
func getSentences(text string) []sentence {
	var sentences []sentence
	t := tokenizer.New()

	text = strings.Replace(text, "。", "\n", -1)
	text = strings.Replace(text, ".", "\n", -1)
	text = strings.Replace(text, "？", "？\n", -1) // full-width question mark
	text = strings.Replace(text, "！", "！\n", -1) // full-width exclamation mark
	text = strings.Replace(text, "?", "?\n", -1)
	text = strings.Replace(text, "!", "!\n", -1)
	senstr := strings.Split(text, "\n")

	for i := 0; i < len(senstr); i++ {
		tokens := t.Tokenize(senstr[i])
		var words []word
		var kana string
		for j := 0; j < len(tokens); j++ {
			tk := tokens[j]
			ft := tk.Features()
			if len(ft) > 7 {
				w := word{str: ft[6],
					kana:  ft[7],
					wtype: ft[0],
				}
				words = append(words, w)
				kana += ft[7]
			}
		}
		sentences = append(sentences,
			sentence{
				str:   senstr[i],
				words: words,
				kana:  kana,
			})
	}
	return sentences
}
Example No. 15
File: field.go Project: Rompei/lgb
// CrossParents crosses the parent cells' tweets.
func (f *Field) CrossParents(x, y int, newTweet string) (string, error) {

	// cross-breed the parents

	if x == 0 || x == f.SizeX-1 || y == 0 || y == f.SizeY-1 {
		return "", errors.New("Invalid cell.")
	}

	// collect the tweets of all live neighbouring cells
	var tweets []string
	for dy := -1; dy <= 1; dy++ {
		for dx := -1; dx <= 1; dx++ {
			if dx == 0 && dy == 0 {
				continue
			}
			if f.Points[y+dy][x+dx].IsAlive {
				tweets = append(tweets, f.Points[y+dy][x+dx].Str)
			}
		}
	}

	re1, err := regexp.Compile(`(^|\s)(@|https?://)\S+`)
	if err != nil {
		return "", err
	}
	re2, err := regexp.Compile(`^\s*|\s*$`)
	if err != nil {
		return "", err
	}

	t := tokenizer.New()
	newTweet = re2.ReplaceAllString(re1.ReplaceAllString(newTweet, ""), "")
	originTweetTokens := t.Tokenize(newTweet)
	var parentTweetsTokens [][]tokenizer.Token
	for i, v := range tweets {
		tweets[i] = re2.ReplaceAllString(re1.ReplaceAllString(v, ""), "")
		parentTweetsTokens = append(parentTweetsTokens, t.Tokenize(tweets[i]))
	}

	// inherit one part of speech at a time from the parents
	parentPtr := 0
	for i, ot := range originTweetTokens {
		if ot.Class == tokenizer.DUMMY {
			continue
		}
		for _, t := range parentTweetsTokens[parentPtr] {
			if t.Class != tokenizer.DUMMY && ot.Features()[0] == t.Features()[0] && ot.Features()[1] == t.Features()[1] {
				originTweetTokens[i] = t
				parentPtr++
				break
			}
		}
		if parentPtr == len(parentTweetsTokens)-1 {
			parentPtr = 0
		}
	}

	generatedTweet := ""
	for _, t := range originTweetTokens {
		if t.Class != tokenizer.DUMMY {
			generatedTweet += t.Surface
		}
	}
	if generatedTweet == "" {
		// if nothing was inherited, fall back to the filler 「からの」
		generatedTweet = "からの"
	}

	return generatedTweet, nil
}
Example No. 16
// anony.go
package anony

import (
	"github.com/ikawaha/kagome/tokenizer"
	"regexp"
	"unicode/utf8"
)

var (
	t = tokenizer.New()
)

func init() {
	tokenizer.SysDic() // preload the system dictionary (return value intentionally discarded)
}
func Anony(text string, single bool) string {
	tokens := t.Tokenize(text)
	var rText string
	var IniCount int
	for j := 0; j < len(tokens); j++ {
		tk := tokens[j]
		ft := tk.Features()
		if len(ft) > 7 {
			if ft[2] == "人名" && ft[1] == "固有名詞" {
				if IniCount == 0 {
					rText += Word2initial(ft[7])
				} else if IniCount == 1 && single == false {
					rText += "・"
					rText += Word2initial(ft[7])
				}
Example No. 17
// NewZenrizer returns Zenrizer instance
func NewZenrizer() *Zenrizer {
	return &Zenrizer{
		tokenizer: tokenizer.New(),
	}
}
Example No. 18
// Find returns the sentences in text that match the rule(s).
func Find(text string, rule []int) []string {
	if len(rule) == 0 {
		return nil
	}
	t := tokenizer.New()
	text = reIgnoreText.ReplaceAllString(text, "")
	tokens := t.Tokenize(text)
	pos := 0
	r := make([]int, len(rule))
	copy(r, rule)
	sentence := ""
	start := 0
	ambigous := 0

	ret := []string{}
	for i := 0; i < len(tokens); i++ {
		tok := tokens[i]
		c := tok.Features()
		if len(c) == 0 {
			continue
		}
		y := c[len(c)-1]
		if !reWord.MatchString(y) {
			if y == "、" {
				continue
			}
			pos = 0
			ambigous = 0
			sentence = ""
			copy(r, rule)
			continue
		}
		if r[pos] == rule[pos] && !isWord(c) {
			pos = 0
			ambigous = 0
			sentence = ""
			copy(r, rule)
			continue
		}
		ambigous += strings.Count(y, "ッ") + strings.Count(y, "ー")
		n := countChars(y)
		r[pos] -= n
		sentence += tok.Surface
		if r[pos] == 0 || r[pos]+ambigous == 0 {
			pos++
			if pos >= len(r) {
				ret = append(ret, sentence)
				start = i + 1
				pos = 0
				ambigous = 0
				sentence = ""
				copy(r, rule)
				continue
			}
			sentence += " "
		} else if r[pos] < 0 {
			i = start + 1
			start++
			pos = 0
			ambigous = 0
			sentence = ""
			copy(r, rule)
		}
	}
	return ret
}
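Find is the collecting counterpart of Match from Example No. 2; a hedged call sketch with the same illustrative 5-7-5 rule, where longText is a placeholder input:
// Hypothetical usage: print every matching span found in the text.
for _, s := range Find(longText, []int{5, 7, 5}) {
	fmt.Println(s)
}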
Example No. 19
func NewKagomeMorphTokenizer() *KagomeMorphTokenizer {
	return &KagomeMorphTokenizer{
		tok: tokenizer.New(),
	}
}