Example #1
// Find returns the sentences in text that match the rule(s).
func Find(text string, rule []int) []string {
	if len(rule) == 0 {
		return nil
	}
	t := kagome.NewTokenizer()
	text = reIgnoreText.ReplaceAllString(text, "")
	tokens := t.Tokenize(text)
	pos := 0
	r := make([]int, len(rule))
	copy(r, rule)
	sentence := ""
	start := 0

	ret := []string{}
	for i := 0; i < len(tokens); i++ {
		tok := tokens[i]
		c := tok.Features()
		if len(c) == 0 {
			continue
		}
		y := c[len(c)-1]
		if !reWord.MatchString(y) {
			if y == "、" {
				continue
			}
			pos = 0
			sentence = ""
			copy(r, rule)
			continue
		}
		if r[pos] == rule[pos] && !isWord(c) {
			pos = 0
			sentence = ""
			copy(r, rule)
			continue
		}
		n := countChars(y)
		r[pos] -= n
		sentence += tok.Surface
		if r[pos] == 0 {
			pos++
			if pos >= len(r) {
				ret = append(ret, sentence)
				start = i + 1
				pos = 0
				sentence = ""
				copy(r, rule)
				continue
			}
			sentence += " "
		} else if r[pos] < 0 {
			i = start + 1
			start++
			pos = 0
			sentence = ""
			copy(r, rule)
		}
	}
	return ret
}
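
A minimal usage sketch for Find, assuming rule is a sequence of per-segment character counts (a 5-7-5 pattern, as in a haiku, is the natural reading of countChars and the rule slice); the input text below is hypothetical and fmt is assumed imported:

// Hypothetical: look for three consecutive segments of 5, 7,
// and 5 counted characters and print each matching sentence.
for _, s := range Find("古池や蛙飛び込む水の音", []int{5, 7, 5}) {
	fmt.Println(s)
}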
Example #2
// NewKagomeMorphTokenizerWithUserDic returns a KagomeMorphTokenizer that
// consults the given user dictionary in addition to the system dictionary.
func NewKagomeMorphTokenizerWithUserDic(userdic *kagome.UserDic) *KagomeMorphTokenizer {
	k := kagome.NewTokenizer()
	k.SetUserDic(userdic)
	return &KagomeMorphTokenizer{
		tok: k,
	}
}
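
A sketch of plugging in a user dictionary, reusing kagome.NewUserDic as seen in Examples #5 and #7; the userdic.txt path is hypothetical and log is assumed imported:

udic, err := kagome.NewUserDic("userdic.txt") // hypothetical path
if err != nil {
	log.Fatal(err)
}
tokenizer := NewKagomeMorphTokenizerWithUserDic(udic)
_ = tokenizer // ready to tokenize with the extra entries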
Example #3
// Match returns true when the text matches the rule(s).
func Match(text string, rule []int) bool {
	t := kagome.NewTokenizer()
	text = reIgnoreText.ReplaceAllString(text, "")
	tokens := t.Tokenize(text)
	pos := 0
	r := make([]int, len(rule))
	copy(r, rule)

	for i := 0; i < len(tokens); i++ {
		tok := tokens[i]
		c := tok.Features()
		if len(c) == 0 {
			continue
		}
		y := c[len(c)-1]
		if !reWord.MatchString(y) {
			if y == "、" {
				continue
			}
			return false
		}
		if r[pos] == rule[pos] && !isWord(c) {
			return false
		}
		n := countChars(y)
		r[pos] -= n
		if r[pos] == 0 {
			pos++
			if pos == len(r) && i == len(tokens)-2 {
				return true
			}
		}
	}
	return false
}
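
Where Find extracts every matching sentence, Match asks whether the text as a whole fits the rule: the i == len(tokens)-2 check accepts only when the last counted token sits directly before the trailing EOS token. A minimal sketch with a hypothetical input:

if Match("古池や蛙飛び込む水の音", []int{5, 7, 5}) {
	fmt.Println("the text scans as 5-7-5")
}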
Example #4
// whereIsTweeted guesses the prefecture a tweet relates to by looking up
// proper nouns from the user's description, location, and tweet text in PrefDict.
func whereIsTweeted(t *anaconda.Tweet) (pid PrefId) {
	pid = PrefInvalid
	tokenizer := kagome.NewTokenizer()
	tokens := tokenizer.Tokenize(t.User.Description + " " + t.User.Location + " " + t.Text)
	for _, m := range tokens {
		if m.Id == kagome.BosEosId {
			continue
		}
		features := m.Features()
		if len(features) < 2 || features[0] != "名詞" || features[1] != "固有名詞" {
			continue
		}
		pid = PrefDict[m.Surface]
		if pid != PrefInvalid {
			return
		}
		//fmt.Printf("%s features:%s\n", m, m.Features())
	}
	return
}
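
The proper-noun filter in Example #4 (first feature 名詞, second 固有名詞) works on any kagome token stream; a minimal sketch using only calls already seen above, with a length guard for tokens that carry fewer features:

t := kagome.NewTokenizer()
for _, tok := range t.Tokenize("東京は晴れ") {
	if tok.Id == kagome.BosEosId {
		continue // skip the BOS/EOS markers
	}
	if f := tok.Features(); len(f) > 1 && f[0] == "名詞" && f[1] == "固有名詞" {
		fmt.Println(tok.Surface) // proper nouns only, e.g. 東京
	}
}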
Example #5
// Main tokenizes the input, writes the result of t.Dot to the output file
// (or stdout), and with -verbose dumps each token's features to stderr.
func Main(input string) {
	if input == "" {
		usage()
	}
	var out = os.Stdout
	if *fOutputFile != "" {
		var err error
		out, err = os.OpenFile(*fOutputFile, os.O_RDWR|os.O_TRUNC|os.O_CREATE, 0666)
		if err != nil {
			fmt.Fprintln(os.Stderr, err)
			os.Exit(1)
		}
		defer out.Close()
	}

	t := kagome.NewTokenizer()
	if *fUserDicFile != "" {
		if udic, err := kagome.NewUserDic(*fUserDicFile); err != nil {
			fmt.Fprintln(os.Stderr, err)
			os.Exit(1)
		} else {
			t.SetUserDic(udic)
		}
	}

	tokens := t.Dot(input, out)
	if *fVerbose {
		for i, size := 1, len(tokens); i < size; i++ {
			tok := tokens[i]
			f := tok.Features()
			if tok.Class == kagome.DUMMY {
				fmt.Fprintf(os.Stderr, "%s\n", tok.Surface)
			} else {
				fmt.Fprintf(os.Stderr, "%s\t%v\n", tok.Surface, strings.Join(f, ","))
			}
		}
	}
}
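
Example #5 suggests Dot both returns the tokens of the best path and writes a Graphviz rendering of the lattice to its writer; since out above is an *os.File, any io.Writer should work. A sketch under that assumption:

t := kagome.NewTokenizer()
var buf bytes.Buffer
tokens := t.Dot("すもももももももものうち", &buf)
fmt.Printf("%d tokens, %d bytes of dot output\n", len(tokens), buf.Len())
// buf can now be fed to the dot command to draw the lattice.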
Example #6
// NewKagomeMorphTokenizer returns a KagomeMorphTokenizer backed by a fresh
// kagome tokenizer with the default system dictionary.
func NewKagomeMorphTokenizer() *KagomeMorphTokenizer {
	return &KagomeMorphTokenizer{
		tok: kagome.NewTokenizer(),
	}
}
Example #7
// Main either serves the tokenizer over HTTP (-http) or tokenizes an input
// file (or stdin) line by line, printing one token per line.
func Main() {
	if *fHttp != "" && *fInputFile != "" {
		usage()
	}

	var udic *kagome.UserDic
	if *fUserDicFile != "" {
		var err error
		udic, err = kagome.NewUserDic(*fUserDicFile)
		if err != nil {
			fmt.Fprintln(os.Stderr, err)
			os.Exit(1)
		}
	}

	if *fHttp != "" {
		t := kagome.NewTokenizer()
		if udic != nil {
			t.SetUserDic(udic)
		}
		hTok := &KagomeHandler{tokenizer: t}
		hDem := &KagomeDemoHandler{tokenizer: t}
		mux := http.NewServeMux()
		mux.Handle("/", hTok)
		mux.Handle("/_demo", hDem)
		log.Fatal(http.ListenAndServe(*fHttp, mux))
	}

	var inputFile = os.Stdin
	if *fInputFile != "" {
		var err error
		inputFile, err = os.Open(*fInputFile)
		if err != nil {
			fmt.Fprintln(os.Stderr, err)
			os.Exit(1)
		}
		defer inputFile.Close()
	}

	t := kagome.NewTokenizer()
	if udic != nil {
		t.SetUserDic(udic)
	}

	var tokenize = t.Tokenize
	switch *fTokenizeMode {
	case "", "normal":
		// keep the default tokenizer
	case "search":
		tokenize = t.SearchModeTokenize
	case "extended":
		tokenize = t.ExtendedModeTokenize
	default:
		fmt.Fprintf(os.Stderr, "invalid argument: -mode %v\n", *fTokenizeMode)
		usage()
	}

	scanner := bufio.NewScanner(inputFile)
	for scanner.Scan() {
		line := scanner.Text()
		tokens := tokenize(line)
		for i, size := 1, len(tokens); i < size; i++ {
			tok := tokens[i]
			c := tok.Features()
			if tok.Class == kagome.DUMMY {
				fmt.Printf("%s\n", tok.Surface)
			} else {
				fmt.Printf("%s\t%v\n", tok.Surface, strings.Join(c, ","))
			}
		}
	}
	if err := scanner.Err(); err != nil {
		log.Println(err)
	}
}
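
Example #7 reassigns tokenize freely between the three mode methods, implying they share one signature; assuming that signature is func(string) []kagome.Token, a sketch comparing the modes on a compound word (関西国際空港 is the classic case where search and extended modes split more aggressively than normal mode):

t := kagome.NewTokenizer()
for _, tokenize := range []func(string) []kagome.Token{
	t.Tokenize, t.SearchModeTokenize, t.ExtendedModeTokenize,
} {
	var surfaces []string
	for _, tok := range tokenize("関西国際空港") {
		if tok.Class == kagome.DUMMY {
			continue // skip BOS/EOS
		}
		surfaces = append(surfaces, tok.Surface)
	}
	fmt.Println(strings.Join(surfaces, " / "))
}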