// Find returns the sentences in text that match the rule(s).
func Find(text string, rule []int) []string {
	if len(rule) == 0 {
		return nil
	}
	t := kagome.NewTokenizer()
	text = reIgnoreText.ReplaceAllString(text, "")
	tokens := t.Tokenize(text)
	pos := 0
	// r holds the remaining character count for each rule segment.
	r := make([]int, len(rule))
	copy(r, rule)
	sentence := ""
	start := 0
	ret := []string{}
	for i := 0; i < len(tokens); i++ {
		tok := tokens[i]
		c := tok.Features()
		if len(c) == 0 {
			continue
		}
		// The last feature holds the token's reading.
		y := c[len(c)-1]
		if !reWord.MatchString(y) {
			if y == "、" {
				continue
			}
			// Not a countable word: discard the current candidate.
			pos = 0
			sentence = ""
			copy(r, rule)
			continue
		}
		// Each segment must begin with an independent word.
		if r[pos] == rule[pos] && !isWord(c) {
			pos = 0
			sentence = ""
			copy(r, rule)
			continue
		}
		n := countChars(y)
		r[pos] -= n
		sentence += tok.Surface
		if r[pos] == 0 {
			// Segment filled exactly: move on to the next one.
			pos++
			if pos >= len(r) {
				// All segments matched: record the sentence.
				ret = append(ret, sentence)
				start = i + 1
				pos = 0
				sentence = ""
				copy(r, rule)
				continue
			}
			sentence += " "
		} else if r[pos] < 0 {
			// Overshot this segment: reset and retry from just
			// after the previous start position.
			i = start + 1
			start++
			pos = 0
			sentence = ""
			copy(r, rule)
		}
	}
	return ret
}
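// Illustrative usage sketch, not part of the original source: with rule
// {5, 7, 5}, Find extracts haiku-like phrases from free text. As built
// above, the segments of each returned sentence are joined with single
// spaces. printHaiku is a hypothetical helper name.
func printHaiku(text string) {
	for _, s := range Find(text, []int{5, 7, 5}) {
		fmt.Println(s)
	}
}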
func NewKagomeMorphTokenizerWithUserDic(userdic *kagome.UserDic) *KagomeMorphTokenizer {
	k := kagome.NewTokenizer()
	k.SetUserDic(userdic)
	return &KagomeMorphTokenizer{
		tok: k,
	}
}
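// Illustrative sketch, not part of the original source: loading a user
// dictionary from a file before constructing the tokenizer, using the
// same kagome.NewUserDic call as the CLI Main functions below. The helper
// name is hypothetical; any load error is returned as-is.
func newKagomeMorphTokenizerFromFile(path string) (*KagomeMorphTokenizer, error) {
	udic, err := kagome.NewUserDic(path)
	if err != nil {
		return nil, err
	}
	return NewKagomeMorphTokenizerWithUserDic(udic), nil
}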
// Match returns true when the whole text matches the rule(s).
func Match(text string, rule []int) bool {
	// An empty rule can never match.
	if len(rule) == 0 {
		return false
	}
	t := kagome.NewTokenizer()
	text = reIgnoreText.ReplaceAllString(text, "")
	tokens := t.Tokenize(text)
	pos := 0
	r := make([]int, len(rule))
	copy(r, rule)
	for i := 0; i < len(tokens); i++ {
		tok := tokens[i]
		c := tok.Features()
		if len(c) == 0 {
			continue
		}
		y := c[len(c)-1]
		if !reWord.MatchString(y) {
			if y == "、" {
				continue
			}
			return false
		}
		// Each segment must begin with an independent word.
		if r[pos] == rule[pos] && !isWord(c) {
			return false
		}
		n := countChars(y)
		r[pos] -= n
		if r[pos] == 0 {
			pos++
			if pos == len(r) {
				// All segments consumed; match only if this is
				// the last real token (the EOS token follows it).
				return i == len(tokens)-2
			}
		}
	}
	return false
}
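// Illustrative usage sketch, not part of the original source: Match
// checks that the whole text is exactly one 5-7-5 phrase, whereas Find
// above extracts matching runs out of longer text. isHaiku is a
// hypothetical helper name.
func isHaiku(text string) bool {
	return Match(text, []int{5, 7, 5})
}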
// whereIsTweeted guesses the prefecture a tweet relates to by looking up
// proper nouns from the user's description, location, and tweet text.
func whereIsTweeted(t *anaconda.Tweet) (pid PrefId) {
	pid = PrefInvalid
	tokenizer := kagome.NewTokenizer()
	tokens := tokenizer.Tokenize(t.User.Description + " " + t.User.Location + " " + t.Text)
	for _, m := range tokens {
		if m.Id == kagome.BosEosId {
			continue
		}
		features := m.Features()
		// Only proper nouns can name a prefecture; also guard against
		// tokens with short feature lists.
		if len(features) < 2 || features[0] != "名詞" || features[1] != "固有名詞" {
			continue
		}
		pid = PrefDict[m.Surface]
		if pid != PrefInvalid {
			return
		}
	}
	return
}
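// Illustrative sketch, not part of the original source: whereIsTweeted
// relies on PrefDict returning the zero value for unknown surfaces, so
// PrefInvalid has to be the zero PrefId. The type and entries below are
// hypothetical stand-ins for the real declarations.
type PrefId int

const (
	PrefInvalid PrefId = iota // zero value, returned for unknown map keys
	PrefTokyo
	PrefOsaka
)

var PrefDict = map[string]PrefId{
	"東京": PrefTokyo,
	"大阪": PrefOsaka,
}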
func Main(input string) {
	if input == "" {
		usage()
	}
	var out = os.Stdout
	if *fOutputFile != "" {
		var err error
		out, err = os.OpenFile(*fOutputFile, os.O_RDWR|os.O_TRUNC|os.O_CREATE, 0666)
		if err != nil {
			fmt.Fprintln(os.Stderr, err)
			os.Exit(1)
		}
		defer out.Close()
	}
	t := kagome.NewTokenizer()
	if *fUserDicFile != "" {
		udic, err := kagome.NewUserDic(*fUserDicFile)
		if err != nil {
			fmt.Fprintln(os.Stderr, err)
			os.Exit(1)
		}
		t.SetUserDic(udic)
	}
	// Dot writes the tokenization lattice in Graphviz dot format to out
	// and returns the tokens.
	tokens := t.Dot(input, out)
	if *fVerbose {
		// Start at 1 to skip the BOS token.
		for i, size := 1, len(tokens); i < size; i++ {
			tok := tokens[i]
			f := tok.Features()
			if tok.Class == kagome.DUMMY {
				fmt.Fprintf(os.Stderr, "%s\n", tok.Surface)
			} else {
				fmt.Fprintf(os.Stderr, "%s\t%v\n", tok.Surface, strings.Join(f, ","))
			}
		}
	}
}
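// Illustrative usage sketch, not part of the original source: the Dot
// output is Graphviz source, so a file produced this way can be rendered
// with the graphviz CLI, e.g. `dot -Tpng lattice.dot -o lattice.png`.
// The helper and file names are hypothetical.
func writeLattice(input string) error {
	f, err := os.Create("lattice.dot")
	if err != nil {
		return err
	}
	defer f.Close()
	t := kagome.NewTokenizer()
	t.Dot(input, f)
	return nil
}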
func NewKagomeMorphTokenizer() *KagomeMorphTokenizer {
	return &KagomeMorphTokenizer{
		tok: kagome.NewTokenizer(),
	}
}
func Main() {
	if *fHttp != "" && *fInputFile != "" {
		usage()
	}

	var udic *kagome.UserDic
	if *fUserDicFile != "" {
		var err error
		udic, err = kagome.NewUserDic(*fUserDicFile)
		if err != nil {
			fmt.Fprintln(os.Stderr, err)
			os.Exit(1)
		}
	}

	if *fHttp != "" {
		// Serve the tokenizer over HTTP instead of reading input.
		t := kagome.NewTokenizer()
		if udic != nil {
			t.SetUserDic(udic)
		}
		hTok := &KagomeHandler{tokenizer: t}
		hDem := &KagomeDemoHandler{tokenizer: t}
		mux := http.NewServeMux()
		mux.Handle("/", hTok)
		mux.Handle("/_demo", hDem)
		log.Fatal(http.ListenAndServe(*fHttp, mux))
	}

	var inputFile = os.Stdin
	if *fInputFile != "" {
		var err error
		inputFile, err = os.Open(*fInputFile)
		if err != nil {
			fmt.Fprintln(os.Stderr, err)
			os.Exit(1)
		}
		defer inputFile.Close()
	}

	t := kagome.NewTokenizer()
	if udic != nil {
		t.SetUserDic(udic)
	}
	var tokenize = t.Tokenize
	switch *fTokenizeMode {
	case "", "normal":
		// keep the default Tokenize
	case "search":
		tokenize = t.SearchModeTokenize
	case "extended":
		tokenize = t.ExtendedModeTokenize
	default:
		fmt.Fprintf(os.Stderr, "invalid argument: -mode %v\n", *fTokenizeMode)
		usage()
	}

	scanner := bufio.NewScanner(inputFile)
	for scanner.Scan() {
		line := scanner.Text()
		tokens := tokenize(line)
		// Start at 1 to skip the BOS token.
		for i, size := 1, len(tokens); i < size; i++ {
			tok := tokens[i]
			c := tok.Features()
			if tok.Class == kagome.DUMMY {
				fmt.Printf("%s\n", tok.Surface)
			} else {
				fmt.Printf("%s\t%v\n", tok.Surface, strings.Join(c, ","))
			}
		}
	}
	if err := scanner.Err(); err != nil {
		log.Println(err)
	}
}
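// Illustrative sketch, not part of the original source: the -mode switch
// above factored into a helper. Search and extended are kagome
// tokenization modes that, roughly, split long compounds into shorter
// units more aggressively than normal mode. The Tokenizer and Token type
// names are assumed from kagome's exported API; the helper name is
// hypothetical.
func tokenizeWithMode(t kagome.Tokenizer, mode, line string) []kagome.Token {
	switch mode {
	case "search":
		return t.SearchModeTokenize(line)
	case "extended":
		return t.ExtendedModeTokenize(line)
	default:
		return t.Tokenize(line)
	}
}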