func NewSearchServer(corpusFile, sgmtDictFile string) *SearchServer { sgmt := new(sego.Segmenter) sgmt.LoadDictionary(sgmtDictFile) idx := weakand.NewIndexFromFile(corpusFile, sgmt, "") // no index dump return &SearchServer{index: idx, sgmt: sgmt} }
func guaranteeSegmenter(sgmt **sego.Segmenter) { if *sgmt == nil { s := new(sego.Segmenter) s.LoadDictionary(path.Join(gosrc(), "github.com/huichen/sego/data/dictionary.txt")) *sgmt = s } }
func main() { flag.Parse() var seg sego.Segmenter seg.LoadDictionary("../data/dictionary.txt") segments := seg.Segment([]byte(*text)) fmt.Println(sego.SegmentsToString(segments, true)) }
func tokenize(doc string, sgmt *sego.Segmenter) []string { var terms []string for _, seg := range sgmt.Segment([]byte(doc)) { term := seg.Token().Text() if !AllPunctOrSpace(term) { terms = append(terms, term) } } return terms }
func Setup(t *telegram.Telegram, config map[string]interface{}, modules map[string]bool, cmds *types.CommandMap) types.Command { if val, ok := modules["chinese"]; !ok || val { var s sego.Segmenter s.LoadDictionary(config["dict"].(string)) c := &Chinese{ tg: t, redis: redis.NewClient(&redis.Options{ Addr: config["redis"].(string), Password: "", DB: int64(config["redis_db"].(float64)), }), seg: s, } if config["debug"] != nil { c.debug = config["debug"].(bool) } (*cmds)["learn"] = types.Command{ Name: "learn", Args: "<expr>", ArgNum: -1, Desc: "Learn a Chinese expression", Processor: c, } (*cmds)["speak"] = types.Command{ Name: "speak", ArgNum: 0, Desc: "Speak a Chinese sentence based on previously learned data", Processor: c, } (*cmds)["answer"] = types.Command{ Name: "answer", Args: "[question]", ArgNum: -1, Desc: "Answer to [question]. If no [question] provided, answer to the message you reply to.", Processor: c, } pong, err := c.redis.Ping().Result() if (err != nil) || (pong != "PONG") { panic(errors.New("Cannot PING redis")) } return types.Command{ Name: "chn", Processor: c, } } return types.Command{} }
func main() { // 载入词典 var segmenter sego.Segmenter segmenter.LoadDictionary("/home/bryce/code/gocode/src/github.com/huichen/sego/data/dictionary.txt") // 分词 text := []byte("中华人民共和国中央人民政府") segments := segmenter.Segment(text) // 处理分词结果 // 支持普通模式和搜索模式两种分词,见代码中SegmentsToString函数的注释。 fmt.Println(sego.SegmentsToString(segments, true)) }
func main() { var seg sego.Segmenter seg.LoadDictionary("data/dictionary.txt") text1 := "我喜欢看电视,也喜欢接电话" text2 := "我不喜欢看电视,也不喜欢看电影" type num struct { V, W int } var words = make(map[string]*num, 10) segments1 := seg.Segment([]byte(text1)) segments2 := seg.Segment([]byte(text2)) for _, v := range sego.SegmentsToSlice(segments1, true) { _, ok := words[v] if ok { p := words[v] p.V = p.V + 1 words[v] = p } else { words[v] = &num{1, 0} } } for _, v := range sego.SegmentsToSlice(segments2, true) { _, ok := words[v] if ok { p := words[v] p.W = p.W + 1 words[v] = p } else { words[v] = &num{0, 1} } } //余弦相似度计算 var a, b, c int for k, v := range words { a = a + v.V*v.W b = b + v.V*v.V c = c + v.W*v.W fmt.Println(k, v) } fmt.Println(a, b, c, float64(a)/(math.Sqrt(float64(b))*math.Sqrt(float64(c)))) }
func main() { // 确保单线程,因为Go从1.5开始默认多线程 runtime.GOMAXPROCS(1) // 解析命令行参数 flag.Parse() // 记录时间 t0 := time.Now() var segmenter sego.Segmenter segmenter.LoadDictionary("../data/dictionary.txt") // 记录时间 t1 := time.Now() log.Printf("载入词典花费时间 %v", t1.Sub(t0)) // 写入内存profile文件 if *memprofile != "" { f, err := os.Create(*memprofile) if err != nil { log.Fatal(err) } pprof.WriteHeapProfile(f) defer f.Close() } // 打开将要分词的文件 file, err := os.Open("../testdata/bailuyuan.txt") if err != nil { log.Fatal(err) } defer file.Close() // 逐行读入 scanner := bufio.NewScanner(file) size := 0 lines := [][]byte{} for scanner.Scan() { var text string fmt.Sscanf(scanner.Text(), "%s", &text) content := []byte(text) size += len(content) lines = append(lines, content) } // 当指定输出文件时打开输出文件 var of *os.File if *output != "" { of, err = os.Create(*output) if err != nil { log.Fatal(err) } defer of.Close() } // 记录时间 t2 := time.Now() // 打开处理器profile文件 if *cpuprofile != "" { f, err := os.Create(*cpuprofile) if err != nil { log.Fatal(err) } pprof.StartCPUProfile(f) defer pprof.StopCPUProfile() } // 分词 for i := 0; i < numRuns; i++ { for _, l := range lines { segments := segmenter.Segment(l) if *output != "" { of.WriteString(sego.SegmentsToString(segments, false)) of.WriteString("\n") } } } // 停止处理器profile if *cpuprofile != "" { defer pprof.StopCPUProfile() } // 记录时间并计算分词速度 t3 := time.Now() log.Printf("分词花费时间 %v", t3.Sub(t2)) log.Printf("分词速度 %f MB/s", float64(size*numRuns)/t3.Sub(t2).Seconds()/(1024*1024)) }