func (this *SegoController) ToSlices(txt string, searchMode bool) []string { text := []byte(txt) segments := segmenter.Segment(text) // 分词 result := sego.SegmentsToSlice(segments, searchMode) // 去重 in := map[string]string{} for k, _ := range result { if _, ok := in[result[k]]; !ok { //不存在值才加入 // 只允许中文、字母 if matched, _ := regexp.MatchString("[a-z\u4e00-\u9fa5]", result[k]); !matched { continue } in[result[k]] = strings.TrimSpace(result[k]) } } // 生成新数组 var newResult []string for k, _ := range in { newResult = append(newResult, in[k]) } return newResult }
func (this *Segmenter) Segment(content string, search_mode bool) []string { text := []byte(content) segments := this.segmenter.Segment(text) res := sego.SegmentsToSlice(segments, search_mode) //fmt.Println("SEGMENT::: ",res) return res }
func main() { var seg sego.Segmenter seg.LoadDictionary("data/dictionary.txt") text1 := "我喜欢看电视,也喜欢接电话" text2 := "我不喜欢看电视,也不喜欢看电影" type num struct { V, W int } var words = make(map[string]*num, 10) segments1 := seg.Segment([]byte(text1)) segments2 := seg.Segment([]byte(text2)) for _, v := range sego.SegmentsToSlice(segments1, true) { _, ok := words[v] if ok { p := words[v] p.V = p.V + 1 words[v] = p } else { words[v] = &num{1, 0} } } for _, v := range sego.SegmentsToSlice(segments2, true) { _, ok := words[v] if ok { p := words[v] p.W = p.W + 1 words[v] = p } else { words[v] = &num{0, 1} } } //余弦相似度计算 var a, b, c int for k, v := range words { a = a + v.V*v.W b = b + v.V*v.V c = c + v.W*v.W fmt.Println(k, v) } fmt.Println(a, b, c, float64(a)/(math.Sqrt(float64(b))*math.Sqrt(float64(c)))) }
func (s *server) Filter(ctx context.Context, in *pb.WordFilterRequest) (*pb.WordFilterResponse, error) { segments := s.segmenter.Segment([]byte(in.Text)) cleanText := in.Text words := sego.SegmentsToSlice(segments, false) for k := range words { if s.dirtyWords[strings.ToUpper(words[k])] { reg, _ := regexp.Compile("(?i:" + regexp.QuoteMeta(words[k]) + ")") replacement := strings.Repeat("▇", utf8.RuneCountInString(words[k])) cleanText = reg.ReplaceAllLiteralString(cleanText, replacement) } } return &pb.WordFilterResponse{cleanText}, nil }
func (x *XSTokenizerSego) GetTokens(value string, doc *XSDocument) []string { segments := x.segmenter.Segment([]byte(value)) return sego.SegmentsToSlice(segments, false) }