示例#1
0
// allowedTokenPattern matches any token containing at least one lowercase
// ASCII letter or one character in the U+4E00..U+9FA5 range. It is compiled
// once instead of on every loop iteration.
var allowedTokenPattern = regexp.MustCompile("[a-z\u4e00-\u9fa5]")

// ToSlices segments txt and returns the distinct tokens that contain at least
// one lowercase ASCII letter or one character in the U+4E00..U+9FA5 range.
//
// searchMode is forwarded unchanged to sego.SegmentsToSlice. Each returned
// token has surrounding whitespace removed, and tokens are de-duplicated on
// that trimmed form, so " foo" and "foo" yield a single entry. Tokens are
// returned in order of first appearance. The result is nil when no token
// qualifies.
func (this *SegoController) ToSlices(txt string, searchMode bool) []string {
	segments := segmenter.Segment([]byte(txt))
	tokens := sego.SegmentsToSlice(segments, searchMode)

	seen := make(map[string]struct{}, len(tokens))
	var unique []string

	for _, token := range tokens {
		// Drop tokens with no letter/ideograph at all (punctuation, digits, ...).
		if !allowedTokenPattern.MatchString(token) {
			continue
		}

		// De-duplicate on the trimmed form: that is the value callers receive.
		word := strings.TrimSpace(token)
		if _, dup := seen[word]; dup {
			continue
		}
		seen[word] = struct{}{}
		unique = append(unique, word)
	}

	return unique
}
示例#2
0
// Segment splits content into tokens using the wrapped sego segmenter and
// returns them as a slice of strings. search_mode is passed through to
// sego.SegmentsToSlice unchanged.
func (this *Segmenter) Segment(content string, search_mode bool) []string {
	return sego.SegmentsToSlice(this.segmenter.Segment([]byte(content)), search_mode)
}
示例#3
0
文件: example.go 项目: widaT/sego
// main loads the sego dictionary, segments two sample sentences in search
// mode, counts how often each word occurs in either sentence, and prints every
// word with its counts followed by the dot product, the two squared norms and
// the resulting cosine similarity.
func main() {
	const (
		firstText  = "我喜欢看电视,也喜欢接电话"
		secondText = "我不喜欢看电视,也不喜欢看电影"
	)

	var seg sego.Segmenter
	seg.LoadDictionary("data/dictionary.txt")

	// freq holds the occurrence count of one word in the first (V) and
	// second (W) sentence.
	type freq struct {
		V, W int
	}
	counts := make(map[string]*freq, 10)

	firstSegments := seg.Segment([]byte(firstText))
	secondSegments := seg.Segment([]byte(secondText))

	for _, word := range sego.SegmentsToSlice(firstSegments, true) {
		if entry, seen := counts[word]; seen {
			entry.V++
		} else {
			counts[word] = &freq{V: 1}
		}
	}
	for _, word := range sego.SegmentsToSlice(secondSegments, true) {
		if entry, seen := counts[word]; seen {
			entry.W++
		} else {
			counts[word] = &freq{W: 1}
		}
	}

	// Cosine similarity: dot(V, W) / (|V| * |W|).
	var dot, sqNormV, sqNormW int
	for word, entry := range counts {
		dot += entry.V * entry.W
		sqNormV += entry.V * entry.V
		sqNormW += entry.W * entry.W

		fmt.Println(word, entry)
	}

	cosine := float64(dot) / (math.Sqrt(float64(sqNormV)) * math.Sqrt(float64(sqNormW)))
	fmt.Println(dot, sqNormV, sqNormW, cosine)
}
示例#4
0
// Filter masks dirty words in in.Text and returns the cleaned text.
//
// The text is segmented with s.segmenter; every resulting word whose
// upper-cased form is flagged in s.dirtyWords is replaced, case-insensitively
// and everywhere it occurs in the text, by a run of "▇" with the same number
// of runes as the word.
//
// An error is returned if the pattern for a word cannot be compiled, instead
// of dereferencing a nil *regexp.Regexp.
func (s *server) Filter(ctx context.Context, in *pb.WordFilterRequest) (*pb.WordFilterResponse, error) {
	segments := s.segmenter.Segment([]byte(in.Text))
	cleanText := in.Text

	for _, word := range sego.SegmentsToSlice(segments, false) {
		if !s.dirtyWords[strings.ToUpper(word)] {
			continue
		}

		// QuoteMeta makes the word a literal; (?i:...) matches any casing.
		reg, err := regexp.Compile("(?i:" + regexp.QuoteMeta(word) + ")")
		if err != nil {
			return nil, err
		}

		// Mask rune-for-rune so the cleaned text keeps its visible length.
		mask := strings.Repeat("▇", utf8.RuneCountInString(word))
		cleanText = reg.ReplaceAllLiteralString(cleanText, mask)
	}

	return &pb.WordFilterResponse{cleanText}, nil
}
示例#5
0
文件: tokenizer.go 项目: Leon2012/xs
// GetTokens segments value with the tokenizer's sego segmenter and returns the
// tokens as strings, with search mode disabled. doc is not used.
func (x *XSTokenizerSego) GetTokens(value string, doc *XSDocument) []string {
	return sego.SegmentsToSlice(x.segmenter.Segment([]byte(value)), false)
}