Beispiel #1
0
// Test runs the default tokenizer over a sample mixing ASCII words, Chinese
// text, a percentage, alphanumerics and a dotted date, printing each value
// yielded by the iterator on its own line.
func Test(t *testing.T) {
	const sample = "Hello,World!你好,世界!14.4%,abc1233,11a-b,2015.2.1"

	it := New().Traverse(cwsharp.NewStringReader(sample))
	for it.Next() {
		fmt.Println(it.Cur())
	}
}
Beispiel #2
0
// Test walks the iterator produced by the default tokenizer for a short
// Chinese sentence and prints every value it yields, one per line.
func Test(t *testing.T) {
	seg := New()
	for it := seg.Traverse(cwsharp.NewStringReader("一次性交100元")); it.Next(); {
		fmt.Println(it.Cur())
	}
}
Beispiel #3
0
// Test1 builds a tokenizer from the dictionary location passed to New and
// prints, one per line, each value the iterator yields for a two-sentence
// Chinese sample.
func Test1(t *testing.T) {
	// NOTE(review): New is handed a URL here; whether it fetches over the
	// network or treats the string as a path is not visible in this snippet.
	const dict = "https://github.com/zhengchun/cwsharp-go/raw/master/data/cwsharp.dawg"

	seg := New(dict)
	for it := seg.Traverse(cwsharp.NewStringReader("研究生命起源.一次性交100元")); it.Next(); {
		fmt.Println(it.Cur())
	}
}
Beispiel #4
0
// TestStandard is the standard word-segmentation test: it prints a header
// containing text, then every value yielded by an mmseg tokenizer built from
// dawgFile, each followed by " / ", and finally a newline.
func TestStandard(dawgFile string, text string) {
	fmt.Println("测试:" + text)

	it := mmseg.New(dawgFile).Traverse(cwsharp.NewStringReader(text))
	for it.Next() {
		// A single Print emits the value immediately followed by the
		// separator; no space is inserted because one operand is a string.
		fmt.Print(it.Cur(), " / ")
	}
	fmt.Println()
}
Beispiel #5
0
// main wraps an mmseg tokenizer in a stopword filter and prints the values
// yielded for two sample sentences, each followed by " / ", one sentence per
// output line.
func main() {
	filter := cwsharp.NewStopwordFilter(mmseg.New("data//cwsharp.dawg"))
	stopwords := map[string]bool{"the": true}

	// CheckIgnore reports true for punctuation tokens and for any token whose
	// text is a key of stopwords (the lookup is an exact string match).
	filter.CheckIgnore = func(t cwsharp.Token) bool {
		_, listed := stopwords[t.Text()]
		return t.Kind() == cwsharp.PUNCT || listed
	}

	samples := []string{"长春市长春药店", "The quick brown fox jumps over the lazy dog"}
	for _, sentence := range samples {
		it := filter.Traverse(cwsharp.NewStringReader(sentence))
		for it.Next() {
			fmt.Print(it.Cur())
			fmt.Print(" / ")
		}
		fmt.Println()
	}
}