func Test(t *testing.T) { var input string = "Hello,World!你好,世界!14.4%,abc1233,11a-b,2015.2.1" tokenizer := New() for iter := tokenizer.Traverse(cwsharp.NewStringReader(input)); iter.Next(); { fmt.Println(iter.Cur()) } }
func Test(t *testing.T) { input := "一次性交100元" tokenizer := New() iter := tokenizer.Traverse(cwsharp.NewStringReader(input)) for iter.Next() { fmt.Println(iter.Cur()) } }
func Test1(t *testing.T) { input := "研究生命起源.一次性交100元" tokenizer := New("https://github.com/zhengchun/cwsharp-go/raw/master/data/cwsharp.dawg") iter := tokenizer.Traverse(cwsharp.NewStringReader(input)) for iter.Next() { fmt.Println(iter.Cur()) } }
//标准分词测试 func TestStandard(dawgFile string, text string) { fmt.Println("测试:" + text) tokenizer := mmseg.New(dawgFile) for iter := tokenizer.Traverse(cwsharp.NewStringReader(text)); iter.Next(); { fmt.Print(iter.Cur()) fmt.Print(" / ") } fmt.Println() }
func main() { file := "data//cwsharp.dawg" tokenizer := cwsharp.NewStopwordFilter(mmseg.New(file)) w := map[string]bool{"the": true} tokenizer.CheckIgnore = func(t cwsharp.Token) bool { _, ok := w[t.Text()] if t.Kind() == cwsharp.PUNCT || ok { return true } return false } for _, text := range []string{"长春市长春药店", "The quick brown fox jumps over the lazy dog"} { for iter := tokenizer.Traverse(cwsharp.NewStringReader(text)); iter.Next(); { fmt.Print(iter.Cur()) fmt.Print(" / ") } fmt.Println() } }