func Test_SequentialDataLoader(t *testing.T) { test_file := "test_instances.dat" num_feature_classes := 6 test_cases := []DataInstance{ { []int{0, 1, 2, 3, 4, 5}, 1, 2, }, { []int{1, 2, 3, 4, 5, 0}, 0, 1, }, } err := util.WithNewOpenFileAsBufioWriter(test_file, func(w *bufio.Writer) error { for _, t_case := range test_cases { fmt.Fprintf(w, "%d\t%d", t_case.pos_y, t_case.neg_y) for i, v := range t_case.x { fmt.Fprintf(w, "\t%d:%d", i, v) } fmt.Fprint(w, "\n") } return nil }) defer func() { os.Remove(test_file) }() if err != nil { t.Errorf("Failed to create test file: %s:%s.", test_file, err) } loader := NewInstanceLoader(test_file, num_feature_classes) defer func() { loader.Close() }() var i_accessor DataInstanceAccessor i_accessor = loader for i, t_case := range test_cases { instance, err := i_accessor.NextInstance() if err != nil { t.Errorf("TestCase: #%d: %s.", i, err) } if !(&instance).Equal(&t_case) { t.Errorf("TestCase: #%d: Expected %v but got %v.", i, t_case, instance) } } _, err = i_accessor.NextInstance() if err != io.EOF { t.Errorf("Expected EOF but got %s.", err) } i_accessor.Reset() _, err = i_accessor.NextInstance() if err != nil { t.Errorf("Expected instance but got error.") } }
// Method GenerateBigramModel generates a bigram model from the inforamtion // collected so far and save it to the given file. The file format of the model // consists of multiple lines, each line contains first term, second term, // and bigram frequency representing P(first term | second term), i.e. the // probability of second term immediately follows the first term as seen from // the processed documents. func (g *NGramGenerator) GenerateBigramModel(filename string) error { modelWriter := func(w *bufio.Writer) error { for k, c := range g.biGram { //TODO(weidoliang): add smoothing p_k := float64(c) / float64(g.biGramCount) w.WriteString(fmt.Sprintf("%s %s %f\n", k.First, k.Second, p_k)) } return nil } return util.WithNewOpenFileAsBufioWriter(filename, modelWriter) }
// Method GenerateUnigramModel generates a unigram from the information collected // so far and save it to the given file. The file format of the model consists // of multiple lines of unigram and unigram frequency seperated by space. func (g *NGramGenerator) GenerateUnigramModel(filename string) error { modelWriter := func(w *bufio.Writer) error { for k, c := range g.uniGram { //TODO(weidoliang): add smoothing to avoid zero probabilities p_k := float64(c) / float64(g.uniGramCount) w.WriteString(fmt.Sprintf("%s %f\n", k, p_k)) } return nil } return util.WithNewOpenFileAsBufioWriter(filename, modelWriter) }
func saveDataToFile(filename string, data []DataInstance) error { return util.WithNewOpenFileAsBufioWriter(filename, func(w *bufio.Writer) error { for _, d := range data { fmt.Fprintf(w, "%d\t%d", d.pos_y, d.neg_y) for i, v := range d.x { fmt.Fprintf(w, "\t%d:%d", i, v) } fmt.Fprint(w, "\n") } return nil }) }
func TestCEDict(t *testing.T) { lines := []string{ "AA制 AA制 [A A zhi4] /to split the bill/to go Dutch/", "A咖 A咖 [A ka1] /class \"A\"/top grade/", "A片 A片 [A pian4] /adult movie/pornography/", "B型超聲 B型超声 [B xing2 chao1 sheng1] /type-B ultrasound/", "B超 B超 [B chao1] /type-B ultrasound/abbr. for B型超聲|B型超声[B xing2 chao1 sheng1]/", "C盤 C盘 [C pan2] /C drive or default startup drive (computing)/", "DNA鑒定 DNA鉴定 [D N A jian4 ding4] /DNA test/DNA testing/", "E仔 E仔 [e zai3] /MDMA (C11H15NO2)/", "G點 G点 [G dian3] /Gräfenberg Spot/G-Spot/", "K仔 K仔 [K zai3] /ketamine (slang)/", } dictResult := []struct { traditional string simplified string pinyin []string english []string }{ {"AA制", "AA制", []string{"A", "A", "zhi4"}, []string{"to split the bill", "to go Dutch"}}, {"A咖", "A咖", []string{"A", "ka1"}, []string{"class \"A\"", "top grade"}}, {"E仔", "E仔", []string{"e", "zai3"}, []string{"MDMA (C11H15NO2)"}}, } dictPath := "cedict_test.txt" lineWriter := func(writer *bufio.Writer) error { for _, line := range lines { _, err := writer.WriteString(line + "\n") if err != nil { t.Errorf("Failed to write line[%s] to file [%s]", line, dictPath) return err } } return nil } err := util.WithNewOpenFileAsBufioWriter(dictPath, lineWriter) if err != nil { t.Errorf("Failed to create test file %s", dictPath) return } defer func() { os.Remove(dictPath) }() dict, err := LoadCEDict(dictPath, TRADITION_CHINESE) if err != nil { t.Errorf("Failed at LoadCEDict(%s)", dictPath) } for _, t_case := range dictResult { term := dict.Lookup(t_case.traditional) if term == nil { t.Errorf("Failed to Lookup key %s", t_case.traditional) } if term.Alternative != t_case.simplified { t.Errorf("Lookup returns incorrent alternative representation, expected [%v] but got [%v]", t_case.simplified, term.Alternative) } if !AreStringSlicesEqual(term.Pinyin, t_case.pinyin) { t.Errorf("Lookup returns incorrect pinyin representation. Expected [%v] but got [%v]", t_case.pinyin, term.Pinyin) } if !AreStringSlicesEqual(term.English, t_case.english) { t.Errorf("Lookup returns invalid english representation expected [%v] but got [%v]", t_case.english, term.English) } } }