func (m *NGramModel) loadBigram(bigram string) error { m.Bigram = make(map[BiGramKey]float64) return util.ForEachLineInFile(bigram, func(line string) (bool, error) { isValidLine := true defer func() { if !isValidLine { log.Printf("Invalid line in bigram model[%s]: %s", bigram, line) } }() if !strings.HasPrefix(line, "#") { line = strings.Trim(line, " \t\f\r") fields := strings.Split(line, " ") if len(fields) != 3 { isValidLine = false return true, nil } p, err := strconv.ParseFloat(fields[2], 64) if err != nil { isValidLine = false return true, nil } key := BiGramKey{fields[0], fields[1]} old, found := m.Bigram[key] if found { log.Printf("Warning, duplicated p for key %v, old value is %v, new value is %v", key, old, p) } m.Bigram[key] = p } return true, nil }) }
func (m *NGramModel) loadUnigram(unigram string) error { m.Unigram = make(map[string]float64) return util.ForEachLineInFile(unigram, func(line string) (bool, error) { isValidLine := true err_msg := "" defer func() { if !isValidLine { log.Printf("Invalid line in unigram model[%s]: %s, %s", unigram, line, err_msg) } }() if !strings.HasPrefix(line, "#") { line = strings.Trim(line, " \t\n\r\f") fields := strings.Split(line, " ") if len(fields) != 2 { err_msg = fmt.Sprintf("Expected number of fields to be 2 but got %d.", len(fields)) isValidLine = false return true, nil } p, err := strconv.ParseFloat(fields[1], 64) if err != nil { isValidLine = false err_msg = fmt.Sprintf("attempt to obtain probability from %s failed: %s", fields[1], err) return true, nil } old, found := m.Unigram[fields[0]] if found { log.Printf("Warning, duplicated p for key %s, old value is %v, new value is %v", fields[0], old, p) } m.Unigram[fields[0]] = p } return true, nil }) }
func evaluateSegmenter() { //cedict, err := LoadCEDict(cedict_path, cedict_key_type) //if err != nil { // t.Fatalf("Failed to load CEDict[%s]: %s", cedict_path, err) //} model, err := ngram_model.LoadNGramModel(*unigramModel, *bigramModel) if err != nil { fmt.Printf("Failed to load model[%s,%s]: %s", *unigramModel, *bigramModel, err) return } segmenter := seg.NewSegmenter(nil, model) converter := util.NewUtf8Converter(*corpusCharSet) err = util.ForEachLineInFile(*corpus, func(line string) (bool, error) { line = converter.ConvertString(strings.Trim(line, " \t\n\r\f")) sample := strings.Replace(line, " ", "", -1) exp_result := strings.Split(line, " ") result, _ := segmenter.Segment(sample) is_eqv := len(result) == len(exp_result) for i, r := range result { if r != exp_result[i] { is_eqv = false break } } if !is_eqv { fmt.Printf("Segment(%s) expect result to be:\n%v\nbut got:\n%v\n\n", sample, exp_result, result) } return true, nil }) if err != nil { fmt.Printf("Error encountered when attempting to evaluate segmenter: %s", err) } }
// Function ProcessFile process the given file and incorporate the information // into the NGramGenerator g for future N-Gram model generation. func (g *NGramGenerator) ProcessFile(filename string) error { var decoder mahonia.Decoder if g.charset != "" { decoder = mahonia.NewDecoder(g.charset) } lineProcessor := func(line string) (bool, error) { line = strings.Trim(line, " \t\n\f\b\r") if decoder != nil { line = decoder.ConvertString(line) } tokens := strings.Split(line, " ") var prevToken string for i, t := range tokens { //Monogram frequency g.uniGram[t]++ g.uniGramCount++ //Bigram frequency var key BiGramKey if i == 0 { key = BiGramKey{SentenceStartTag, t} } else { key = BiGramKey{prevToken, t} } fmt.Printf("%v\n", key) g.biGram[key]++ g.biGramCount++ prevToken = t } return true, nil } return util.ForEachLineInFile(filename, lineProcessor) }
func (s *SegCNCorpus) Load(path string) error { return util.ForEachLineInFile(path, func(line string) (bool, error) { line = strings.Trim(line, " \t\r\b\f") if (*s).decoder != nil { //TODO(weidoliang): Add conversion error check line = (*s).decoder.ConvertString(line) } (*s).sentences = append((*s).sentences, strings.Split(line, " ")) return true, nil }) }
// Method LoadCEDict loads the CC-CEDict from the given path. func LoadCEDict(path string, keyType int) (*CEDict, error) { lineHandler := util.NewPrefixDispatcher([]util.PrefixHandler{ {"# ", nil}, //skip comments {"#! ", nil}, //skip meta information {"", fieldExtractor}, //default: extract fields }) dict := CEDict{ keyType, make(map[string]Term), } lineProcessor := func(line string) (bool, error) { fieldsI := lineHandler.Process(line) if fieldsI != nil { fields, fieldsOk := fieldsI.(cedictFields) if !fieldsOk { panic("Logic Error in the program, expected line handler to return key & term.") } var key, alter string switch dict.keyType { case SIMPLE_CHINESE: key = fields.simplified alter = fields.traditional break case TRADITION_CHINESE: key = fields.traditional alter = fields.simplified break default: panic(fmt.Sprintf("Invalid key type value: %d", dict.keyType)) break } term := Term{ alter, fields.pinyin, fields.english, } if oldTerm, found := dict.keyTerms[key]; found { log.Printf("Found duplicate definition for key %s, old value is %v, new value is %v", key, oldTerm, term) } dict.keyTerms[key] = term } return true, nil } if err := util.ForEachLineInFile(path, lineProcessor); err != nil { return nil, err } return &dict, nil }