Beispiel #1
0
// loadBigram populates m.Bigram from the bigram model file at the given
// path. Each non-comment line (lines starting with "#" are skipped) must
// hold three space-separated fields: first token, second token, and a
// float64 probability. Malformed lines are logged with a reason and
// skipped; the load continues. Duplicate keys are logged and the last
// value wins.
func (m *NGramModel) loadBigram(bigram string) error {
	m.Bigram = make(map[BiGramKey]float64)
	return util.ForEachLineInFile(bigram, func(line string) (bool, error) {
		isValidLine := true
		errMsg := ""
		// Deferred so every early return for a bad line produces one
		// diagnostic, mirroring loadUnigram's reporting.
		defer func() {
			if !isValidLine {
				log.Printf("Invalid line in bigram model[%s]: %s, %s",
					bigram, line, errMsg)
			}
		}()
		if !strings.HasPrefix(line, "#") {
			// Include '\n' in the cutset for consistency with loadUnigram;
			// a trailing newline would otherwise corrupt the last field.
			line = strings.Trim(line, " \t\n\f\r")
			fields := strings.Split(line, " ")
			if len(fields) != 3 {
				errMsg = "Expected number of fields to be 3 but got " + strconv.Itoa(len(fields)) + "."
				isValidLine = false
				return true, nil
			}
			p, err := strconv.ParseFloat(fields[2], 64)
			if err != nil {
				isValidLine = false
				errMsg = "attempt to obtain probability from " + fields[2] + " failed: " + err.Error()
				return true, nil
			}
			key := BiGramKey{fields[0], fields[1]}
			old, found := m.Bigram[key]
			if found {
				log.Printf("Warning, duplicated p for key %v, old value is %v, new value is %v",
					key, old, p)
			}
			m.Bigram[key] = p
		}
		return true, nil
	})
}
Beispiel #2
0
// loadUnigram populates m.Unigram from the unigram model file at the
// given path. Each non-comment line (lines starting with "#" are
// skipped) must hold two space-separated fields: a token and a float64
// probability. Malformed lines are logged with a reason and skipped;
// the load continues. Duplicate keys are logged and the last value wins.
func (m *NGramModel) loadUnigram(unigram string) error {
	m.Unigram = make(map[string]float64)
	return util.ForEachLineInFile(unigram, func(line string) (bool, error) {
		isValidLine := true
		// errMsg (MixedCaps per Go naming convention) carries the reason a
		// line was rejected into the deferred diagnostic below.
		errMsg := ""
		defer func() {
			if !isValidLine {
				log.Printf("Invalid line in unigram model[%s]: %s, %s",
					unigram, line, errMsg)
			}
		}()
		if !strings.HasPrefix(line, "#") {
			line = strings.Trim(line, " \t\n\r\f")
			fields := strings.Split(line, " ")
			if len(fields) != 2 {
				errMsg = fmt.Sprintf("Expected number of fields to be 2 but got %d.", len(fields))
				isValidLine = false
				return true, nil
			}
			p, err := strconv.ParseFloat(fields[1], 64)
			if err != nil {
				isValidLine = false
				errMsg = fmt.Sprintf("attempt to obtain probability from %s failed: %s", fields[1], err)
				return true, nil
			}
			old, found := m.Unigram[fields[0]]
			if found {
				log.Printf("Warning, duplicated p for key %s, old value is %v, new value is %v",
					fields[0], old, p)
			}
			m.Unigram[fields[0]] = p
		}
		return true, nil
	})
}
Beispiel #3
0
// evaluateSegmenter loads an n-gram model from the *unigramModel and
// *bigramModel flag paths, builds a segmenter from it, and checks it
// against the corpus at *corpus: each corpus line is a space-segmented
// reference; the spaces are stripped to form the raw input and the
// segmenter's output is compared token-by-token against the reference.
// Mismatches are printed to stdout.
func evaluateSegmenter() {
	//cedict, err := LoadCEDict(cedict_path, cedict_key_type)
	//if err != nil {
	//	t.Fatalf("Failed to load CEDict[%s]: %s", cedict_path, err)
	//}
	model, err := ngram_model.LoadNGramModel(*unigramModel, *bigramModel)
	if err != nil {
		fmt.Printf("Failed to load model[%s,%s]: %s", *unigramModel, *bigramModel, err)
		return
	}
	segmenter := seg.NewSegmenter(nil, model)
	converter := util.NewUtf8Converter(*corpusCharSet)
	err = util.ForEachLineInFile(*corpus, func(line string) (bool, error) {
		line = converter.ConvertString(strings.Trim(line, " \t\n\r\f"))
		sample := strings.Replace(line, " ", "", -1)
		exp_result := strings.Split(line, " ")
		result, _ := segmenter.Segment(sample)

		is_eqv := len(result) == len(exp_result)
		// Only compare element-wise when the lengths match; indexing
		// exp_result[i] with a longer result would panic out of range.
		if is_eqv {
			for i, r := range result {
				if r != exp_result[i] {
					is_eqv = false
					break
				}
			}
		}
		if !is_eqv {
			fmt.Printf("Segment(%s) expect result to be:\n%v\nbut got:\n%v\n\n",
				sample, exp_result, result)
		}
		return true, nil
	})
	if err != nil {
		fmt.Printf("Error encountered when attempting to evaluate segmenter: %s", err)
	}
}
Beispiel #4
0
// Function ProcessFile process the given file and incorporate the information
// into the NGramGenerator g for future N-Gram model generation.
//
// Each line is trimmed, optionally decoded from g.charset, and split on
// single spaces into tokens. Every token bumps the unigram counts; every
// adjacent token pair bumps the bigram counts, with the first token of a
// line paired against SentenceStartTag.
func (g *NGramGenerator) ProcessFile(filename string) error {
	var decoder mahonia.Decoder
	if g.charset != "" {
		decoder = mahonia.NewDecoder(g.charset)
	}
	lineProcessor := func(line string) (bool, error) {
		line = strings.Trim(line, " \t\n\f\b\r")
		if decoder != nil {
			line = decoder.ConvertString(line)
		}
		tokens := strings.Split(line, " ")
		var prevToken string
		for i, t := range tokens {
			//Monogram frequency
			g.uniGram[t]++
			g.uniGramCount++
			//Bigram frequency
			var key BiGramKey
			if i == 0 {
				key = BiGramKey{SentenceStartTag, t}
			} else {
				key = BiGramKey{prevToken, t}
			}
			// NOTE: a leftover debug Printf of every key was removed here;
			// it flooded stdout and dominated the per-token cost.
			g.biGram[key]++
			g.biGramCount++
			prevToken = t
		}
		return true, nil
	}
	return util.ForEachLineInFile(filename, lineProcessor)
}
Beispiel #5
0
// Load reads the segmented corpus at path, one sentence per line. Each
// line is trimmed, optionally converted via the corpus decoder, split on
// single spaces into tokens, and appended to s.sentences. Returns any
// error from the underlying file iteration.
func (s *SegCNCorpus) Load(path string) error {
	return util.ForEachLineInFile(path, func(line string) (bool, error) {
		line = strings.Trim(line, " \t\r\b\f")
		// Go auto-dereferences pointer receivers; the explicit (*s).field
		// form was redundant.
		if s.decoder != nil {
			//TODO(weidoliang): Add conversion error check
			line = s.decoder.ConvertString(line)
		}
		s.sentences = append(s.sentences, strings.Split(line, " "))
		return true, nil
	})
}
Beispiel #6
0
// Method LoadCEDict loads the CC-CEDict from the given path.
//
// keyType selects which script (SIMPLE_CHINESE or TRADITION_CHINESE)
// keys the returned dictionary; the other script is stored as the
// term's alternate form. Comment ("# ") and meta ("#! ") lines are
// skipped. Duplicate keys are logged and the last definition wins.
// Panics on an invalid keyType or on a handler result of the wrong
// type — both indicate programmer error, not bad input.
func LoadCEDict(path string, keyType int) (*CEDict, error) {
	lineHandler := util.NewPrefixDispatcher([]util.PrefixHandler{
		{"# ", nil},          //skip comments
		{"#! ", nil},         //skip meta information
		{"", fieldExtractor}, //default: extract fields
	})
	dict := CEDict{
		keyType,
		make(map[string]Term),
	}
	lineProcessor := func(line string) (bool, error) {
		fieldsI := lineHandler.Process(line)
		if fieldsI != nil {
			fields, fieldsOk := fieldsI.(cedictFields)
			if !fieldsOk {
				panic("Logic Error in the program, expected line handler to return key & term.")
			}
			var key, alter string
			// Go switch cases do not fall through, so no break statements
			// are needed (the one after panic was unreachable code).
			switch dict.keyType {
			case SIMPLE_CHINESE:
				key = fields.simplified
				alter = fields.traditional
			case TRADITION_CHINESE:
				key = fields.traditional
				alter = fields.simplified
			default:
				panic(fmt.Sprintf("Invalid key type value: %d", dict.keyType))
			}
			term := Term{
				alter,
				fields.pinyin,
				fields.english,
			}
			if oldTerm, found := dict.keyTerms[key]; found {
				log.Printf("Found duplicate definition for key %s, old value is %v, new value is %v",
					key, oldTerm, term)
			}
			dict.keyTerms[key] = term
		}
		return true, nil
	}
	if err := util.ForEachLineInFile(path, lineProcessor); err != nil {
		return nil, err
	}
	return &dict, nil
}