Exemplo n.º 1
0
// ProcessFile reads the named file line by line and adds the unigram and
// bigram counts of every line to g for later N-gram model generation.
//
// When g.charset is non-empty, each line is converted with the mahonia decoder
// for that charset before it is tokenized; an error is returned if mahonia has
// no decoder for the charset. Lines are trimmed and split on single spaces.
// Empty tokens (produced by blank lines or runs of spaces) are skipped so they
// do not inflate the counts. The first counted token of a line forms a bigram
// with SentenceStartTag; every later token forms one with its predecessor.
//
// The returned error is either the unsupported-charset error or whatever
// util.ForEachLineInFile reports.
func (g *NGramGenerator) ProcessFile(filename string) error {
	var decoder mahonia.Decoder
	if g.charset != "" {
		decoder = mahonia.NewDecoder(g.charset)
		if decoder == nil {
			// NewDecoder yields nil for a charset it does not know. Fail
			// explicitly instead of silently counting undecoded bytes.
			return fmt.Errorf("unsupported charset %q", g.charset)
		}
	}

	lineProcessor := func(line string) (bool, error) {
		// Decode first so that trimming and splitting operate on the
		// converted text rather than on raw bytes of the source encoding.
		if decoder != nil {
			line = decoder.ConvertString(line)
		}
		line = strings.Trim(line, " \t\n\f\b\r")

		// Seeding prevToken with the start tag makes the first real token
		// of the line pair with SentenceStartTag without an index check.
		prevToken := SentenceStartTag
		for _, t := range strings.Split(line, " ") {
			if t == "" {
				// Split returns "" for blank lines and between consecutive
				// spaces; those are not tokens.
				continue
			}

			// Unigram frequency.
			g.uniGram[t]++
			g.uniGramCount++

			// Bigram frequency.
			g.biGram[BiGramKey{prevToken, t}]++
			g.biGramCount++

			prevToken = t
		}
		// NOTE(review): true presumably tells ForEachLineInFile to keep
		// reading; confirm against util.
		return true, nil
	}
	return util.ForEachLineInFile(filename, lineProcessor)
}