Пример #1
0
// Parse reads a vocabulary file vfile and returns an in-memory representation of it (Vocabulary).
func Parse(vfile string) *Vocabulary {
	voc, err := os.Open(io.GetPath(vfile))

	if err != nil {
		fmt.Printf("Error. Could not open file [%s].\n", vfile)
		panic(err)
	}
	defer voc.Close()

	var n int
	fmt.Fscanf(voc, "%d", &n)

	entries := make([]string, n)
	for i := 0; i < n; i++ {
		var j int
		var str string
		fmt.Fscanf(voc, "%d %s ", &j, &str)
		entries[j] = str
	}

	var m int
	fmt.Fscanf(voc, "%d", &m)

	l, block := 0, make([]int, m)
	for i := 0; i < m; i++ {
		var k int
		fmt.Fscanf(voc, "%d", &k)
		block[l] = k
		l++
	}

	return NewVocabulary(entries, block)
}
Пример #2
0
// Compile takes a plain text filename tfile and compiles it into a vocabulary file vfile. We
// treat punctuation as words and letters with accent marks as different characters (é != e). A
// vocabulary file contains K lines of word mapping where, for each line a number (which signals
// the id of a word) is followed by the word in question. Next we have a series of numbers that
// represent the id of each word in the order they appear in tfile.
func Compile(tfile, vfile string) {
	text, err := os.Open(io.GetPath(tfile))
	if err != nil {
		fmt.Printf("Error. Could not open file [%s].\n", tfile)
		panic(err)
	}
	defer text.Close()

	vocab := make(map[string]int)
	vc := 0
	match := regexp.MustCompile(rex)

	var block []string
	cblock := 0
	nwords := 0

	// Read contents and store them into vocab and block.
	in := bufio.NewScanner(text)
	for in.Scan() {
		//fmt.Printf("Text: \"%s\"\nMatches:\n", in.Text())
		v := match.FindAllString(in.Text(), -1)
		nv := len(v)

		//for i := 0; i < nv; i++ {
		//fmt.Printf(" <%s>", v[i])
		//}
		//fmt.Printf("\n")

		if nv == 0 {
			continue
		}
		block = append(block, "")
		for i := 0; i < nv; i++ {
			str := strings.ToLower(v[i])
			_, ok := vocab[str]
			if !ok {
				vocab[str] = vc
				vc++
			}
			//fmt.Printf("%s -> %d\n", str, vocab[str])
			block[cblock] = utils.StringConcat(block[cblock], strconv.Itoa(vocab[str]))
			if i < nv-1 {
				block[cblock] = utils.StringConcat(block[cblock], " ")
			}
			nwords++
		}
		cblock++
	}

	if err := in.Err(); err != nil {
		fmt.Printf("Error parsing file [%s].\n", tfile)
		panic(err)
	}

	// Write contents into vfile.
	vocf, err := os.Create(io.GetPath(vfile))

	if err != nil {
		fmt.Printf("Error. Could not open file [%s].\n", vfile)
		panic(err)
	}
	defer vocf.Close()

	// Number of vocabulary entries.
	fmt.Fprintf(vocf, "%d\n", len(vocab))
	for k, v := range vocab {
		// Write each entry as a pair (id, word).
		fmt.Fprintf(vocf, "%d %s\n", v, k)
	}
	// Number of words in block.
	fmt.Fprintf(vocf, "%d\n", nwords)
	for i := 0; i < cblock; i++ {
		// Write all lines as a list of ids.
		fmt.Fprintln(vocf, block[i])
	}
}