// Parse reads a vocabulary file vfile and returns an in-memory representation of it (Vocabulary). func Parse(vfile string) *Vocabulary { voc, err := os.Open(io.GetPath(vfile)) if err != nil { fmt.Printf("Error. Could not open file [%s].\n", vfile) panic(err) } defer voc.Close() var n int fmt.Fscanf(voc, "%d", &n) entries := make([]string, n) for i := 0; i < n; i++ { var j int var str string fmt.Fscanf(voc, "%d %s ", &j, &str) entries[j] = str } var m int fmt.Fscanf(voc, "%d", &m) l, block := 0, make([]int, m) for i := 0; i < m; i++ { var k int fmt.Fscanf(voc, "%d", &k) block[l] = k l++ } return NewVocabulary(entries, block) }
// Compile takes a plain text filename tfile and compiles it into a vocabulary file vfile. We // treat punctuation as words and letters with accent marks as different characters (é != e). A // vocabulary file contains K lines of word mapping where, for each line a number (which signals // the id of a word) is followed by the word in question. Next we have a series of numbers that // represent the id of each word in the order they appear in tfile. func Compile(tfile, vfile string) { text, err := os.Open(io.GetPath(tfile)) if err != nil { fmt.Printf("Error. Could not open file [%s].\n", tfile) panic(err) } defer text.Close() vocab := make(map[string]int) vc := 0 match := regexp.MustCompile(rex) var block []string cblock := 0 nwords := 0 // Read contents and store them into vocab and block. in := bufio.NewScanner(text) for in.Scan() { //fmt.Printf("Text: \"%s\"\nMatches:\n", in.Text()) v := match.FindAllString(in.Text(), -1) nv := len(v) //for i := 0; i < nv; i++ { //fmt.Printf(" <%s>", v[i]) //} //fmt.Printf("\n") if nv == 0 { continue } block = append(block, "") for i := 0; i < nv; i++ { str := strings.ToLower(v[i]) _, ok := vocab[str] if !ok { vocab[str] = vc vc++ } //fmt.Printf("%s -> %d\n", str, vocab[str]) block[cblock] = utils.StringConcat(block[cblock], strconv.Itoa(vocab[str])) if i < nv-1 { block[cblock] = utils.StringConcat(block[cblock], " ") } nwords++ } cblock++ } if err := in.Err(); err != nil { fmt.Printf("Error parsing file [%s].\n", tfile) panic(err) } // Write contents into vfile. vocf, err := os.Create(io.GetPath(vfile)) if err != nil { fmt.Printf("Error. Could not open file [%s].\n", vfile) panic(err) } defer vocf.Close() // Number of vocabulary entries. fmt.Fprintf(vocf, "%d\n", len(vocab)) for k, v := range vocab { // Write each entry as a pair (id, word). fmt.Fprintf(vocf, "%d %s\n", v, k) } // Number of words in block. fmt.Fprintf(vocf, "%d\n", nwords) for i := 0; i < cblock; i++ { // Write all lines as a list of ids. fmt.Fprintln(vocf, block[i]) } }