// Function ProcessFile process the given file and incorporate the information // into the NGramGenerator g for future N-Gram model generation. func (g *NGramGenerator) ProcessFile(filename string) error { var decoder mahonia.Decoder if g.charset != "" { decoder = mahonia.NewDecoder(g.charset) } lineProcessor := func(line string) (bool, error) { line = strings.Trim(line, " \t\n\f\b\r") if decoder != nil { line = decoder.ConvertString(line) } tokens := strings.Split(line, " ") var prevToken string for i, t := range tokens { //Monogram frequency g.uniGram[t]++ g.uniGramCount++ //Bigram frequency var key BiGramKey if i == 0 { key = BiGramKey{SentenceStartTag, t} } else { key = BiGramKey{prevToken, t} } fmt.Printf("%v\n", key) g.biGram[key]++ g.biGramCount++ prevToken = t } return true, nil } return util.ForEachLineInFile(filename, lineProcessor) }
func convert(s string) string { var dec mahonia.Decoder dec = mahonia.NewDecoder("gbk") if ret, ok := dec.ConvertStringOK(s); ok { fmt.Println("GBK to UTF-8: ", ret) return ret } return s }
// open func openUrl(url string) (data string, err error) { var resp *http.Response var raw []byte var dec mahonia.Decoder = nil debug.Println("Get:", url) resp, err = http.Get(url) if err != nil { return } if resp.StatusCode != http.StatusOK { err = errors.New("Bad Status:" + resp.Status) return } // only handle html files ctype := resp.Header.Get("Content-Type") if -1 == strings.Index(ctype, "text/html") { err = errors.New("Not a html file") return } // try enconding: gbk\big5\utf8 charset := "" if seps := strings.Split(ctype, "="); len(seps) >= 2 { charset = seps[1] charset = strings.ToLower(charset) if strings.HasPrefix(charset, "gb") { charset = "gb18030" dec = gbk } else if strings.HasPrefix(charset, "big") { charset = "big5" dec = big5 } else if strings.HasPrefix(charset, "utf") || charset == "" { charset = "utf8" dec = nil } else { err = errors.New("Unsupported charset:" + charset) return } } else { dec = nil } debug.Println("Using charset:", charset) // TODO gzip handle contentEncoding := resp.Header.Get("Content-Encoding") if contentEncoding == "gzip" { err = errors.New("Content-Encoding:" + contentEncoding + "temporally not supported") return } // read the response if dec != nil { raw, err = ioutil.ReadAll(dec.NewReader(resp.Body)) } else { raw, err = ioutil.ReadAll(resp.Body) } if err != nil { return } defer resp.Body.Close() data = string(raw) debug.Println("Data:", data) return }