Ejemplo n.º 1
0
// ProcessFile reads the named file line by line and folds the tokens of every
// line into the unigram and bigram frequency tables of g, for later N-Gram
// model generation.
//
// Each line is trimmed, converted through a mahonia decoder when one could be
// created for g.charset, and then split on single spaces. The first token of
// a line is paired with SentenceStartTag to form its bigram; every later token
// is paired with the token before it. Each bigram key is also printed to
// standard output. The result of util.ForEachLineInFile is returned as-is.
func (g *NGramGenerator) ProcessFile(filename string) error {
	var decoder mahonia.Decoder
	if g.charset != "" {
		decoder = mahonia.NewDecoder(g.charset)
	}

	return util.ForEachLineInFile(filename, func(line string) (bool, error) {
		text := strings.Trim(line, " \t\n\f\b\r")
		if decoder != nil {
			text = decoder.ConvertString(text)
		}

		var previous string
		for idx, tok := range strings.Split(text, " ") {
			// Unigram frequency.
			g.uniGram[tok]++
			g.uniGramCount++

			// Bigram frequency: the first token of the line has no
			// predecessor, so it is keyed against the sentence-start tag.
			pair := BiGramKey{previous, tok}
			if idx == 0 {
				pair = BiGramKey{SentenceStartTag, tok}
			}
			fmt.Printf("%v\n", pair)
			g.biGram[pair]++
			g.biGramCount++

			previous = tok
		}
		// The callback never reports an error; presumably the true result
		// asks the iterator to continue with the next line (confirm against
		// util.ForEachLineInFile).
		return true, nil
	})
}
Ejemplo n.º 2
0
// convert runs s through mahonia's "gbk" decoder and returns the converted
// string. On success the result is also printed to standard output; when the
// decoder reports failure, s is returned unchanged.
func convert(s string) string {
	decoded, ok := mahonia.NewDecoder("gbk").ConvertStringOK(s)
	if !ok {
		return s
	}

	fmt.Println("GBK to UTF-8: ", decoded)
	return decoded
}
Ejemplo n.º 3
0
// open
func openUrl(url string) (data string, err error) {
	var resp *http.Response
	var raw []byte
	var dec mahonia.Decoder = nil
	debug.Println("Get:", url)
	resp, err = http.Get(url)
	if err != nil {
		return
	}
	if resp.StatusCode != http.StatusOK {
		err = errors.New("Bad Status:" + resp.Status)
		return
	}
	// only handle html files
	ctype := resp.Header.Get("Content-Type")
	if -1 == strings.Index(ctype, "text/html") {
		err = errors.New("Not a html file")
		return
	}
	// try enconding: gbk\big5\utf8
	charset := ""
	if seps := strings.Split(ctype, "="); len(seps) >= 2 {
		charset = seps[1]
		charset = strings.ToLower(charset)
		if strings.HasPrefix(charset, "gb") {
			charset = "gb18030"
			dec = gbk
		} else if strings.HasPrefix(charset, "big") {
			charset = "big5"
			dec = big5
		} else if strings.HasPrefix(charset, "utf") || charset == "" {
			charset = "utf8"
			dec = nil
		} else {
			err = errors.New("Unsupported charset:" + charset)
			return
		}
	} else {
		dec = nil
	}
	debug.Println("Using charset:", charset)
	// TODO gzip handle
	contentEncoding := resp.Header.Get("Content-Encoding")
	if contentEncoding == "gzip" {
		err = errors.New("Content-Encoding:" + contentEncoding + "temporally not supported")
		return
	}
	// read the response
	if dec != nil {
		raw, err = ioutil.ReadAll(dec.NewReader(resp.Body))
	} else {
		raw, err = ioutil.ReadAll(resp.Body)
	}
	if err != nil {
		return
	}
	defer resp.Body.Close()
	data = string(raw)
	debug.Println("Data:", data)
	return
}