func main() { app.Version("0.1.0") app.Parse(os.Args[1:]) inputFile := os.Stdin err := errors.New("") inputFileName := *fileName if inputFileName != "" { inputFile, err = os.Open(inputFileName) if err != nil { fmt.Println(err) os.Exit(-1) } } buffer := make([]byte, *bufferSize) size, _ := io.ReadFull(inputFile, buffer) input := buffer[:size] detector := chardet.NewHtmlDetector() if *forText == true { detector = chardet.NewTextDetector() } detectResult, err := detector.DetectBest(input) if err != nil { fmt.Println(err) os.Exit(-1) } fmt.Print(detectResult.Charset) }
// DetectCharset returns best guess for the reesponse body character set. func (res *Response) DetectCharset() { // Detect via BOM / HTML meta tag _, cs1, ok1 := charset.DetermineEncoding(res.Body, res.MediaType) // Detect via ICU cs2, ok2, html := "", false, false var det *chardet.Detector if strings.Contains(res.MediaType, "html") || true { det = chardet.NewHtmlDetector() html = true } else { det = chardet.NewTextDetector() } r, err := det.DetectAll(res.Body) if err == nil && len(r) > 0 { cs2 = strings.ToLower(r[0].Charset) ok2 = r[0].Confidence > 50 } // Prefer charset if HTML, otherwise ICU if !ok2 && (ok1 || html) { res.Charset = cs1 } else { res.Charset = cs2 } // fmt.Printf("Detected charset via go.net/html/charset: %s (%t)\n", cs1, ok1) // fmt.Printf("Detected charset via saintfish/chardet: %s (%d)\n", cs2, r[0].Confidence) }
func getFileEncoding(data []byte) (string, error) { detector := chardet.NewTextDetector() r, err := detector.DetectBest(data) if err != nil { return "", err } return r.Charset, nil }
func detectEncoding(data []byte) (string, error) { detector := chardet.NewTextDetector() detected, err := detector.DetectBest(data) if err != nil { return "", err } return detected.Charset, nil }
func ExampleTextDetector() { detector := chardet.NewTextDetector() result, err := detector.DetectBest(zh_gb18030_text) if err == nil { fmt.Printf( "Detected charset is %s, language is %s", result.Charset, result.Language) } // Output: // Detected charset is GB-18030, language is zh }
func toUtf8(content []byte) (error, string) { detector := chardet.NewTextDetector() result, err := detector.DetectBest(content) if err != nil { return err, "" } if result.Charset == "utf8" { return nil, string(content) } decoder := mahonia.NewDecoder(result.Charset) return nil, decoder.ConvertString(string(content)) }
func TestDetector(t *testing.T) { type file_charset_language struct { File string IsHtml bool Charset string Language string } var data = []file_charset_language{ {"utf8.html", true, "UTF-8", ""}, {"utf8_bom.html", true, "UTF-8", ""}, {"8859_1_en.html", true, "ISO-8859-1", "en"}, {"8859_1_da.html", true, "ISO-8859-1", "da"}, {"8859_1_de.html", true, "ISO-8859-1", "de"}, {"8859_1_es.html", true, "ISO-8859-1", "es"}, {"8859_1_fr.html", true, "ISO-8859-1", "fr"}, {"8859_1_pt.html", true, "ISO-8859-1", "pt"}, {"shift_jis.html", true, "Shift_JIS", "ja"}, {"gb18030.html", true, "GB-18030", "zh"}, {"euc_jp.html", true, "EUC-JP", "ja"}, {"euc_kr.html", true, "EUC-KR", "ko"}, {"big5.html", true, "Big5", "zh"}, } textDetector := chardet.NewTextDetector() htmlDetector := chardet.NewHtmlDetector() buffer := make([]byte, 32<<10) for _, d := range data { f, err := os.Open(filepath.Join("testdata", d.File)) if err != nil { t.Fatal(err) } defer f.Close() size, _ := io.ReadFull(f, buffer) input := buffer[:size] var detector = textDetector if d.IsHtml { detector = htmlDetector } result, err := detector.DetectBest(input) if err != nil { t.Fatal(err) } if result.Charset != d.Charset { t.Errorf("Expected charset %s, actual %s", d.Charset, result.Charset) } if result.Language != d.Language { t.Errorf("Expected language %s, actual %s", d.Language, result.Language) } } }
func DetectEncoding(b []byte) (Detected, error) { d := chardet.NewTextDetector() res, err := d.DetectBest(b) if err != nil { return Detected{}, err } if res.Charset == "GB-18030" { // set canonical name for this encoding type (this is a chardet bug) res.Charset = "GB18030" } return Detected{Charset: res.Charset, Confidence: res.Confidence}, nil }
func convertToUtf8(s string) string { b := []byte(s) d := chardet.NewTextDetector() r, err := d.DetectBest(b) if err != nil { return fmt.Sprintf("<Can't detect string charset: %s>", err.Error()) } encoding, _ := charset.Lookup(r.Charset) if encoding == nil { return fmt.Sprintf("<Can't find encoding: %s>", r.Charset) } str, _, err := transform.String(encoding.NewDecoder(), s) if err != nil { return fmt.Sprintf("<Can't convert string from encoding %s to UTF8: %s>", r.Charset, err.Error()) } return str }
import ( "fmt" "github.com/saintfish/chardet" "golang.org/x/text/encoding" "golang.org/x/text/encoding/charmap" "golang.org/x/text/encoding/japanese" "golang.org/x/text/encoding/korean" "golang.org/x/text/encoding/simplifiedchinese" "golang.org/x/text/encoding/traditionalchinese" "golang.org/x/text/encoding/unicode" "golang.org/x/text/transform" ) var ( charsetDetector = chardet.NewTextDetector() charsetDetectors = map[string]encoding.Encoding{ "Big5": traditionalchinese.Big5, "EUC-JP": japanese.EUCJP, "EUC-KR": korean.EUCKR, "GB-18030": simplifiedchinese.GB18030, "ISO-2022-JP": japanese.ISO2022JP, "ISO-8859-5": charmap.ISO8859_5, "ISO-8859-6": charmap.ISO8859_6, "ISO-8859-7": charmap.ISO8859_7, "ISO-8859-8": charmap.ISO8859_8, "ISO-8859-8-I": charmap.ISO8859_8I, "KOI8-R": charmap.KOI8R, "Shift_JIS": japanese.ShiftJIS, "UTF-16BE": unicode.UTF16(unicode.BigEndian, unicode.UseBOM), "UTF-16LE": unicode.UTF16(unicode.LittleEndian, unicode.UseBOM),