// DetectCharset returns best guess for the reesponse body character set. func (res *Response) DetectCharset() { // Detect via BOM / HTML meta tag _, cs1, ok1 := charset.DetermineEncoding(res.Body, res.MediaType) // Detect via ICU cs2, ok2, html := "", false, false var det *chardet.Detector if strings.Contains(res.MediaType, "html") || true { det = chardet.NewHtmlDetector() html = true } else { det = chardet.NewTextDetector() } r, err := det.DetectAll(res.Body) if err == nil && len(r) > 0 { cs2 = strings.ToLower(r[0].Charset) ok2 = r[0].Confidence > 50 } // Prefer charset if HTML, otherwise ICU if !ok2 && (ok1 || html) { res.Charset = cs1 } else { res.Charset = cs2 } // fmt.Printf("Detected charset via go.net/html/charset: %s (%t)\n", cs1, ok1) // fmt.Printf("Detected charset via saintfish/chardet: %s (%d)\n", cs2, r[0].Confidence) }
func main() { app.Version("0.1.0") app.Parse(os.Args[1:]) inputFile := os.Stdin err := errors.New("") inputFileName := *fileName if inputFileName != "" { inputFile, err = os.Open(inputFileName) if err != nil { fmt.Println(err) os.Exit(-1) } } buffer := make([]byte, *bufferSize) size, _ := io.ReadFull(inputFile, buffer) input := buffer[:size] detector := chardet.NewHtmlDetector() if *forText == true { detector = chardet.NewTextDetector() } detectResult, err := detector.DetectBest(input) if err != nil { fmt.Println(err) os.Exit(-1) } fmt.Print(detectResult.Charset) }
func TestDetector(t *testing.T) { type file_charset_language struct { File string IsHtml bool Charset string Language string } var data = []file_charset_language{ {"utf8.html", true, "UTF-8", ""}, {"utf8_bom.html", true, "UTF-8", ""}, {"8859_1_en.html", true, "ISO-8859-1", "en"}, {"8859_1_da.html", true, "ISO-8859-1", "da"}, {"8859_1_de.html", true, "ISO-8859-1", "de"}, {"8859_1_es.html", true, "ISO-8859-1", "es"}, {"8859_1_fr.html", true, "ISO-8859-1", "fr"}, {"8859_1_pt.html", true, "ISO-8859-1", "pt"}, {"shift_jis.html", true, "Shift_JIS", "ja"}, {"gb18030.html", true, "GB-18030", "zh"}, {"euc_jp.html", true, "EUC-JP", "ja"}, {"euc_kr.html", true, "EUC-KR", "ko"}, {"big5.html", true, "Big5", "zh"}, } textDetector := chardet.NewTextDetector() htmlDetector := chardet.NewHtmlDetector() buffer := make([]byte, 32<<10) for _, d := range data { f, err := os.Open(filepath.Join("testdata", d.File)) if err != nil { t.Fatal(err) } defer f.Close() size, _ := io.ReadFull(f, buffer) input := buffer[:size] var detector = textDetector if d.IsHtml { detector = htmlDetector } result, err := detector.DetectBest(input) if err != nil { t.Fatal(err) } if result.Charset != d.Charset { t.Errorf("Expected charset %s, actual %s", d.Charset, result.Charset) } if result.Language != d.Language { t.Errorf("Expected language %s, actual %s", d.Language, result.Language) } } }
func GetContentType(data []byte, url string) string { ct := http.DetectContentType(data) if strings.Contains(ct, "text/plain") { if strings.Contains(url, "css") { ct = "text/css" } if strings.HasSuffix(url, ".js") { ct = "text/javascript" } } if strings.Contains(ct, "text/html") { dt := chardet.NewHtmlDetector() r, err := dt.DetectBest(data) if err == nil { ct = "text/html;charset=" + r.Charset } } return ct }
func NewHTMLCleaner() *HTMLCleaner { ret := HTMLCleaner{} ret.detector = chardet.NewHtmlDetector() ret.gbk = mahonia.NewDecoder("gb18030") return &ret }
func fetchPage(url string) string { req, err := http.NewRequest("GET", url, nil) req.Header.Set("User-Agent", scraperConfig.UserAgent) httpClient := http.Client{ Transport: &http.Transport{ Dial: timeoutDialler(time.Duration(10 * time.Second)), DisableKeepAlives: true, }, } resp, err := httpClient.Do(req) if err != nil { log.Println("HTTP_ERROR:", err) return "" } defer resp.Body.Close() if resp.StatusCode == 200 { var dataStream io.Reader switch charType := fetchCharset(resp.Header.Get("Content-Type")); { case charType == "utf-8": dataStream = resp.Body case chartypeSet[charType]: // charset in available list for conversion charsetStream, err := charset.NewReader(charType, resp.Body) if err != nil { log.Println("ENCODING_ERROR:", err) } else { dataStream = charsetStream } default: //need to guess chartype bodyBytes, err := ioutil.ReadAll(resp.Body) if err != nil { log.Println("IO_ERROR:", err) } detector := chardet.NewHtmlDetector() result, err := detector.DetectBest(bodyBytes) if err != nil { log.Println("ENCODING_ERROR no_known_encoding", url) return "" } charType = strings.ToLower(result.Charset) if chartypeSet[charType] { dataStream = bytes.NewReader(bodyBytes) charsetStream, err := charset.NewReader(charType, dataStream) if err != nil { log.Println("ENCODING_ERROR:", err) } else { dataStream = charsetStream } } } if dataStream != nil { var bodyBytes []byte bodyBytes, err := ioutil.ReadAll(dataStream) if err != nil { log.Println("ERROR:", err) } return string(bodyBytes) } else { log.Println("ENCODING_ERROR: no suitable encoding found for", url) } } return "" }