Example #1
0
// DetectCharset stores the best guess for the response body's character set
// in res.Charset.
//
// Two detectors are consulted: charset.DetermineEncoding (BOM / HTML meta tag)
// and chardet's HTML detector. The chardet guess, lowercased, is used when its
// confidence is above 50; otherwise the name reported by DetermineEncoding is
// used.
func (res *Response) DetectCharset() {
	// Detect via BOM / HTML meta tag. Only the charset name is needed: the
	// "certain" flag could never influence the outcome of the selection below.
	_, metaName, _ := charset.DetermineEncoding(res.Body, res.MediaType)

	// Detect via chardet. The HTML detector is applied to every media type;
	// the previous media-type check was short-circuited with "|| true", so the
	// plain-text detector was unreachable.
	guesses, err := chardet.NewHtmlDetector().DetectAll(res.Body)
	if err == nil && len(guesses) > 0 && guesses[0].Confidence > 50 {
		res.Charset = strings.ToLower(guesses[0].Charset)
		return
	}

	// chardet failed or was not confident enough: fall back to the
	// BOM / meta tag result.
	res.Charset = metaName
}
Example #2
0
File: main.go Project: xissy/icu
// main reads up to *bufferSize bytes from the file named by *fileName, or from
// standard input when no file name is given, runs charset detection on them
// and prints the best-matching charset name to standard output.
//
// The HTML detector is used unless *forText is set, in which case the text
// detector is used. On failure the error is written to standard error and the
// process exits with status -1.
func main() {
	app.Version("0.1.0")
	app.Parse(os.Args[1:])

	inputFile := os.Stdin
	if inputFileName := *fileName; inputFileName != "" {
		f, err := os.Open(inputFileName)
		if err != nil {
			fmt.Fprintln(os.Stderr, err)
			os.Exit(-1)
		}
		inputFile = f
	}

	buffer := make([]byte, *bufferSize)
	size, err := io.ReadFull(inputFile, buffer)
	if inputFile != os.Stdin {
		// Everything needed has been read; release the descriptor now because
		// os.Exit below would skip deferred calls.
		inputFile.Close()
	}
	// A short or empty input is expected (the buffer is only an upper bound),
	// so EOF and ErrUnexpectedEOF are not failures. Anything else is a real
	// read error.
	if err != nil && !errors.Is(err, io.EOF) && !errors.Is(err, io.ErrUnexpectedEOF) {
		fmt.Fprintln(os.Stderr, err)
		os.Exit(-1)
	}

	input := buffer[:size]
	detector := chardet.NewHtmlDetector()
	if *forText {
		detector = chardet.NewTextDetector()
	}
	detectResult, err := detector.DetectBest(input)
	if err != nil {
		fmt.Fprintln(os.Stderr, err)
		os.Exit(-1)
	}

	fmt.Print(detectResult.Charset)
}
Example #3
0
// TestDetector runs the chardet detectors over the sample files in testdata
// and checks the best match's charset and language against the expected
// values. At most 32 KiB of each file is fed to the detector.
func TestDetector(t *testing.T) {
	type testCase struct {
		File     string
		IsHtml   bool
		Charset  string
		Language string
	}
	cases := []testCase{
		{"utf8.html", true, "UTF-8", ""},
		{"utf8_bom.html", true, "UTF-8", ""},
		{"8859_1_en.html", true, "ISO-8859-1", "en"},
		{"8859_1_da.html", true, "ISO-8859-1", "da"},
		{"8859_1_de.html", true, "ISO-8859-1", "de"},
		{"8859_1_es.html", true, "ISO-8859-1", "es"},
		{"8859_1_fr.html", true, "ISO-8859-1", "fr"},
		{"8859_1_pt.html", true, "ISO-8859-1", "pt"},
		{"shift_jis.html", true, "Shift_JIS", "ja"},
		{"gb18030.html", true, "GB-18030", "zh"},
		{"euc_jp.html", true, "EUC-JP", "ja"},
		{"euc_kr.html", true, "EUC-KR", "ko"},
		{"big5.html", true, "Big5", "zh"},
	}

	textDetector := chardet.NewTextDetector()
	htmlDetector := chardet.NewHtmlDetector()
	buffer := make([]byte, 32<<10)
	for _, d := range cases {
		f, err := os.Open(filepath.Join("testdata", d.File))
		if err != nil {
			t.Fatal(err)
		}
		size, err := io.ReadFull(f, buffer)
		// Close immediately instead of deferring: a defer inside the loop
		// would keep every file open until the whole test returns.
		f.Close()
		// Files shorter than the buffer end with EOF / ErrUnexpectedEOF, which
		// is fine; any other error means the sample could not be read.
		if err != nil && err != io.EOF && err != io.ErrUnexpectedEOF {
			t.Fatal(err)
		}
		input := buffer[:size]

		detector := textDetector
		if d.IsHtml {
			detector = htmlDetector
		}
		result, err := detector.DetectBest(input)
		if err != nil {
			t.Fatal(err)
		}
		if result.Charset != d.Charset {
			t.Errorf("Expected charset %s, actual %s", d.Charset, result.Charset)
		}
		if result.Language != d.Language {
			t.Errorf("Expected language %s, actual %s", d.Language, result.Language)
		}
	}
}
Example #4
0
// GetContentType sniffs the content type of data with http.DetectContentType
// and refines the result:
//
//   - plain text is reported as "text/javascript" when url ends in ".js", or
//     as "text/css" when url contains "css" (the ".js" suffix takes priority);
//   - HTML gets a ";charset=" suffix naming the charset chardet's HTML detector
//     considers the best match, when detection succeeds.
//
// In every other case the sniffed type is returned unchanged.
func GetContentType(data []byte, url string) string {
	sniffed := http.DetectContentType(data)

	switch {
	case strings.Contains(sniffed, "text/plain"):
		if strings.HasSuffix(url, ".js") {
			return "text/javascript"
		}
		if strings.Contains(url, "css") {
			return "text/css"
		}

	case strings.Contains(sniffed, "text/html"):
		best, err := chardet.NewHtmlDetector().DetectBest(data)
		if err != nil {
			// Detection failed: keep the sniffed type as it is.
			return sniffed
		}
		return "text/html;charset=" + best.Charset
	}

	return sniffed
}
Example #5
0
// NewHTMLCleaner returns an HTMLCleaner whose detector is a chardet HTML
// detector and whose gbk field is a mahonia decoder for "gb18030".
func NewHTMLCleaner() *HTMLCleaner {
	return &HTMLCleaner{
		detector: chardet.NewHtmlDetector(),
		gbk:      mahonia.NewDecoder("gb18030"),
	}
}
Example #6
0
// fetchPage downloads url with a GET request and returns the response body as
// a string. It returns "" when the request cannot be built or sent, when the
// status code is not 200, or when no usable encoding is found.
//
// The charset is taken from the Content-Type header via fetchCharset:
//   - "utf-8" bodies are returned as they are;
//   - charsets listed in chartypeSet are converted with charset.NewReader;
//   - otherwise the body is read fully and its charset is guessed with
//     chardet's HTML detector, then handled by the same two rules.
//
// Failures are reported through the standard logger.
func fetchPage(url string) string {
	req, err := http.NewRequest("GET", url, nil)
	if err != nil {
		// A malformed URL yields a nil request; bail out before touching it.
		log.Println("HTTP_ERROR:", err)
		return ""
	}
	req.Header.Set("User-Agent", scraperConfig.UserAgent)

	httpClient := http.Client{
		Transport: &http.Transport{
			Dial:              timeoutDialler(10 * time.Second),
			DisableKeepAlives: true,
		},
	}

	resp, err := httpClient.Do(req)
	if err != nil {
		log.Println("HTTP_ERROR:", err)
		return ""
	}
	defer resp.Body.Close()

	if resp.StatusCode != 200 {
		return ""
	}

	var dataStream io.Reader

	switch charType := fetchCharset(resp.Header.Get("Content-Type")); {

	case charType == "utf-8":
		dataStream = resp.Body

	case chartypeSet[charType]:
		// charset in available list for conversion
		charsetStream, err := charset.NewReader(charType, resp.Body)
		if err != nil {
			log.Println("ENCODING_ERROR:", err)
		} else {
			dataStream = charsetStream
		}

	default:
		// need to guess chartype
		bodyBytes, err := ioutil.ReadAll(resp.Body)
		if err != nil {
			// Best effort: keep going with whatever was read.
			log.Println("IO_ERROR:", err)
		}

		detector := chardet.NewHtmlDetector()
		result, err := detector.DetectBest(bodyBytes)
		if err != nil {
			log.Println("ENCODING_ERROR no_known_encoding", url)
			return ""
		}

		charType = strings.ToLower(result.Charset)
		switch {
		case charType == "utf-8":
			// Same rule as for a declared utf-8 charset: no conversion needed.
			dataStream = bytes.NewReader(bodyBytes)

		case chartypeSet[charType]:
			// If the converter cannot be created, the undecoded bytes are
			// still returned, as before.
			dataStream = bytes.NewReader(bodyBytes)
			charsetStream, err := charset.NewReader(charType, dataStream)
			if err != nil {
				log.Println("ENCODING_ERROR:", err)
			} else {
				dataStream = charsetStream
			}
		}
	}

	if dataStream == nil {
		log.Println("ENCODING_ERROR: no suitable encoding found for", url)
		return ""
	}

	bodyBytes, err := ioutil.ReadAll(dataStream)
	if err != nil {
		// Best effort: return whatever could be read or converted.
		log.Println("ERROR:", err)
	}
	return string(bodyBytes)
}