Example #1
0
// DetermineEncoding picks the character encoding to use for a document whose
// leading bytes are head and whose declared MIME type is givenType.
//
// The candidates are tried in this order, and the first usable one wins:
//
//  1. the result of charset.DetermineEncoding, when it reports certainty;
//  2. the "encoding" attribute of a leading "<?xml ... ?>" declaration, when
//     charset.Lookup recognises its value;
//  3. the best guess of a chardet detector (the HTML detector for HTML/XML
//     types, the text detector otherwise), when charset.Lookup recognises it;
//  4. the uncertain result from step 1.
//
// If the chardet detector fails, a message is written to os.Stderr and the
// uncertain result from step 1 is returned.
func DetermineEncoding(givenType *ParsedMimeType, head []byte) encoding.Encoding {
	// Step 1: sniffing along the lines of the WHATWG "encoding sniffing
	// algorithm". The result doubles as the fallback for every later failure.
	enc, _, certain := charset.DetermineEncoding(head, givenType.String())
	if certain {
		return enc
	}

	// Step 2: an XML declaration is consulted before any byte-level
	// statistical detection.
	const xmlDeclOpen = "<?xml "
	if text := strings.TrimSpace(string(head)); strings.HasPrefix(text, xmlDeclOpen) {
		decl := text[len(xmlDeclOpen):]
		// Only a declaration with a non-empty attribute section is inspected.
		if closeAt := strings.Index(decl, "?>"); closeAt > 0 {
			for _, attr := range attrRe.FindAllStringSubmatch(decl[:closeAt], -1) {
				if attr[1] != "encoding" {
					continue
				}
				if declared, _ := charset.Lookup(attr[2]); declared != nil {
					return declared
				}
			}
		}
	}

	// Step 3: byte-level detection, with the detector chosen by MIME type.
	var detector *chardet.Detector
	switch {
	case givenType.IsHtml(), givenType.IsXml():
		detector = chardet.NewHtmlDetector()
	default:
		detector = chardet.NewTextDetector()
	}

	best, err := detector.DetectBest(head)
	if err != nil {
		fmt.Fprintln(os.Stderr, "Error detecting character set, using default.")
		return enc
	}

	// The detector's "GB-18030" spelling is rewritten to "GB18030" before the
	// lookup.
	name := best.Charset
	if name == "GB-18030" {
		name = "GB18030"
	}
	if guessed, _ := charset.Lookup(name); guessed != nil {
		return guessed
	}

	// Step 4: nothing better was found.
	return enc
}
Example #2
0
// encodingReader returns a reader over body, wrapped in a decoder for the
// encoding that charset.DetermineEncoding reports for the first 1024 bytes of
// body together with contentType.
//
// No decoder is added when the chosen encoding is encoding.Nop. When the
// sniffed result is an uncertain Windows-1252 guess but the whole body is
// valid UTF-8, encoding.Nop is used instead, so the body is passed through
// unchanged.
//
// An empty body yields an empty reader rather than an error. The returned
// error is always nil; it is kept so the signature stays compatible with
// existing callers.
//
// The returned reader reads from body directly, so body must not be modified
// until the reader has been fully consumed.
func encodingReader(body []byte, contentType string) (io.Reader, error) {
	// Size of the leading window handed to the encoding sniffer.
	const sniffLen = 1024

	// body is already in memory, so the sniffing window is just a sub-slice;
	// there is no need to copy it into a scratch buffer and stitch the stream
	// back together afterwards.
	preview := body
	if len(preview) > sniffLen {
		preview = preview[:sniffLen]
	}

	e, _, certain := charset.DetermineEncoding(preview, contentType)
	// The preview alone cannot prove the body is UTF-8, so the entire body is
	// validated before overriding the uncertain Windows-1252 guess.
	if !certain && e == charmap.Windows1252 && utf8.Valid(body) {
		e = encoding.Nop
	}

	var r io.Reader = bytes.NewReader(body)
	if e != encoding.Nop {
		r = transform.NewReader(r, e.NewDecoder())
	}
	return r, nil
}