func NewUTF8Reader(label string, r io.Reader) (io.Reader, error) { e, _ := charset.Lookup(label) if e == nil { return nil, fmt.Errorf("unsupported charset: %q", label) } return transform.NewReader(r, unicode.BOMOverride(e.NewDecoder())), nil }
func TestBOMOverride(t *testing.T) { dec := unicode.BOMOverride(charmap.CodePage437.NewDecoder()) dst := make([]byte, 100) for i, tc := range []struct { src string atEOF bool dst string nSrc int err error }{ 0: {"H\x82ll\x93", true, "Héllô", 5, nil}, 1: {"\uFEFFHéllö", true, "Héllö", 10, nil}, 2: {"\xFE\xFF\x00H\x00e\x00l\x00l\x00o", true, "Hello", 12, nil}, 3: {"\xFF\xFEH\x00e\x00l\x00l\x00o\x00", true, "Hello", 12, nil}, 4: {"\uFEFF", true, "", 3, nil}, 5: {"\xFE\xFF", true, "", 2, nil}, 6: {"\xFF\xFE", true, "", 2, nil}, 7: {"\xEF\xBB", true, "\u2229\u2557", 2, nil}, 8: {"\xEF", true, "\u2229", 1, nil}, 9: {"", true, "", 0, nil}, 10: {"\xFE", true, "\u25a0", 1, nil}, 11: {"\xFF", true, "\u00a0", 1, nil}, 12: {"\xEF\xBB", false, "", 0, transform.ErrShortSrc}, 13: {"\xEF", false, "", 0, transform.ErrShortSrc}, 14: {"", false, "", 0, transform.ErrShortSrc}, 15: {"\xFE", false, "", 0, transform.ErrShortSrc}, 16: {"\xFF", false, "", 0, transform.ErrShortSrc}, 17: {"\xFF\xFE", false, "", 0, transform.ErrShortSrc}, } { dec.Reset() nDst, nSrc, err := dec.Transform(dst, []byte(tc.src), tc.atEOF) got := string(dst[:nDst]) if nSrc != tc.nSrc { t.Errorf("%d: nSrc: got %d; want %d", i, nSrc, tc.nSrc) } if got != tc.dst { t.Errorf("%d: got %+q; want %+q", i, got, tc.dst) } if err != tc.err { t.Errorf("%d: error: got %v; want %v", i, err, tc.err) } } }
// Visit in a FileVisitor is just taking care of opening/closing files func (v *FileVisitor) Visit(fn VisitorFunc) error { var f *os.File if v.Path == constSTDINstr { f = os.Stdin } else { var err error if f, err = os.Open(v.Path); err != nil { return err } } defer f.Close() // TODO: Consider adding a flag to force to UTF16, apparently some // Windows tools don't write the BOM utf16bom := unicode.BOMOverride(unicode.UTF8.NewDecoder()) v.StreamVisitor.Reader = transform.NewReader(f, utf16bom) return v.StreamVisitor.Visit(fn) }
// NewReader wraps a Reader to decode Unicode to UTF-8 as it reads. func NewReader(r io.Reader, d EncodingHint) io.Reader { var decoder *encoding.Decoder switch d { case UTF8: // Make a transformer that assumes UTF-8 but abides by the BOM. decoder = unicode.UTF8.NewDecoder() case UTF16LE: // Make an tranformer that decodes MS-Windows (16LE) UTF files: winutf := unicode.UTF16(unicode.LittleEndian, unicode.IgnoreBOM) // Make a transformer that is like winutf, but abides by BOM if found: decoder = winutf.NewDecoder() case UTF16BE: // Make an tranformer that decodes UTF-16BE files: utf16be := unicode.UTF16(unicode.BigEndian, unicode.IgnoreBOM) // Make a transformer that is like utf16be, but abides by BOM if found: decoder = utf16be.NewDecoder() } // Make a Reader that uses utf16bom: return transform.NewReader(r, unicode.BOMOverride(decoder)) }
// Creates a scanner similar to os.Open() but decodes the file as UTF-16 // if the special byte order mark is present. func newScannerUTF16or8(filename string) (utfScanner, error) { // Read the file into a []byte: file, err := os.Open(filename) if err != nil { return nil, err } // Check for BOM marker := make([]byte, 2) numread, err := io.ReadAtLeast(file, marker, 2) file.Seek(0, 0) if numread == 2 && err == nil && ((marker[0] == 0xFE && marker[1] == 0xFF) || (marker[0] == 0xFF && marker[1] == 0xFE)) { // Make an tranformer that converts MS-Win default to UTF8: win16be := unicode.UTF16(unicode.BigEndian, unicode.UseBOM) // Make a transformer that is like win16be, but abides by BOM: utf16bom := unicode.BOMOverride(win16be.NewDecoder()) // Make a Reader that uses utf16bom: unicodeReader := transform.NewReader(file, utf16bom) return unicodeReader, nil } return file, nil }
func ConvToUTF8(b []byte, e encoding.Encoding) (result []byte, err error) { reader := transform.NewReader(bytes.NewReader(b), unicode.BOMOverride(e.NewDecoder())) return ioutil.ReadAll(reader) }
// newUnicodeReader wraps r to decode Unicode to UTF-8 as its reads. func newUnicodeReader(r io.Reader) io.Reader { // fallback to r if no BOM sequence is located in the source text. t := unicode.BOMOverride(transform.Nop) return transform.NewReader(r, t) }