func defaultCharsetReader(cs string, input io.Reader) (io.Reader, error) { e, _ := charset.Lookup(cs) if e == nil { return nil, fmt.Errorf("cannot decode charset %v", cs) } return transform.NewReader(input, e.NewDecoder()), nil }
func ParseFeed(c appengine.Context, contentType, origUrl, fetchUrl string, body []byte) (*Feed, []*Story, error) { cr := defaultCharsetReader if !bytes.EqualFold(body[:len(xml.Header)], []byte(xml.Header)) { enc, err := encodingReader(body, contentType) if err != nil { return nil, nil, err } if enc != encoding.Nop { cr = nilCharsetReader body, err = ioutil.ReadAll(transform.NewReader(bytes.NewReader(body), enc.NewDecoder())) if err != nil { return nil, nil, err } } } var feed *Feed var stories []*Story var atomerr, rsserr, rdferr error feed, stories, atomerr = parseAtom(c, body, cr) if feed == nil { feed, stories, rsserr = parseRSS(c, body, cr) } if feed == nil { feed, stories, rdferr = parseRDF(c, body, cr) } if feed == nil { c.Warningf("atom parse error: %s", atomerr.Error()) c.Warningf("xml parse error: %s", rsserr.Error()) c.Warningf("rdf parse error: %s", rdferr.Error()) return nil, nil, fmt.Errorf("Could not parse feed data") } feed.Url = origUrl return parseFix(c, feed, stories, fetchUrl) }
// TestNonRepertoire tests that codes outside of an Encoding's repertoire are // converted: // - to the Unicode replacement character '\ufffd' when decoding to UTF-8, // - to the ASCII substitute character '\x1a' when encoding from UTF-8. func TestNonRepertoire(t *testing.T) { testCases := []struct { e encoding.Encoding dSrc, eSrc string }{ {charmap.Windows1252, "\x81", "갂"}, {japanese.EUCJP, "\xfe\xfc", "갂"}, {japanese.ISO2022JP, "\x1b$B\x7e\x7e", "갂"}, {japanese.ShiftJIS, "\xef\xfc", "갂"}, {korean.EUCKR, "\xfe\xfe", "א"}, {simplifiedchinese.GBK, "\xfe\xfe", "갂"}, {simplifiedchinese.HZGB2312, "~{z~", "갂"}, {traditionalchinese.Big5, "\x81\x40", "갂"}, } for _, tc := range testCases { for _, direction := range []string{"Decode", "Encode"} { enc, want, src := (transform.Transformer)(nil), "", "" if direction == "Decode" { enc, want, src = tc.e.NewDecoder(), "\ufffd", tc.dSrc } else { enc, want, src = tc.e.NewEncoder(), "\x1a", tc.eSrc } dst, err := ioutil.ReadAll(transform.NewReader(strings.NewReader(src), enc)) if err != nil { t.Errorf("%s %v: %v", direction, tc.e, err) continue } if got := string(dst); got != want { t.Errorf("%s %v:\ngot %q\nwant %q", direction, tc.e, got, want) continue } } } }
// NewReaderLabel returns a reader that converts from the specified charset to // UTF-8. It uses Lookup to find the encoding that corresponds to label, and // returns an error if Lookup returns nil. It is suitable for use as // encoding/xml.Decoder's CharsetReader function. func NewReaderLabel(label string, input io.Reader) (io.Reader, error) { e, _ := Lookup(label) if e == nil { return nil, fmt.Errorf("unsupported charset: %q", label) } return transform.NewReader(input, e.NewDecoder()), nil }
func TestBasics(t *testing.T) { for _, tc := range basicTestCases { for _, direction := range []string{"Decode", "Encode"} { newTransformer, want, src := (func() transform.Transformer)(nil), "", "" wPrefix, sPrefix, wSuffix, sSuffix := "", "", "", "" if direction == "Decode" { newTransformer, want, src = tc.e.NewDecoder, tc.utf8, tc.encoded wPrefix, sPrefix, wSuffix, sSuffix = "", tc.encPrefix, "", tc.encSuffix } else { newTransformer, want, src = tc.e.NewEncoder, tc.encoded, tc.utf8 wPrefix, sPrefix, wSuffix, sSuffix = tc.encPrefix, "", tc.encSuffix, "" } dst := make([]byte, len(wPrefix)+len(want)+len(wSuffix)) nDst, nSrc, err := newTransformer().Transform(dst, []byte(sPrefix+src+sSuffix), true) if err != nil { t.Errorf("%v: %s: %v", tc.e, direction, err) continue } if nDst != len(wPrefix)+len(want)+len(wSuffix) { t.Errorf("%v: %s: nDst got %d, want %d", tc.e, direction, nDst, len(wPrefix)+len(want)+len(wSuffix)) continue } if nSrc != len(sPrefix)+len(src)+len(sSuffix) { t.Errorf("%v: %s: nSrc got %d, want %d", tc.e, direction, nSrc, len(sPrefix)+len(src)+len(sSuffix)) continue } if got := string(dst); got != wPrefix+want+wSuffix { t.Errorf("%v: %s:\ngot %q\nwant %q", tc.e, direction, got, wPrefix+want+wSuffix) continue } for _, n := range []int{0, 1, 2, 10, 123, 4567} { input := sPrefix + strings.Repeat(src, n) + sSuffix sr := strings.NewReader(input) g, err := ioutil.ReadAll(transform.NewReader(sr, newTransformer())) if err != nil { t.Errorf("%v: %s: ReadAll: n=%d: %v", tc.e, direction, n, err) continue } if len(g) == 0 && len(input) == 0 { // If the input is empty then the output can be empty, // regardless of whatever wPrefix is. continue } got1, want1 := string(g), wPrefix+strings.Repeat(want, n)+wSuffix if got1 != want1 { t.Errorf("%v: %s: ReadAll: n=%d\ngot %q\nwant %q", tc.e, direction, n, trim(got1), trim(want1)) continue } } } } }
func benchmark(b *testing.B, direction string, enc encoding.Encoding) { _, src, newTransformer, err := load(direction, enc) if err != nil { b.Fatal(err) } b.SetBytes(int64(len(src))) b.ResetTimer() for i := 0; i < b.N; i++ { r := transform.NewReader(bytes.NewReader(src), newTransformer()) io.Copy(ioutil.Discard, r) } }
// TestBig5CircumflexAndMacron tests the special cases listed in // http://encoding.spec.whatwg.org/#big5 // Note that these special cases aren't preserved by round-tripping through // decoding and encoding (since // http://encoding.spec.whatwg.org/index-big5.txt does not have an entry for // U+0304 or U+030C), so we can't test this in TestBasics. func TestBig5CircumflexAndMacron(t *testing.T) { src := "\x88\x5f\x88\x60\x88\x61\x88\x62\x88\x63\x88\x64\x88\x65\x88\x66 " + "\x88\xa2\x88\xa3\x88\xa4\x88\xa5\x88\xa6" want := "ÓǑÒ\u00ca\u0304Ế\u00ca\u030cỀÊ " + "ü\u00ea\u0304ế\u00ea\u030cề" dst, err := ioutil.ReadAll(transform.NewReader( strings.NewReader(src), traditionalchinese.Big5.NewDecoder())) if err != nil { t.Fatal(err) } if got := string(dst); got != want { t.Fatalf("\ngot %q\nwant %q", got, want) } }
// NewReader returns an io.Reader that converts the content of r to UTF-8. // It calls DetermineEncoding to find out what r's encoding is. func NewReader(r io.Reader, contentType string) (io.Reader, error) { preview := make([]byte, 1024) n, err := io.ReadFull(r, preview) switch { case err == io.ErrUnexpectedEOF: preview = preview[:n] r = bytes.NewReader(preview) case err != nil: return nil, err default: r = io.MultiReader(bytes.NewReader(preview), r) } if e, _, _ := DetermineEncoding(preview, contentType); e != encoding.Nop { r = transform.NewReader(r, e.NewDecoder()) } return r, nil }
func TestFiles(t *testing.T) { for _, dir := range []string{"Decode", "Encode"} { for _, tf := range testdataFiles { dst, src, newTransformer, err := load(dir, tf.enc) if err != nil { t.Errorf("%s, %s: load: %v", dir, tf.enc, err) continue } buf := bytes.NewBuffer(nil) r := transform.NewReader(bytes.NewReader(src), newTransformer()) if _, err := io.Copy(buf, r); err != nil { t.Errorf("%s, %s: copy: %v", dir, tf.enc, err) continue } if !bytes.Equal(buf.Bytes(), dst) { t.Errorf("%s, %s: transformed bytes did not match golden file", dir, tf.enc) continue } } } }
func TestReplacement(t *testing.T) { for _, direction := range []string{"Decode", "Encode"} { enc, want := (transform.Transformer)(nil), "" if direction == "Decode" { enc = encoding.Replacement.NewDecoder() want = "\ufffd" } else { enc = encoding.Replacement.NewEncoder() want = "AB\x00CD\ufffdYZ" } sr := strings.NewReader("AB\x00CD\x80YZ") g, err := ioutil.ReadAll(transform.NewReader(sr, enc)) if err != nil { t.Errorf("%s: ReadAll: %v", direction, err) continue } if got := string(g); got != want { t.Errorf("%s:\ngot %q\nwant %q", direction, got, want) continue } } }
func TestReader(t *testing.T) { switch runtime.GOOS { case "nacl": // platforms that don't permit direct file system access t.Skipf("not supported on %q", runtime.GOOS) } for _, tc := range sniffTestCases { content, err := ioutil.ReadFile("testdata/" + tc.filename) if err != nil { t.Errorf("%s: error reading file: %v", tc.filename, err) continue } r, err := NewReader(bytes.NewReader(content), tc.declared) if err != nil { t.Errorf("%s: error creating reader: %v", tc.filename, err) continue } got, err := ioutil.ReadAll(r) if err != nil { t.Errorf("%s: error reading from charset.NewReader: %v", tc.filename, err) continue } e, _ := Lookup(tc.want) want, err := ioutil.ReadAll(transform.NewReader(bytes.NewReader(content), e.NewDecoder())) if err != nil { t.Errorf("%s: error decoding with hard-coded charset name: %v", tc.filename, err) continue } if !bytes.Equal(got, want) { t.Errorf("%s: got %q, want %q", tc.filename, got, want) continue } } }
func transformString(t transform.Transformer, s string) (string, error) { r := transform.NewReader(strings.NewReader(s), t) b, err := ioutil.ReadAll(r) return string(b), err }
func ExampleDecodeWindows1252() { sr := strings.NewReader("Gar\xe7on !") tr := transform.NewReader(sr, charmap.Windows1252.NewDecoder()) io.Copy(os.Stdout, tr) // Output: Garçon ! }