Beispiel #1
0
// defaultCharsetReader wraps input in a reader that runs it through the
// decoder of the encoding charset.Lookup returns for cs. An error is returned
// when Lookup yields no encoding for cs.
func defaultCharsetReader(cs string, input io.Reader) (io.Reader, error) {
	enc, _ := charset.Lookup(cs)
	if enc != nil {
		return transform.NewReader(input, enc.NewDecoder()), nil
	}
	return nil, fmt.Errorf("cannot decode charset %v", cs)
}
Beispiel #2
0
// ParseFeed parses body as an Atom, RSS, or RDF feed, trying the parsers in
// that order and keeping the first one that yields a feed.
//
// When body does not begin with the standard XML header, encodingReader is
// consulted (with contentType). Unless it reports encoding.Nop, body is run
// through that encoding's decoder up front and the parsers are handed
// nilCharsetReader instead of defaultCharsetReader.
//
// On success the feed's Url is set to origUrl and the result is passed through
// parseFix together with fetchUrl. If no parser produces a feed, the three
// parser errors are logged as warnings on c and a generic error is returned.
func ParseFeed(c appengine.Context, contentType, origUrl, fetchUrl string, body []byte) (*Feed, []*Story, error) {
	cr := defaultCharsetReader
	// A body shorter than the XML header cannot start with it; checking the
	// length first avoids slicing past the end of body.
	hasXMLHeader := len(body) >= len(xml.Header) &&
		bytes.EqualFold(body[:len(xml.Header)], []byte(xml.Header))
	if !hasXMLHeader {
		enc, err := encodingReader(body, contentType)
		if err != nil {
			return nil, nil, err
		}
		if enc != encoding.Nop {
			// body is decoded here, so the parsers must not decode it again.
			cr = nilCharsetReader
			body, err = ioutil.ReadAll(transform.NewReader(bytes.NewReader(body), enc.NewDecoder()))
			if err != nil {
				return nil, nil, err
			}
		}
	}
	var feed *Feed
	var stories []*Story
	var atomerr, rsserr, rdferr error
	feed, stories, atomerr = parseAtom(c, body, cr)
	if feed == nil {
		feed, stories, rsserr = parseRSS(c, body, cr)
	}
	if feed == nil {
		feed, stories, rdferr = parseRDF(c, body, cr)
	}
	if feed == nil {
		// Format the errors with %v instead of calling Error() on them:
		// formatting a nil error is safe, whereas calling Error() on it panics.
		c.Warningf("atom parse error: %v", atomerr)
		c.Warningf("xml parse error: %v", rsserr)
		c.Warningf("rdf parse error: %v", rdferr)
		return nil, nil, fmt.Errorf("Could not parse feed data")
	}
	feed.Url = origUrl
	return parseFix(c, feed, stories, fetchUrl)
}
Beispiel #3
0
// TestNonRepertoire checks how each encoding treats codes that fall outside
// its repertoire:
//   - decoding to UTF-8 must yield the Unicode replacement character '\ufffd',
//   - encoding from UTF-8 must yield the ASCII substitute character '\x1a'.
func TestNonRepertoire(t *testing.T) {
	cases := []struct {
		e          encoding.Encoding
		dSrc, eSrc string
	}{
		{charmap.Windows1252, "\x81", "갂"},
		{japanese.EUCJP, "\xfe\xfc", "갂"},
		{japanese.ISO2022JP, "\x1b$B\x7e\x7e", "갂"},
		{japanese.ShiftJIS, "\xef\xfc", "갂"},
		{korean.EUCKR, "\xfe\xfe", "א"},
		{simplifiedchinese.GBK, "\xfe\xfe", "갂"},
		{simplifiedchinese.HZGB2312, "~{z~", "갂"},
		{traditionalchinese.Big5, "\x81\x40", "갂"},
	}
	for _, c := range cases {
		for _, dir := range []string{"Decode", "Encode"} {
			// Pick the transformer, its input and the expected output for
			// this direction.
			var tr transform.Transformer
			var src, want string
			switch dir {
			case "Decode":
				tr, src, want = c.e.NewDecoder(), c.dSrc, "\ufffd"
			default:
				tr, src, want = c.e.NewEncoder(), c.eSrc, "\x1a"
			}

			out, err := ioutil.ReadAll(transform.NewReader(strings.NewReader(src), tr))
			if err != nil {
				t.Errorf("%s %v: %v", dir, c.e, err)
				continue
			}
			if got := string(out); got != want {
				t.Errorf("%s %v:\ngot  %q\nwant %q", dir, c.e, got, want)
			}
		}
	}
}
Beispiel #4
0
// NewReaderLabel wraps input in a reader that converts from the charset named
// by label to UTF-8. The encoding is found via Lookup; when Lookup yields nil,
// an error is returned instead. It is suitable for use as
// encoding/xml.Decoder's CharsetReader function.
func NewReaderLabel(label string, input io.Reader) (io.Reader, error) {
	enc, _ := Lookup(label)
	if enc != nil {
		return transform.NewReader(input, enc.NewDecoder()), nil
	}
	return nil, fmt.Errorf("unsupported charset: %q", label)
}
Beispiel #5
0
// TestBasics runs every entry of basicTestCases in both directions. Each
// direction is first checked with a single direct Transform call (verifying
// nDst, nSrc and the produced bytes) and then streamed through
// transform.NewReader with the payload repeated a varying number of times.
func TestBasics(t *testing.T) {
	for _, tc := range basicTestCases {
		for _, direction := range []string{"Decode", "Encode"} {
			var newTransformer func() transform.Transformer
			var want, src string
			var wPrefix, sPrefix, wSuffix, sSuffix string
			switch direction {
			case "Decode":
				// Prefix and suffix belong to the encoded side, i.e. the source.
				newTransformer, want, src = tc.e.NewDecoder, tc.utf8, tc.encoded
				sPrefix, sSuffix = tc.encPrefix, tc.encSuffix
			default:
				// Prefix and suffix belong to the encoded side, i.e. the output.
				newTransformer, want, src = tc.e.NewEncoder, tc.encoded, tc.utf8
				wPrefix, wSuffix = tc.encPrefix, tc.encSuffix
			}

			fullSrc := sPrefix + src + sSuffix
			fullWant := wPrefix + want + wSuffix

			// One-shot Transform into a buffer of exactly the expected size.
			dst := make([]byte, len(fullWant))
			nDst, nSrc, err := newTransformer().Transform(dst, []byte(fullSrc), true)
			switch {
			case err != nil:
				t.Errorf("%v: %s: %v", tc.e, direction, err)
				continue
			case nDst != len(fullWant):
				t.Errorf("%v: %s: nDst got %d, want %d",
					tc.e, direction, nDst, len(fullWant))
				continue
			case nSrc != len(fullSrc):
				t.Errorf("%v: %s: nSrc got %d, want %d",
					tc.e, direction, nSrc, len(fullSrc))
				continue
			case string(dst) != fullWant:
				t.Errorf("%v: %s:\ngot  %q\nwant %q",
					tc.e, direction, string(dst), fullWant)
				continue
			}

			// Streaming check with the payload repeated n times.
			for _, n := range []int{0, 1, 2, 10, 123, 4567} {
				input := sPrefix + strings.Repeat(src, n) + sSuffix
				g, err := ioutil.ReadAll(transform.NewReader(strings.NewReader(input), newTransformer()))
				if err != nil {
					t.Errorf("%v: %s: ReadAll: n=%d: %v", tc.e, direction, n, err)
					continue
				}
				if len(g) == 0 && len(input) == 0 {
					// If the input is empty then the output can be empty,
					// regardless of whatever wPrefix is.
					continue
				}
				if got1, want1 := string(g), wPrefix+strings.Repeat(want, n)+wSuffix; got1 != want1 {
					t.Errorf("%v: %s: ReadAll: n=%d\ngot  %q\nwant %q",
						tc.e, direction, n, trim(got1), trim(want1))
				}
			}
		}
	}
}
Beispiel #6
0
// benchmark measures streaming src through a fresh transformer on every
// iteration. load supplies the input and the transformer factory for the given
// direction and encoding; the bytes-per-op figure is set from len(src).
func benchmark(b *testing.B, direction string, enc encoding.Encoding) {
	_, src, newTransformer, err := load(direction, enc)
	if err != nil {
		b.Fatal(err)
	}
	b.SetBytes(int64(len(src)))
	// Exclude the cost of load from the measurement.
	b.ResetTimer()
	for i := 0; i < b.N; i++ {
		r := transform.NewReader(bytes.NewReader(src), newTransformer())
		// A failed copy stops early and would make the reported throughput
		// meaningless, so abort instead of silently ignoring the error.
		if _, err := io.Copy(ioutil.Discard, r); err != nil {
			b.Fatal(err)
		}
	}
}
Beispiel #7
0
// TestBig5CircumflexAndMacron tests the special cases listed in
// http://encoding.spec.whatwg.org/#big5
// Note that these special cases aren't preserved by round-tripping through
// decoding and encoding (since
// http://encoding.spec.whatwg.org/index-big5.txt does not have an entry for
// U+0304 or U+030C), so we can't test this in TestBasics.
func TestBig5CircumflexAndMacron(t *testing.T) {
	src := "\x88\x5f\x88\x60\x88\x61\x88\x62\x88\x63\x88\x64\x88\x65\x88\x66 " +
		"\x88\xa2\x88\xa3\x88\xa4\x88\xa5\x88\xa6"
	want := "ÓǑÒ\u00ca\u0304Ế\u00ca\u030cỀÊ " +
		"ü\u00ea\u0304ế\u00ea\u030cề"
	dst, err := ioutil.ReadAll(transform.NewReader(
		strings.NewReader(src), traditionalchinese.Big5.NewDecoder()))
	if err != nil {
		t.Fatal(err)
	}
	if got := string(dst); got != want {
		t.Fatalf("\ngot  %q\nwant %q", got, want)
	}
}
Beispiel #8
0
// NewReader returns an io.Reader that converts the content of r to UTF-8.
// It reads up to 1024 bytes of r as a preview and hands them, together with
// contentType, to DetermineEncoding to find out what r's encoding is. When the
// result is encoding.Nop the content is returned without a decoding layer.
//
// Any read error other than a short read is returned as-is; in particular, an
// r that yields no bytes at all results in io.EOF.
func NewReader(r io.Reader, contentType string) (io.Reader, error) {
	head := make([]byte, 1024)
	n, err := io.ReadFull(r, head)
	if err != nil && err != io.ErrUnexpectedEOF {
		return nil, err
	}
	if err == io.ErrUnexpectedEOF {
		// r held fewer bytes than the preview size and is now exhausted, so
		// the preview is the whole content.
		head = head[:n]
		r = bytes.NewReader(head)
	} else {
		// Put the consumed preview back in front of the rest of r.
		r = io.MultiReader(bytes.NewReader(head), r)
	}

	e, _, _ := DetermineEncoding(head, contentType)
	if e == encoding.Nop {
		return r, nil
	}
	return transform.NewReader(r, e.NewDecoder()), nil
}
Beispiel #9
0
// TestFiles streams the source bytes that load returns for every entry of
// testdataFiles, in both directions, through the matching transformer and
// compares the output with the expected bytes that load also returns.
func TestFiles(t *testing.T) {
	for _, dir := range []string{"Decode", "Encode"} {
		for _, tf := range testdataFiles {
			want, src, newTransformer, err := load(dir, tf.enc)
			if err != nil {
				t.Errorf("%s, %s: load: %v", dir, tf.enc, err)
				continue
			}
			var got bytes.Buffer
			_, err = io.Copy(&got, transform.NewReader(bytes.NewReader(src), newTransformer()))
			if err != nil {
				t.Errorf("%s, %s: copy: %v", dir, tf.enc, err)
				continue
			}
			if !bytes.Equal(got.Bytes(), want) {
				t.Errorf("%s, %s: transformed bytes did not match golden file", dir, tf.enc)
			}
		}
	}
}
Beispiel #10
0
// TestReplacement feeds a fixed input through encoding.Replacement's decoder
// and encoder. The decoder is expected to produce a single "\ufffd"; the
// encoder is expected to reproduce the input with its \x80 byte replaced by
// "\ufffd".
func TestReplacement(t *testing.T) {
	const input = "AB\x00CD\x80YZ"
	for _, direction := range []string{"Decode", "Encode"} {
		var tr transform.Transformer
		var want string
		switch direction {
		case "Decode":
			tr, want = encoding.Replacement.NewDecoder(), "\ufffd"
		default:
			tr, want = encoding.Replacement.NewEncoder(), "AB\x00CD\ufffdYZ"
		}

		out, err := ioutil.ReadAll(transform.NewReader(strings.NewReader(input), tr))
		if err != nil {
			t.Errorf("%s: ReadAll: %v", direction, err)
			continue
		}
		if got := string(out); got != want {
			t.Errorf("%s:\ngot  %q\nwant %q", direction, got, want)
		}
	}
}
Beispiel #11
0
// TestReader checks, for every entry of sniffTestCases, that NewReader (given
// the entry's declared content type) decodes the testdata file to the same
// bytes as decoding it directly with the encoding that Lookup returns for
// tc.want.
func TestReader(t *testing.T) {
	switch runtime.GOOS {
	case "nacl": // platforms that don't permit direct file system access
		t.Skipf("not supported on %q", runtime.GOOS)
	}

	for _, tc := range sniffTestCases {
		content, err := ioutil.ReadFile("testdata/" + tc.filename)
		if err != nil {
			t.Errorf("%s: error reading file: %v", tc.filename, err)
			continue
		}

		r, err := NewReader(bytes.NewReader(content), tc.declared)
		if err != nil {
			t.Errorf("%s: error creating reader: %v", tc.filename, err)
			continue
		}

		got, err := ioutil.ReadAll(r)
		if err != nil {
			t.Errorf("%s: error reading from charset.NewReader: %v", tc.filename, err)
			continue
		}

		// Lookup may yield nil; report that as a failure of this case rather
		// than letting e.NewDecoder() panic and abort the whole test.
		e, _ := Lookup(tc.want)
		if e == nil {
			t.Errorf("%s: no encoding found for %q", tc.filename, tc.want)
			continue
		}
		want, err := ioutil.ReadAll(transform.NewReader(bytes.NewReader(content), e.NewDecoder()))
		if err != nil {
			t.Errorf("%s: error decoding with hard-coded charset name: %v", tc.filename, err)
			continue
		}

		if !bytes.Equal(got, want) {
			t.Errorf("%s: got %q, want %q", tc.filename, got, want)
			continue
		}
	}
}
Beispiel #12
0
// transformString runs s through t and returns the transformed text together
// with any error encountered while reading it.
func transformString(t transform.Transformer, s string) (string, error) {
	out, err := ioutil.ReadAll(transform.NewReader(strings.NewReader(s), t))
	return string(out), err
}
Beispiel #13
0
func ExampleDecodeWindows1252() {
	sr := strings.NewReader("Gar\xe7on !")
	tr := transform.NewReader(sr, charmap.Windows1252.NewDecoder())
	io.Copy(os.Stdout, tr)
	// Output: Garçon !
}