// NewReader returns a reader which decode from the given encoding, to utf8. // // If enc is nil, then only an utf8-enforcing replacement reader // (see http://godoc.org/code.google.com/p/go.text/encoding#pkg-variables) // is used. func NewReader(r io.Reader, enc encoding.Encoding) io.Reader { if enc == nil || enc == encoding.Replacement { return transform.NewReader(r, encoding.Replacement.NewEncoder()) } return transform.NewReader(r, transform.Chain(enc.NewDecoder(), encoding.Replacement.NewEncoder())) }
// Encoding UTF-8 to Cp1251 and back func main() { var err error encoder := charmap.Windows1251.NewEncoder() decoder := charmap.Windows1251.NewDecoder() inUtf8 := "Ёжики пушистые 好 ἱερογλυφικὰ γράμματ" // inUtf8 := "Ёжики пушистые" sr := strings.NewReader(inUtf8) tr := transform.NewReader(sr, encoder) inCp1251, err := ioutil.ReadAll(tr) if err != nil { fmt.Println("Encoding error: ", err) } srBack := bytes.NewReader(inCp1251) trBack := transform.NewReader(srBack, decoder) outUtf8, err := ioutil.ReadAll(trBack) if err != nil { fmt.Println("Decoding error: ", err) } fmt.Println("Source UTF8:", inUtf8) fmt.Println("CP1251:", inCp1251, string(inCp1251)) fmt.Println("Result UTF8:", string(outUtf8)) fmt.Println(strings.Repeat("=", 80)) fmt.Println("Test https://github.com/fiam/gounidecode") fmt.Println("Original: ", inUtf8) fmt.Println("Translit: ", unidecode.Unidecode(inUtf8)) }
func newDecodeReader(r io.Reader, encoding int) io.Reader { switch encoding { case EUCJP: return transform.NewReader(r, japanese.EUCJP.NewDecoder()) case SHIFTJIS: return transform.NewReader(r, japanese.ShiftJIS.NewDecoder()) } return nil }
// DecodeTransfer decodes base64, quoted-printable or plain text. func decodeTransfer(r io.Reader, label string) io.Reader { switch strings.ToLower(label) { case "base64": return base64.NewDecoder(base64.StdEncoding, transform.NewReader(r, nonASCIITransformer{})) case "quoted-printable": return quotedprintable.NewReader(transform.NewReader(r, transform.Chain(nonASCIITransformer{}, newlineAppendTransformer{}))) case "", "7bit", "8bit", "binary": return r default: return failReader{fmt.Errorf("unsupported transfer encoding: %v", label)} } }
func New(name string) *Type2 { var typeable bool = true fi, err := os.Stat(name) if err != nil && os.IsNotExist(err) { typeable = false } if err == nil && fi.IsDir() { typeable = false } if !typeable { return &Type2{ Name: name, Typeable: false, } } file, err := os.Open(name) if err != nil { typeable = false } buf, err := ioutil.ReadAll(file) if err != nil { typeable = false } _, err = file.Seek(0, 0) if err != nil { typeable = false } if !typeable { return &Type2{ Name: name, Typeable: false, } } var reader io.Reader e := guess_jp(buf) switch e { case "Shift_JIS": reader = transform.NewReader(file, japanese.ShiftJIS.NewDecoder()) case "EUC-JP": reader = transform.NewReader(file, japanese.EUCJP.NewDecoder()) default: reader = file } return &Type2{ Name: name, Typeable: typeable, File: reader, } }
// Reader returns a new UTF-8 io.Reader for the response body. func (res *Response) Reader() (io.Reader, error) { enc, err := res.Encoding() if err != nil { return nil, err } return transform.NewReader(bytes.NewReader(res.Body), enc.NewDecoder()), nil }
// Shift-JIS -> UTF-8 func to_utf8(str string) (string, error) { body, err := ioutil.ReadAll(transform.NewReader(strings.NewReader(str), japanese.ShiftJIS.NewEncoder())) if err != nil { return "", err } var f []byte encodings := []string{"sjis", "utf-8"} for _, enc := range encodings { if enc != "" { ee, _ := charset.Lookup(enc) if ee == nil { continue } var buf bytes.Buffer ic := transform.NewWriter(&buf, ee.NewDecoder()) _, err := ic.Write(body) if err != nil { continue } err = ic.Close() if err != nil { continue } f = buf.Bytes() break } } return string(f), nil }
func decode_utf8(fixedHtml string) string { e := charmap.ISO8859_15 reader := strings.NewReader(fixedHtml) rInUTF8 := transform.NewReader(reader, e.NewDecoder()) return reader_to_str(rInUTF8) }
func decode(charset string, input io.Reader) (io.Reader, error) { if charset != "cp1251" { return nil, fmt.Errorf("unsupported charset") } return transform.NewReader(input, charmap.Windows1251.NewDecoder()), nil }
// TestNonRepertoire tests that codes outside of an Encoding's repertoire are // converted: // - to the Unicode replacement character '\ufffd' when decoding to UTF-8, // - to the ASCII substitute character '\x1a' when encoding from UTF-8. func TestNonRepertoire(t *testing.T) { testCases := []struct { e encoding.Encoding dSrc, eSrc string }{ {charmap.Windows1252, "\x81", "갂"}, {japanese.EUCJP, "\xfe\xfc", "갂"}, {japanese.ISO2022JP, "\x1b$B\x7e\x7e", "갂"}, {japanese.ShiftJIS, "\xef\xfc", "갂"}, {korean.EUCKR, "\xfe\xfe", "א"}, {simplifiedchinese.GBK, "\xfe\xfe", "갂"}, {simplifiedchinese.HZGB2312, "~{z~", "갂"}, {traditionalchinese.Big5, "\x81\x40", "갂"}, } for _, tc := range testCases { for _, direction := range []string{"Decode", "Encode"} { enc, want, src := (transform.Transformer)(nil), "", "" if direction == "Decode" { enc, want, src = tc.e.NewDecoder(), "\ufffd", tc.dSrc } else { enc, want, src = tc.e.NewEncoder(), "\x1a", tc.eSrc } dst, err := ioutil.ReadAll(transform.NewReader(strings.NewReader(src), enc)) if err != nil { t.Errorf("%s %v: %v", direction, tc.e, err) continue } if got := string(dst); got != want { t.Errorf("%s %v:\ngot %q\nwant %q", direction, tc.e, got, want) continue } } } }
func getReader(file *os.File) io.Reader { encoding := getEncoding() if encoding == nil { return file } return transform.NewReader(file, encoding.NewDecoder()) }
// EucToUtf8 convert euc encoded string to utf-8 encoded string func EucToUtf8(data string) (string, error) { in := bytes.NewBufferString(data) out := new(bytes.Buffer) reader := transform.NewReader(in, japanese.EUCJP.NewDecoder()) _, e := io.Copy(out, reader) return out.String(), e }
func TestReader(t *testing.T) { for _, tc := range sniffTestCases { content, err := ioutil.ReadFile("testdata/" + tc.filename) if err != nil { t.Errorf("%s: error reading file: %v", tc.filename, err) continue } r, err := NewReader(bytes.NewReader(content), tc.declared) if err != nil { t.Errorf("%s: error creating reader: %v", tc.filename, err) continue } got, err := ioutil.ReadAll(r) if err != nil { t.Errorf("%s: error reading from charset.NewReader: %v", tc.filename, err) continue } e, _ := Lookup(tc.want) want, err := ioutil.ReadAll(transform.NewReader(bytes.NewReader(content), e.NewDecoder())) if err != nil { t.Errorf("%s: error decoding with hard-coded charset name: %v", tc.filename, err) continue } if !bytes.Equal(got, want) { t.Errorf("%s: got %q, want %q", tc.filename, got, want) continue } } }
func (s *SASLSuite) TestScramNormalizesPassword(c *C) { // From: libidn-1.9/tests/tst_stringprep.c // See RFC 4013, section 3 testCases := []struct { raw string normalized string }{ {"I\xC2\xADX", "IX"}, {"user", "user"}, {"USER", "USER"}, {"\xC2\xAA", "a"}, {"x\xC2\xADy", "xy"}, {"\xE2\x85\xA3", "IV"}, {"\xE2\x85\xA8", "IX"}, //They should error because they have forbidden chars //{"\x07", ""}, //should error //{"\xD8\xA71", ""}, //shold error } for _, test := range testCases { t := transform.NewReader(strings.NewReader(test.raw), Stringprep) r := bufio.NewReader(t) normalized, _, err := r.ReadLine() c.Check(err, IsNil) c.Check(normalized, DeepEquals, []byte(test.normalized)) } }
func getReader(reader io.Reader) io.Reader { encoding := getEncoding() if encoding == nil { return reader } return transform.NewReader(reader, encoding.NewDecoder()) }
func getGbkDoc(client *http.Client, url string) (*goquery.Document, error) { retry := 3 get: resp, err := client.Get(url) if err != nil { if retry > 0 { retry-- goto get } else { return nil, me(err, "get") } } defer resp.Body.Close() r := transform.NewReader(resp.Body, simplifiedchinese.GBK.NewDecoder()) doc, err := goquery.NewDocumentFromReader(r) if err != nil { if retry > 0 { retry-- goto get } else { return nil, me(err, "new document from response") } } return doc, nil }
func (s *SASLSuite) TestScramNormalizesPassword(c *C) { // From: libidn-1.9/tests/tst_stringprep.c // See RFC 4013, section 3 testCases := []struct { raw string normalized string }{ {"I\xC2\xADX", "IX"}, {"user", "user"}, {"USER", "USER"}, {"user\u200B", "user "}, {"user\u2002", "user "}, {"\xC2\xAA", "a"}, {"x\xC2\xADy", "xy"}, {"\xE2\x85\xA3", "IV"}, {"\xE2\x85\xA8", "IX"}, {"\u034F\u1806\u180Bb\u180C\u180Dy\u200Ct\u200D\u2060\uFE00e\uFE01\uFE02\uFE03\uFE04\uFE05\uFE06\uFE07\uFE08\uFE09\uFE0A\uFE0B\uFE0C\uFE0D\uFE0E\uFE0F\uFEFF", "byte"}, //They should error because they have forbidden chars //{"\x07", ""}, //should error //{"\xD8\xA71", ""}, //shold error } for _, test := range testCases { t := transform.NewReader(strings.NewReader(test.raw), Stringprep) r := bufio.NewReader(t) normalized, _, err := r.ReadLine() c.Check(err, IsNil) c.Check(string(normalized), DeepEquals, test.normalized) } }
// NewReaderLabel returns a reader that converts from the specified charset to // UTF-8. It uses Lookup to find the encoding that corresponds to label, and // returns an error if Lookup returns nil. It is suitable for use as // encoding/xml.Decoder's CharsetReader function. func NewReaderLabel(label string, input io.Reader) (io.Reader, error) { e, _ := Lookup(label) if e == nil { return nil, fmt.Errorf("unsupported charset: %q", label) } return transform.NewReader(input, e.NewDecoder()), nil }
func main() { t := transform.Chain(norm.NFD, transform.RemoveFunc(isMn), norm.NFC) r := transform.NewReader(os.Stdin, t) if _, err := io.Copy(os.Stdout, r); err != nil { log.Fatal(err) } }
func defaultCharsetReader(cs string, input io.Reader) (io.Reader, error) { e, _ := charset.Lookup(cs) if e == nil { return nil, fmt.Errorf("cannot decode charset %v", cs) } return transform.NewReader(input, e.NewDecoder()), nil }
func ParseFeed(c appengine.Context, contentType, origUrl, fetchUrl string, body []byte) (*Feed, []*Story, error) { cr := defaultCharsetReader if !bytes.EqualFold(body[:len(xml.Header)], []byte(xml.Header)) { enc, err := encodingReader(body, contentType) if err != nil { return nil, nil, err } if enc != encoding.Nop { cr = nilCharsetReader body, err = ioutil.ReadAll(transform.NewReader(bytes.NewReader(body), enc.NewDecoder())) if err != nil { return nil, nil, err } } } var feed *Feed var stories []*Story var atomerr, rsserr, rdferr error feed, stories, atomerr = parseAtom(c, body, cr) if feed == nil { feed, stories, rsserr = parseRSS(c, body, cr) } if feed == nil { feed, stories, rdferr = parseRDF(c, body, cr) } if feed == nil { c.Warningf("atom parse error: %s", atomerr.Error()) c.Warningf("xml parse error: %s", rsserr.Error()) c.Warningf("rdf parse error: %s", rdferr.Error()) return nil, nil, fmt.Errorf("Could not parse feed data") } feed.Url = origUrl return parseFix(c, feed, stories, fetchUrl) }
//Read read csv for handle func ReadLines(file string, isGbk bool) (lines [][]string, err error) { //catch panic defer func() { if rerr := recover(); rerr != nil { err = errors.New(fmt.Sprintf("read csv file: %v, error: %v", file, rerr)) } }() //open file fi, err := os.Open(file) if err != nil { return nil, err } defer fi.Close() //get reader var reader *csv.Reader if !isGbk { reader = csv.NewReader(fi) } else { //transform gbk to utf8 r := transform.NewReader(fi, simplifiedchinese.GBK.NewDecoder()) reader = csv.NewReader(r) } lines, err = reader.ReadAll() return }
// NewReaderByName returns a reader that converts from the specified charset to // UTF-8. It returns an error if the charset is not one of the standard // encodings for HTML. It is suitable for use as encoding/xml.Decoder's // CharsetReader function. func NewReaderByName(charset string, input io.Reader) (io.Reader, error) { e, _ := Lookup(charset) if e == nil { return nil, fmt.Errorf("unsupported charset: %q", charset) } return transform.NewReader(input, e.NewDecoder()), nil }
func Decode(src string) (dst string) { data, err := ioutil.ReadAll(transform.NewReader(bytes.NewReader([]byte(src)), simplifiedchinese.GBK.NewEncoder())) if err == nil { dst = string(data) } return }
func NewUTF8Reader(label string, r io.Reader) (io.Reader, error) { e, _ := charset.Lookup(label) if e == nil { return nil, fmt.Errorf("unsupported charset: %q", label) } return transform.NewReader(r, unicode.BOMOverride(e.NewDecoder())), nil }
func charsetReader(charset string, input io.Reader) (io.Reader, error) { // Windows-1252 is a superset of ISO-8859-1. if strings.ToLower(charset) == "iso-8859-1" { return transform.NewReader(input, charmap.Windows1252.NewDecoder()), nil } return nil, fmt.Errorf("unsupported charset: %q", charset) }
func decodeRowsHtmlIsracard(r io.Reader) (rows []Row, err error) { log.Print("isracard") rInUTF8 := transform.NewReader(r, charmap.ISO8859_8I.NewDecoder()) doc, err := goquery.NewDocumentFromReader(rInUTF8) if err != nil { return nil, err } done := false doc.Find("tr").Each(func(i int, s *goquery.Selection) { if i < 3 || i == 4 { return } isHeader := (len(rows) == 0) var row Row s.Find("td").Each(func(i int, s *goquery.Selection) { value := strings.TrimSpace(s.Text()) if isHeader { if value == "" { value = fmt.Sprintf("field_%v", i) } } // log.Printf("Field: %v [%v]", value, i) row = append(row, value) }) if len(row) < 2 { if !done { log.Println("Heuristic: wrong file format, giving up") rows = nil done = true } return } if row[0] == "" { row[0] = "1970-01-01" } if row[1] == "@סך חיוב בש\"ח:" { done = true } row[1] = strings.TrimPrefix(row[1], "\u200f") if !done { log.Printf("%#v", row) rows = append(rows, row) } }) return rows, nil }
func bodyToUTF8(body []byte, contentType string) (*transform.Reader, error) { enc, _, _ := charset.DetermineEncoding(body, contentType) if enc == encoding.Nop { return nil, werrors.New(ErrEncodingNotFound) } return transform.NewReader(bytes.NewReader(body), enc.NewDecoder()), nil }
func (s *KittychanInfoSource) ScrapeFromReader(reader io.Reader) (*feeds.Feed, error) { decodedReader := transform.NewReader(reader, japanese.ShiftJIS.NewDecoder()) doc, err := goquery.NewDocumentFromReader(decodedReader) if err != nil { return nil, err } return s.ScrapeFromDocument(doc) }
func Utf8ToGbk(s []byte) ([]byte, error) { reader := transform.NewReader(bytes.NewReader(s), simplifiedchinese.GBK.NewEncoder()) d, e := ioutil.ReadAll(reader) if e != nil { return nil, e } return d, nil }