Exemple #1
0
// NewReader wraps r in a reader that decodes from enc and yields UTF-8.
//
// When enc is nil or is encoding.Replacement itself, no decoder is applied:
// the input only passes through the UTF-8 enforcing replacement encoder
// (see http://godoc.org/code.google.com/p/go.text/encoding#pkg-variables).
// For any other encoding, the decoder of enc is chained in front of that same
// replacement encoder.
func NewReader(r io.Reader, enc encoding.Encoding) io.Reader {
	if enc != nil && enc != encoding.Replacement {
		chain := transform.Chain(enc.NewDecoder(), encoding.Replacement.NewEncoder())
		return transform.NewReader(r, chain)
	}
	return transform.NewReader(r, encoding.Replacement.NewEncoder())
}
Exemple #2
0
// main demonstrates a UTF-8 -> Windows-1251 -> UTF-8 round trip and prints
// every stage, followed by a transliteration of the same text produced by
// unidecode.Unidecode. Conversion errors are printed and do not stop the run.
func main() {
	cp1251 := charmap.Windows1251

	inUtf8 := "Ёжики пушистые 好 ἱερογλυφικὰ γράμματ"
	// inUtf8 := "Ёжики пушистые"

	// Forward direction: UTF-8 text to Windows-1251 bytes.
	inCp1251, err := ioutil.ReadAll(transform.NewReader(strings.NewReader(inUtf8), cp1251.NewEncoder()))
	if err != nil {
		fmt.Println("Encoding error: ", err)
	}

	// Backward direction: Windows-1251 bytes to UTF-8 text.
	outUtf8, err := ioutil.ReadAll(transform.NewReader(bytes.NewReader(inCp1251), cp1251.NewDecoder()))
	if err != nil {
		fmt.Println("Decoding error: ", err)
	}

	fmt.Println("Source UTF8:", inUtf8)
	fmt.Println("CP1251:", inCp1251, string(inCp1251))
	fmt.Println("Result UTF8:", string(outUtf8))

	separator := strings.Repeat("=", 80)
	fmt.Println(separator)
	fmt.Println("Test https://github.com/fiam/gounidecode")
	fmt.Println("Original: ", inUtf8)
	fmt.Println("Translit: ", unidecode.Unidecode(inUtf8))
}
// newDecodeReader wraps r with the decoder selected by encoding: EUCJP picks
// the EUC-JP decoder and SHIFTJIS picks the Shift JIS decoder. Any other
// value yields a nil reader, so callers must check the result before use.
func newDecodeReader(r io.Reader, encoding int) io.Reader {
	if encoding == EUCJP {
		return transform.NewReader(r, japanese.EUCJP.NewDecoder())
	}
	if encoding == SHIFTJIS {
		return transform.NewReader(r, japanese.ShiftJIS.NewDecoder())
	}
	return nil
}
Exemple #4
0
// decodeTransfer returns a reader that undoes the transfer encoding named by
// label. The label is lower-cased before matching.
//
//   - "", "7bit", "8bit" and "binary" return r unchanged.
//   - "base64" filters r through nonASCIITransformer and base64-decodes it.
//   - "quoted-printable" filters r through nonASCIITransformer chained with
//     newlineAppendTransformer and decodes the result as quoted-printable.
//   - Any other label returns a failReader carrying an "unsupported transfer
//     encoding" error that names the original label.
func decodeTransfer(r io.Reader, label string) io.Reader {
	switch name := strings.ToLower(label); name {
	case "", "7bit", "8bit", "binary":
		return r
	case "quoted-printable":
		filtered := transform.NewReader(r, transform.Chain(nonASCIITransformer{}, newlineAppendTransformer{}))
		return quotedprintable.NewReader(filtered)
	case "base64":
		filtered := transform.NewReader(r, nonASCIITransformer{})
		return base64.NewDecoder(base64.StdEncoding, filtered)
	}
	return failReader{fmt.Errorf("unsupported transfer encoding: %v", label)}
}
Exemple #5
0
// New inspects the file called name and returns a Type2 describing it.
//
// The result has Typeable set to false and no File when name does not exist,
// is a directory, or cannot be opened, read in full, or rewound to its start.
// Otherwise the whole content is handed to guess_jp and File is a reader over
// the opened file: a Shift JIS or EUC-JP decoding reader when guess_jp answers
// "Shift_JIS" or "EUC-JP", and the raw file for any other answer.
//
// On success the file is intentionally left open, because the returned reader
// streams from it. On every failure after a successful open the file is
// closed before returning so the descriptor is not leaked.
func New(name string) *Type2 {
	notTypeable := func() *Type2 {
		return &Type2{
			Name:     name,
			Typeable: false,
		}
	}

	fi, err := os.Stat(name)
	if err != nil && os.IsNotExist(err) {
		return notTypeable()
	}
	if err == nil && fi.IsDir() {
		return notTypeable()
	}
	// Any other Stat error falls through on purpose: os.Open below decides
	// whether the file is usable.

	file, err := os.Open(name)
	if err != nil {
		return notTypeable()
	}

	// Read everything for charset sniffing, then rewind so the returned
	// reader starts at the beginning of the file again.
	buf, err := ioutil.ReadAll(file)
	if err == nil {
		_, err = file.Seek(0, 0)
	}
	if err != nil {
		// Best effort: the read/seek failure is what matters to the caller.
		_ = file.Close()
		return notTypeable()
	}

	var reader io.Reader
	switch guess_jp(buf) {
	case "Shift_JIS":
		reader = transform.NewReader(file, japanese.ShiftJIS.NewDecoder())
	case "EUC-JP":
		reader = transform.NewReader(file, japanese.EUCJP.NewDecoder())
	default:
		reader = file
	}
	return &Type2{
		Name:     name,
		Typeable: true,
		File:     reader,
	}
}
Exemple #6
0
// Reader builds an io.Reader over res.Body that is decoded with the encoding
// reported by res.Encoding. An error from res.Encoding is returned unchanged
// together with a nil reader.
func (res *Response) Reader() (io.Reader, error) {
	enc, err := res.Encoding()
	if err != nil {
		return nil, err
	}
	raw := bytes.NewReader(res.Body)
	decoded := transform.NewReader(raw, enc.NewDecoder())
	return decoded, nil
}
// to_utf8 passes str through the Shift JIS encoder and then decodes the
// resulting bytes again.
//
// An error from the encoding step is returned. The encoded bytes are then
// decoded with the first of the labels "sjis" and "utf-8" that charset.Lookup
// resolves and whose decoding writer accepts the bytes and closes without
// error. When no candidate succeeds the result is "" with a nil error.
//
// NOTE(review): the previous header read "Shift-JIS -> UTF-8", yet the first
// step uses NewEncoder rather than NewDecoder; confirm which direction the
// callers actually expect.
func to_utf8(str string) (string, error) {
	sjis := transform.NewReader(strings.NewReader(str), japanese.ShiftJIS.NewEncoder())
	body, err := ioutil.ReadAll(sjis)
	if err != nil {
		return "", err
	}

	var decoded []byte
	for _, label := range []string{"sjis", "utf-8"} {
		// The second result of Lookup is the canonical name, not an error.
		candidate, _ := charset.Lookup(label)
		if candidate == nil {
			continue
		}
		var sink bytes.Buffer
		w := transform.NewWriter(&sink, candidate.NewDecoder())
		if _, werr := w.Write(body); werr != nil {
			continue
		}
		if cerr := w.Close(); cerr != nil {
			continue
		}
		decoded = sink.Bytes()
		break
	}
	return string(decoded), nil
}
Exemple #8
0
// decode_utf8 treats fixedHtml as ISO 8859-15 text, wraps it in a decoding
// reader and returns whatever reader_to_str produces from that reader.
func decode_utf8(fixedHtml string) string {
	src := strings.NewReader(fixedHtml)
	decoder := charmap.ISO8859_15.NewDecoder()
	return reader_to_str(transform.NewReader(src, decoder))
}
Exemple #9
0
// decode returns a reader that decodes input with the Windows-1251 decoder.
// The only accepted charset is the exact string "cp1251"; any other value
// yields a nil reader and an "unsupported charset" error.
func decode(charset string, input io.Reader) (io.Reader, error) {
	switch charset {
	case "cp1251":
		return transform.NewReader(input, charmap.Windows1251.NewDecoder()), nil
	default:
		return nil, fmt.Errorf("unsupported charset")
	}
}
Exemple #10
0
// TestNonRepertoire checks how each listed Encoding handles input that lies
// outside of its repertoire:
//   - decoding to UTF-8 must yield the Unicode replacement character '\ufffd',
//   - encoding from UTF-8 must yield the ASCII substitute character '\x1a'.
func TestNonRepertoire(t *testing.T) {
	testCases := []struct {
		e          encoding.Encoding
		dSrc, eSrc string
	}{
		{charmap.Windows1252, "\x81", "갂"},
		{japanese.EUCJP, "\xfe\xfc", "갂"},
		{japanese.ISO2022JP, "\x1b$B\x7e\x7e", "갂"},
		{japanese.ShiftJIS, "\xef\xfc", "갂"},
		{korean.EUCKR, "\xfe\xfe", "א"},
		{simplifiedchinese.GBK, "\xfe\xfe", "갂"},
		{simplifiedchinese.HZGB2312, "~{z~", "갂"},
		{traditionalchinese.Big5, "\x81\x40", "갂"},
	}
	for _, tc := range testCases {
		for _, direction := range []string{"Decode", "Encode"} {
			var (
				tr        transform.Transformer
				want, src string
			)
			switch direction {
			case "Decode":
				tr, want, src = tc.e.NewDecoder(), "\ufffd", tc.dSrc
			default:
				tr, want, src = tc.e.NewEncoder(), "\x1a", tc.eSrc
			}

			out, err := ioutil.ReadAll(transform.NewReader(strings.NewReader(src), tr))
			if err != nil {
				t.Errorf("%s %v: %v", direction, tc.e, err)
				continue
			}
			if got := string(out); got != want {
				t.Errorf("%s %v:\ngot  %q\nwant %q", direction, tc.e, got, want)
			}
		}
	}
}
Exemple #11
0
// getReader returns file wrapped in the decoder of the encoding reported by
// getEncoding, or file itself when getEncoding reports nil.
func getReader(file *os.File) io.Reader {
	if enc := getEncoding(); enc != nil {
		return transform.NewReader(file, enc.NewDecoder())
	}
	return file
}
Exemple #12
0
// EucToUtf8 decodes the EUC-JP encoded string data and returns the decoded
// text. If copying through the decoder fails, the text decoded so far is
// returned together with the error.
func EucToUtf8(data string) (string, error) {
	var out bytes.Buffer
	src := transform.NewReader(bytes.NewBufferString(data), japanese.EUCJP.NewDecoder())
	_, err := io.Copy(&out, src)
	return out.String(), err
}
Exemple #13
0
// TestReader checks, for every entry of sniffTestCases, that NewReader given
// the declared content type decodes the test file to the same bytes as
// decoding it directly with the charset named in the table (tc.want).
func TestReader(t *testing.T) {
	for _, tc := range sniffTestCases {
		content, err := ioutil.ReadFile("testdata/" + tc.filename)
		if err != nil {
			t.Errorf("%s: error reading file: %v", tc.filename, err)
			continue
		}

		r, err := NewReader(bytes.NewReader(content), tc.declared)
		if err != nil {
			t.Errorf("%s: error creating reader: %v", tc.filename, err)
			continue
		}

		got, err := ioutil.ReadAll(r)
		if err != nil {
			t.Errorf("%s: error reading from charset.NewReader: %v", tc.filename, err)
			continue
		}

		// Lookup yields a nil encoding for labels it does not know; report
		// that as a test failure instead of panicking on e.NewDecoder().
		e, _ := Lookup(tc.want)
		if e == nil {
			t.Errorf("%s: unknown charset %v in test table", tc.filename, tc.want)
			continue
		}
		want, err := ioutil.ReadAll(transform.NewReader(bytes.NewReader(content), e.NewDecoder()))
		if err != nil {
			t.Errorf("%s: error decoding with hard-coded charset name: %v", tc.filename, err)
			continue
		}

		if !bytes.Equal(got, want) {
			t.Errorf("%s: got %q, want %q", tc.filename, got, want)
		}
	}
}
Exemple #14
0
// TestScramNormalizesPassword feeds each raw password through the Stringprep
// transformer and checks that the first line read back equals the expected
// normalized bytes.
func (s *SASLSuite) TestScramNormalizesPassword(c *C) {
	// From: libidn-1.9/tests/tst_stringprep.c
	// See RFC 4013, section 3
	cases := []struct {
		raw        string
		normalized string
	}{
		{"I\xC2\xADX", "IX"},
		{"user", "user"},
		{"USER", "USER"},
		{"\xC2\xAA", "a"},
		{"x\xC2\xADy", "xy"},
		{"\xE2\x85\xA3", "IV"},
		{"\xE2\x85\xA8", "IX"},
		// These should error because they contain forbidden characters:
		//{"\x07", ""},      // should error
		//{"\xD8\xA71", ""}, // should error
	}

	for _, tc := range cases {
		prepped := transform.NewReader(strings.NewReader(tc.raw), Stringprep)
		line, _, err := bufio.NewReader(prepped).ReadLine()

		c.Check(err, IsNil)
		c.Check(line, DeepEquals, []byte(tc.normalized))
	}
}
Exemple #15
0
// getReader returns reader wrapped in the decoder of the encoding reported by
// getEncoding, or reader itself when getEncoding reports nil.
func getReader(reader io.Reader) io.Reader {
	if enc := getEncoding(); enc != nil {
		return transform.NewReader(reader, enc.NewDecoder())
	}
	return reader
}
Exemple #16
0
// getGbkDoc fetches url with client, decodes the response body from GBK and
// parses it into a goquery document.
//
// A failed request or a failed parse is retried; both kinds of failure draw
// from one shared budget of three retries (four attempts at most). Once the
// budget is used up the last error is returned, wrapped by me with "get" for
// request failures or "new document from response" for parse failures.
//
// Each response body is closed as soon as its attempt is over, so retries do
// not keep earlier bodies open until the function returns.
func getGbkDoc(client *http.Client, url string) (*goquery.Document, error) {
	retry := 3
	for {
		resp, err := client.Get(url)
		if err != nil {
			if retry > 0 {
				retry--
				continue
			}
			return nil, me(err, "get")
		}

		r := transform.NewReader(resp.Body, simplifiedchinese.GBK.NewDecoder())
		doc, err := goquery.NewDocumentFromReader(r)
		// The document is fully parsed (or failed) at this point, so the body
		// is no longer needed whatever the outcome.
		resp.Body.Close()
		if err != nil {
			if retry > 0 {
				retry--
				continue
			}
			return nil, me(err, "new document from response")
		}
		return doc, nil
	}
}
Exemple #17
0
// TestScramNormalizesPassword feeds each raw password through the Stringprep
// transformer and checks that the first line read back, converted to a
// string, equals the expected normalized form.
func (s *SASLSuite) TestScramNormalizesPassword(c *C) {
	// From: libidn-1.9/tests/tst_stringprep.c
	// See RFC 4013, section 3
	cases := []struct {
		raw        string
		normalized string
	}{
		{"I\xC2\xADX", "IX"},
		{"user", "user"},
		{"USER", "USER"},
		{"user\u200B", "user "},
		{"user\u2002", "user "},
		{"\xC2\xAA", "a"},
		{"x\xC2\xADy", "xy"},
		{"\xE2\x85\xA3", "IV"},
		{"\xE2\x85\xA8", "IX"},
		{"\u034F\u1806\u180Bb\u180C\u180Dy\u200Ct\u200D\u2060\uFE00e\uFE01\uFE02\uFE03\uFE04\uFE05\uFE06\uFE07\uFE08\uFE09\uFE0A\uFE0B\uFE0C\uFE0D\uFE0E\uFE0F\uFEFF", "byte"},
		// These should error because they contain forbidden characters:
		//{"\x07", ""},      // should error
		//{"\xD8\xA71", ""}, // should error
	}

	for _, tc := range cases {
		prepped := transform.NewReader(strings.NewReader(tc.raw), Stringprep)
		line, _, err := bufio.NewReader(prepped).ReadLine()

		c.Check(err, IsNil)
		c.Check(string(line), DeepEquals, tc.normalized)
	}
}
Exemple #18
0
// NewReaderLabel wraps input in a reader that decodes from the charset named
// by label to UTF-8. The encoding is resolved with Lookup; when Lookup yields
// no encoding, an "unsupported charset" error is returned instead. The
// signature makes it suitable as encoding/xml.Decoder's CharsetReader.
func NewReaderLabel(label string, input io.Reader) (io.Reader, error) {
	if e, _ := Lookup(label); e != nil {
		return transform.NewReader(input, e.NewDecoder()), nil
	}
	return nil, fmt.Errorf("unsupported charset: %q", label)
}
Exemple #19
0
// main copies standard input to standard output through a transformer chain:
// decompose to NFD, drop every rune for which isMn reports true, recompose to
// NFC. A copy error terminates the program via log.Fatal.
func main() {
	stripper := transform.Chain(norm.NFD, transform.RemoveFunc(isMn), norm.NFC)
	_, err := io.Copy(os.Stdout, transform.NewReader(os.Stdin, stripper))
	if err != nil {
		log.Fatal(err)
	}
}
Exemple #20
0
// defaultCharsetReader wraps input in a reader that decodes from the charset
// named cs, resolved through charset.Lookup. A charset that Lookup does not
// resolve produces a "cannot decode charset" error.
func defaultCharsetReader(cs string, input io.Reader) (io.Reader, error) {
	enc, _ := charset.Lookup(cs)
	switch enc {
	case nil:
		return nil, fmt.Errorf("cannot decode charset %v", cs)
	default:
		return transform.NewReader(input, enc.NewDecoder()), nil
	}
}
Exemple #21
0
// ParseFeed parses body as a feed and returns the feed with its stories.
//
// When body does not start with the standard XML header (compared
// case-insensitively), its encoding is determined by encodingReader from the
// body and contentType; unless that encoding is encoding.Nop, body is decoded
// up front and the parsers are handed nilCharsetReader instead of
// defaultCharsetReader. A body shorter than the XML header is treated as not
// having the header.
//
// The body is tried as Atom, then RSS, then RDF; the first parser that
// returns a feed wins. If none does, each parser's error is logged as a
// warning on c and a generic error is returned. On success the feed's Url is
// set to origUrl and the result is passed through parseFix with fetchUrl.
func ParseFeed(c appengine.Context, contentType, origUrl, fetchUrl string, body []byte) (*Feed, []*Story, error) {
	cr := defaultCharsetReader
	// Guard the slice expression: slicing a body shorter than the header
	// would panic.
	hasXMLHeader := len(body) >= len(xml.Header) &&
		bytes.EqualFold(body[:len(xml.Header)], []byte(xml.Header))
	if !hasXMLHeader {
		enc, err := encodingReader(body, contentType)
		if err != nil {
			return nil, nil, err
		}
		if enc != encoding.Nop {
			cr = nilCharsetReader
			body, err = ioutil.ReadAll(transform.NewReader(bytes.NewReader(body), enc.NewDecoder()))
			if err != nil {
				return nil, nil, err
			}
		}
	}
	var feed *Feed
	var stories []*Story
	var atomerr, rsserr, rdferr error
	feed, stories, atomerr = parseAtom(c, body, cr)
	if feed == nil {
		feed, stories, rsserr = parseRSS(c, body, cr)
	}
	if feed == nil {
		feed, stories, rdferr = parseRDF(c, body, cr)
	}
	if feed == nil {
		// Format the errors with %v rather than calling Error(): a parser may
		// return no feed together with a nil error, which would panic.
		c.Warningf("atom parse error: %v", atomerr)
		c.Warningf("xml parse error: %v", rsserr)
		c.Warningf("rdf parse error: %v", rdferr)
		return nil, nil, fmt.Errorf("Could not parse feed data")
	}
	feed.Url = origUrl
	return parseFix(c, feed, stories, fetchUrl)
}
Exemple #22
0
// ReadLines opens the CSV file at path file and returns all of its records.
// When isGbk is true the content is decoded from GBK before it is parsed.
//
// A failure to open the file or to parse the CSV is returned as err. A panic
// raised while reading is recovered and reported through err as well, with a
// message naming the file.
func ReadLines(file string, isGbk bool) (lines [][]string, err error) {
	// Turn a panic into an ordinary error for the caller.
	defer func() {
		if p := recover(); p != nil {
			err = errors.New(fmt.Sprintf("read csv file: %v, error: %v", file, p))
		}
	}()

	fi, err := os.Open(file)
	if err != nil {
		return nil, err
	}
	defer fi.Close()

	if isGbk {
		// Decode GBK to UTF-8 on the fly before CSV parsing.
		utf8Src := transform.NewReader(fi, simplifiedchinese.GBK.NewDecoder())
		return csv.NewReader(utf8Src).ReadAll()
	}
	return csv.NewReader(fi).ReadAll()
}
Exemple #23
0
// NewReaderByName wraps input in a reader that decodes from the named charset
// to UTF-8. The name is resolved with Lookup, which covers the standard
// encodings for HTML; a name it does not resolve yields an "unsupported
// charset" error. The signature makes it suitable as encoding/xml.Decoder's
// CharsetReader.
func NewReaderByName(charset string, input io.Reader) (io.Reader, error) {
	enc, _ := Lookup(charset)
	if enc != nil {
		decoded := transform.NewReader(input, enc.NewDecoder())
		return decoded, nil
	}
	return nil, fmt.Errorf("unsupported charset: %q", charset)
}
Exemple #24
0
// Decode runs src through the GBK encoder and returns the resulting bytes as
// a string, or "" when the transformation fails.
//
// NOTE(review): despite its name this function uses NewEncoder, not
// NewDecoder; confirm which direction the callers expect.
func Decode(src string) (dst string) {
	gbk := transform.NewReader(bytes.NewReader([]byte(src)), simplifiedchinese.GBK.NewEncoder())
	data, err := ioutil.ReadAll(gbk)
	if err != nil {
		return ""
	}
	return string(data)
}
Exemple #25
0
// NewUTF8Reader wraps r in a reader that decodes from the charset named by
// label, resolved through charset.Lookup. The decoder is wrapped in
// unicode.BOMOverride before use. A label that Lookup does not resolve yields
// an "unsupported charset" error.
func NewUTF8Reader(label string, r io.Reader) (io.Reader, error) {
	enc, _ := charset.Lookup(label)
	if enc == nil {
		return nil, fmt.Errorf("unsupported charset: %q", label)
	}
	decoder := unicode.BOMOverride(enc.NewDecoder())
	return transform.NewReader(r, decoder), nil
}
Exemple #26
0
// charsetReader wraps input in a Windows-1252 decoding reader when charset,
// lower-cased, is "iso-8859-1". Every other charset yields an "unsupported
// charset" error.
func charsetReader(charset string, input io.Reader) (io.Reader, error) {
	if strings.ToLower(charset) != "iso-8859-1" {
		return nil, fmt.Errorf("unsupported charset: %q", charset)
	}
	// Windows-1252 is a superset of ISO-8859-1.
	return transform.NewReader(input, charmap.Windows1252.NewDecoder()), nil
}
Exemple #27
0
// decodeRowsHtmlIsracard extracts table rows from an HTML document read from
// r, which is decoded with the ISO 8859-8-I decoder before parsing.
//
// Every "tr" element except those at index 0, 1, 2 and 4 is turned into a Row
// holding the trimmed text of its "td" cells. The first row collected acts as
// the header: its empty cells are replaced by "field_<cell index>". Collection
// stops at the row whose second cell equals the literal matched below; that
// row is not included. If any considered row has fewer than two cells, the
// input is treated as the wrong format and nil rows are returned.
//
// An error is returned only when the document cannot be parsed.
func decodeRowsHtmlIsracard(r io.Reader) (rows []Row, err error) {
	log.Print("isracard")

	rInUTF8 := transform.NewReader(r, charmap.ISO8859_8I.NewDecoder())

	doc, err := goquery.NewDocumentFromReader(rInUTF8)
	if err != nil {
		return nil, err
	}

	// done is set once no further rows should be collected, either because
	// the terminating row was reached or because the format check failed.
	done := false

	doc.Find("tr").Each(func(i int, s *goquery.Selection) {
		// Skip the table rows at index 0-2 and 4.
		if i < 3 || i == 4 {
			return
		}

		// Nothing collected yet means this row is treated as the header.
		isHeader := (len(rows) == 0)

		var row Row
		s.Find("td").Each(func(i int, s *goquery.Selection) {
			value := strings.TrimSpace(s.Text())

			if isHeader {
				// Give unnamed header columns a positional placeholder name.
				if value == "" {
					value = fmt.Sprintf("field_%v", i)
				}
			}
			// log.Printf("Field: %v [%v]", value, i)
			row = append(row, value)
		})

		// The code below indexes row[0] and row[1]; a shorter row means the
		// file is not in the expected format, so discard everything.
		if len(row) < 2 {
			if !done {
				log.Println("Heuristic: wrong file format, giving up")
				rows = nil
				done = true
			}
			return
		}

		// Substitute a placeholder date when the first cell is empty.
		if row[0] == "" {
			row[0] = "1970-01-01"
		}

		// This row marks the end of the data; it is itself not collected
		// because done is set before the append below.
		if row[1] == "@סך חיוב בש\"ח:" {
			done = true
		}

		// Drop a leading right-to-left mark (U+200F) from the second cell.
		row[1] = strings.TrimPrefix(row[1], "\u200f")

		if !done {
			log.Printf("%#v", row)
			rows = append(rows, row)
		}
	})

	return rows, nil
}
Exemple #28
0
// bodyToUTF8 returns a reader over body decoded with the encoding that
// charset.DetermineEncoding picks from the body and contentType. When that
// encoding is encoding.Nop, a nil reader and an error built from
// ErrEncodingNotFound are returned.
func bodyToUTF8(body []byte, contentType string) (*transform.Reader, error) {
	detected, _, _ := charset.DetermineEncoding(body, contentType)
	if detected == encoding.Nop {
		return nil, werrors.New(ErrEncodingNotFound)
	}

	raw := bytes.NewReader(body)
	return transform.NewReader(raw, detected.NewDecoder()), nil
}
// ScrapeFromReader decodes reader from Shift JIS, parses the result into a
// goquery document and hands that document to ScrapeFromDocument. A parse
// failure is returned unchanged together with a nil feed.
func (s *KittychanInfoSource) ScrapeFromReader(reader io.Reader) (*feeds.Feed, error) {
	sjis := japanese.ShiftJIS.NewDecoder()
	doc, err := goquery.NewDocumentFromReader(transform.NewReader(reader, sjis))
	if err != nil {
		return nil, err
	}
	return s.ScrapeFromDocument(doc)
}
Exemple #30
0
func Utf8ToGbk(s []byte) ([]byte, error) {
	reader := transform.NewReader(bytes.NewReader(s), simplifiedchinese.GBK.NewEncoder())
	d, e := ioutil.ReadAll(reader)
	if e != nil {
		return nil, e
	}
	return d, nil
}