示例#1
0
文件: util.go 项目: fanyang01/crawler
// NewUTF8Reader wraps r with a decoder for the charset identified by label,
// so that reads from the returned reader yield the decoded text. The decoder
// is wrapped in unicode.BOMOverride, so a leading byte order mark can select
// a different Unicode encoding than the one label names.
//
// It returns an error when charset.Lookup yields no encoding for label.
func NewUTF8Reader(label string, r io.Reader) (io.Reader, error) {
	// Only the encoding is needed; the second result of Lookup is not an
	// error value and is intentionally discarded.
	enc, _ := charset.Lookup(label)
	if enc != nil {
		decoder := unicode.BOMOverride(enc.NewDecoder())
		return transform.NewReader(r, decoder), nil
	}
	return nil, fmt.Errorf("unsupported charset: %q", label)
}
示例#2
0
func TestBOMOverride(t *testing.T) {
	dec := unicode.BOMOverride(charmap.CodePage437.NewDecoder())
	dst := make([]byte, 100)
	for i, tc := range []struct {
		src   string
		atEOF bool
		dst   string
		nSrc  int
		err   error
	}{
		0:  {"H\x82ll\x93", true, "Héllô", 5, nil},
		1:  {"\uFEFFHéllö", true, "Héllö", 10, nil},
		2:  {"\xFE\xFF\x00H\x00e\x00l\x00l\x00o", true, "Hello", 12, nil},
		3:  {"\xFF\xFEH\x00e\x00l\x00l\x00o\x00", true, "Hello", 12, nil},
		4:  {"\uFEFF", true, "", 3, nil},
		5:  {"\xFE\xFF", true, "", 2, nil},
		6:  {"\xFF\xFE", true, "", 2, nil},
		7:  {"\xEF\xBB", true, "\u2229\u2557", 2, nil},
		8:  {"\xEF", true, "\u2229", 1, nil},
		9:  {"", true, "", 0, nil},
		10: {"\xFE", true, "\u25a0", 1, nil},
		11: {"\xFF", true, "\u00a0", 1, nil},
		12: {"\xEF\xBB", false, "", 0, transform.ErrShortSrc},
		13: {"\xEF", false, "", 0, transform.ErrShortSrc},
		14: {"", false, "", 0, transform.ErrShortSrc},
		15: {"\xFE", false, "", 0, transform.ErrShortSrc},
		16: {"\xFF", false, "", 0, transform.ErrShortSrc},
		17: {"\xFF\xFE", false, "", 0, transform.ErrShortSrc},
	} {
		dec.Reset()
		nDst, nSrc, err := dec.Transform(dst, []byte(tc.src), tc.atEOF)
		got := string(dst[:nDst])
		if nSrc != tc.nSrc {
			t.Errorf("%d: nSrc: got %d; want %d", i, nSrc, tc.nSrc)
		}
		if got != tc.dst {
			t.Errorf("%d: got %+q; want %+q", i, got, tc.dst)
		}
		if err != tc.err {
			t.Errorf("%d: error: got %v; want %v", i, err, tc.err)
		}
	}
}
示例#3
0
// Visit opens the file named by v.Path, or uses standard input when the path
// equals constSTDINstr, installs a BOM-aware decoding reader on the embedded
// StreamVisitor, and then delegates to StreamVisitor.Visit.
//
// A file opened here is closed before Visit returns. Standard input is never
// closed: it belongs to the process, and closing it would break any later
// read from stdin.
//
// It returns the error from os.Open, or otherwise whatever
// StreamVisitor.Visit returns.
func (v *FileVisitor) Visit(fn VisitorFunc) error {
	f := os.Stdin
	if v.Path != constSTDINstr {
		opened, err := os.Open(v.Path)
		if err != nil {
			return err
		}
		// Only close what this method opened.
		defer opened.Close()
		f = opened
	}

	// TODO: Consider adding a flag to force to UTF16, apparently some
	// Windows tools don't write the BOM
	//
	// Input is treated as UTF-8 unless a byte order mark says otherwise.
	decoder := unicode.BOMOverride(unicode.UTF8.NewDecoder())
	v.StreamVisitor.Reader = transform.NewReader(f, decoder)

	return v.StreamVisitor.Visit(fn)
}
示例#4
0
// NewReader wraps r so that Unicode input is decoded to UTF-8 as it is read.
//
// The hint d selects the encoding assumed when the stream carries no byte
// order mark: UTF8, UTF16LE or UTF16BE. Because the decoder is wrapped in
// unicode.BOMOverride, a byte order mark at the start of the stream takes
// precedence over the hint. A hint that is not one of the cases below is
// treated as UTF8 rather than leaving the decoder nil, which would panic on
// the first read.
func NewReader(r io.Reader, d EncodingHint) io.Reader {
	var decoder *encoding.Decoder
	switch d {
	case UTF16LE:
		// MS-Windows style UTF-16 little-endian. The BOM policy is IgnoreBOM
		// here because BOM handling is done by BOMOverride below.
		decoder = unicode.UTF16(unicode.LittleEndian, unicode.IgnoreBOM).NewDecoder()
	case UTF16BE:
		// UTF-16 big-endian, with BOM handling again left to BOMOverride.
		decoder = unicode.UTF16(unicode.BigEndian, unicode.IgnoreBOM).NewDecoder()
	default:
		// UTF8, and the safe fallback for any hint not handled above.
		decoder = unicode.UTF8.NewDecoder()
	}

	// The hinted decoder is only the fallback; a BOM found in the stream
	// overrides it.
	return transform.NewReader(r, unicode.BOMOverride(decoder))
}
示例#5
0
// newScannerUTF16or8 opens filename for reading, much like os.Open, but
// decodes the content from UTF-16 to UTF-8 when the file starts with a
// UTF-16 byte order mark (FE FF or FF FE). Files without such a mark are
// returned undecoded.
//
// It returns an error when the file cannot be opened, or when it cannot be
// rewound after the BOM check; in the latter case the file is closed first.
//
// NOTE(review): on success the opened file is handed to the caller and is
// not closed here; whether the caller is able to close it depends on the
// utfScanner type, which is declared elsewhere.
func newScannerUTF16or8(filename string) (utfScanner, error) {
	file, err := os.Open(filename)
	if err != nil {
		return nil, err
	}

	// Peek at the first two bytes. Any read error, such as a file shorter
	// than two bytes, simply means there is no BOM to honor.
	marker := make([]byte, 2)
	_, readErr := io.ReadFull(file, marker)

	// Rewind so the returned reader starts at the beginning of the file.
	// Carrying on after a failed seek would silently drop the bytes
	// consumed by the peek, so the failure is reported instead.
	if _, err := file.Seek(0, io.SeekStart); err != nil {
		file.Close()
		return nil, err
	}

	bigEndianBOM := marker[0] == 0xFE && marker[1] == 0xFF
	littleEndianBOM := marker[0] == 0xFF && marker[1] == 0xFE
	if readErr != nil || !(bigEndianBOM || littleEndianBOM) {
		return file, nil
	}

	// Big-endian is only the default; with UseBOM and BOMOverride the byte
	// order mark in the file decides the actual byte order.
	utf16 := unicode.UTF16(unicode.BigEndian, unicode.UseBOM)
	decoder := unicode.BOMOverride(utf16.NewDecoder())
	return transform.NewReader(file, decoder), nil
}
示例#6
0
文件: util.go 项目: fanyang01/crawler
// ConvToUTF8 decodes b using the decoder of encoding e and returns the
// decoded bytes. The decoder is wrapped in unicode.BOMOverride, so a leading
// byte order mark in b can select a different Unicode encoding than e.
//
// The returned error is whatever reading through the decoder produces.
func ConvToUTF8(b []byte, e encoding.Encoding) (result []byte, err error) {
	decoder := unicode.BOMOverride(e.NewDecoder())
	src := bytes.NewReader(b)
	return ioutil.ReadAll(transform.NewReader(src, decoder))
}
示例#7
0
文件: lexer.go 项目: mewmew/uc
// newUnicodeReader wraps r so that Unicode input announced by a byte order
// mark is decoded to UTF-8 as it is read.
func newUnicodeReader(r io.Reader) io.Reader {
	// With transform.Nop as the fallback, source text that carries no BOM
	// is passed through from r as-is.
	bomAware := unicode.BOMOverride(transform.Nop)
	return transform.NewReader(r, bomAware)
}