// ExampleFullRune shows FullRune reporting true for a completely encoded
// rune and false for a truncated prefix of the same encoding.
func ExampleFullRune() {
	encoded := []byte{228, 184, 150} // UTF-8 encoding of 世
	truncated := encoded[:2]         // last byte missing

	fmt.Println(utf8.FullRune(encoded))
	fmt.Println(utf8.FullRune(truncated))
	// Output:
	// true
	// false
}
func (t replaceIllFormed) Span(src []byte, atEOF bool) (n int, err error) { for n < len(src) { // ASCII fast path. if src[n] < utf8.RuneSelf { n++ continue } r, size := utf8.DecodeRune(src[n:]) // Look for a valid non-ASCII rune. if r != utf8.RuneError || size != 1 { n += size continue } // Look for short source data. if !atEOF && !utf8.FullRune(src[n:]) { err = transform.ErrShortSrc break } // We have an invalid rune. err = transform.ErrEndOfSpan break } return n, err }
func (rd *Reader) readRuneBackward() (r rune, size int, err error) { var bytes [4]byte size = 0 read_next_byte: if rd.off == 0 { if size == 0 { return 0, 0, io.EOF } // this means we wanted to read another byte // because we don't have a valid utf character // yet but there are not anymore... // TODO: handle that panic("partial utf8 at end of buffer not yet implemented") } if rd.offInPiece <= 0 { rd.piece = rd.piece.prev rd.offInPiece = rd.piece.off2 } bytes[size] = rd.buf.sliceOfPiece(rd.piece)[rd.offInPiece-1] size++ rd.offInPiece-- rd.off-- if rd.offInPiece <= 0 { rd.piece = rd.piece.prev rd.offInPiece = rd.piece.off2 } if utf8.FullRune(bytes[:size]) { r, size = utf8.DecodeRune(bytes[:size]) return r, size, nil } // not a full rune read another byte into the // buffer and try again goto read_next_byte }
func (p *translateToUTF8) Translate(data []byte, eof bool) (int, []byte, error) { p.scratch = ensureCap(p.scratch, (len(data))*errorRuneLen) buf := p.scratch[:0] for i := 0; i < len(data); { // fast path for ASCII if b := data[i]; b < utf8.RuneSelf { buf = append(buf, b) i++ continue } _, size := utf8.DecodeRune(data[i:]) if size == 1 { if !eof && !utf8.FullRune(data) { // When DecodeRune has converted only a single // byte, we know there must be some kind of error // because we know the byte's not ASCII. // If we aren't at EOF, and it's an incomplete // rune encoding, then we return to process // the final bytes in a subsequent call. return i, buf, nil } buf = append(buf, errorBytes...) } else { buf = append(buf, data[i:i+size]...) } i += size } return len(data), buf, nil }
func (f FileLoggingCmdRunner) truncateUntilToken(data []byte, dataLossLimit int64) []byte { var i int64 // Cut off until first line break unless it cuts off more allowed data loss if i = int64(bytes.IndexByte(data, '\n')); i >= 0 && i <= dataLossLimit { data = f.dropCR(data[i+1:]) } else { // Make sure we don't break inside UTF encoded rune for { if len(data) < 1 { break } // Check for ASCII if data[0] < utf8.RuneSelf { break } // Check for UTF _, width := utf8.DecodeRune(data) if width > 1 && utf8.FullRune(data) { break } // Rune is not complete, check next data = data[1:] } } return data }
// ReadRune returns the next UTF-8 encoded code point from the // io.Reader inside r. func (r *readRune) ReadRune() (rr rune, size int, err error) { r.buf[0], err = r.readByte() if err != nil { return 0, 0, err } if r.buf[0] < utf8.RuneSelf { // fast check for common ASCII case rr = rune(r.buf[0]) size = 1 // Known to be 1. return } var n int for n = 1; !utf8.FullRune(r.buf[0:n]); n++ { r.buf[n], err = r.readByte() if err != nil { if err == io.EOF { err = nil break } return } } rr, size = utf8.DecodeRune(r.buf[0:n]) if size < n { // an error r.unread(r.buf[size:n]) } return }
func (p *translateToCodePage) Translate(data []byte, eof bool) (int, []byte, error) { p.scratch = ensureCap(p.scratch, len(data)) buf := p.scratch[:0] for i := 0; i < len(data); { r := rune(data[i]) size := 1 if r >= utf8.RuneSelf { r, size = utf8.DecodeRune(data[i:]) if size == 1 && !eof && !utf8.FullRune(data[i:]) { return i, buf, nil } } var b byte if r < p.same { b = byte(r) } else { var ok bool b, ok = p.rune2byte[r] if !ok { b = '?' } } buf = append(buf, b) i += size } return len(data), buf, nil }
func (utf8Validator) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) { n := len(src) if n > len(dst) { n = len(dst) } for i := 0; i < n; { if c := src[i]; c < utf8.RuneSelf { dst[i] = c i++ continue } _, size := utf8.DecodeRune(src[i:]) if size == 1 { // All valid runes of size 1 (those below utf8.RuneSelf) were // handled above. We have invalid UTF-8 or we haven't seen the // full character yet. err = ErrInvalidUTF8 if !atEOF && !utf8.FullRune(src[i:]) { err = transform.ErrShortSrc } return i, i, err } if i+size > len(dst) { return i, i, transform.ErrShortDst } for ; size > 0; size-- { dst[i] = src[i] i++ } } if len(src) > len(dst) { err = transform.ErrShortDst } return n, n, err }
func (replacementEncoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) { r, size := rune(0), 0 for ; nSrc < len(src); nSrc += size { r = rune(src[nSrc]) // Decode a 1-byte rune. if r < utf8.RuneSelf { size = 1 } else { // Decode a multi-byte rune. r, size = utf8.DecodeRune(src[nSrc:]) if size == 1 { // All valid runes of size 1 (those below utf8.RuneSelf) were // handled above. We have invalid UTF-8 or we haven't seen the // full character yet. if !atEOF && !utf8.FullRune(src[nSrc:]) { err = transform.ErrShortSrc break } r = '\ufffd' } } if nDst+utf8.RuneLen(r) > len(dst) { err = transform.ErrShortDst break } nDst += utf8.EncodeRune(dst[nDst:], r) } return nDst, nSrc, err }
// ScanRunes is a split function for a Scanner that returns each // UTF-8-encoded rune as a token. The sequence of runes returned is // equivalent to that from a range loop over the input as a string, which // means that erroneous UTF-8 encodings translate to U+FFFD = "\xef\xbf\xbd". // Because of the Scan interface, this makes it impossible for the client to // distinguish correctly encoded replacement runes from encoding errors. func ScanRunes(data []byte, atEOF bool) (advance int, token []byte, err error) { if atEOF && len(data) == 0 { return 0, nil, nil } // Fast path 1: ASCII. if data[0] < utf8.RuneSelf { return 1, data[0:1], nil } // Fast path 2: Correct UTF-8 decode without error. _, width := utf8.DecodeRune(data) if width > 1 { // It's a valid encoding. Width cannot be one for a correctly encoded // non-ASCII rune. return width, data[0:width], nil } // We know it's an error: we have width==1 and implicitly r==utf8.RuneError. // Is the error because there wasn't a full rune to be decoded? // FullRune distinguishes correctly between erroneous and incomplete encodings. if !atEOF && !utf8.FullRune(data) { // Incomplete; get more bytes. return 0, nil, nil } // We have a real UTF-8 encoding error. Return a properly encoded error rune // but advance only one byte. This matches the behavior of a range loop over // an incorrectly encoded string. return 1, errorRune, nil }
func main() { client, err := sarama.NewClient("a_logger_for_mhub", []string{"localhost:9092"}, nil) if err != nil { panic(err) } else { os.Stderr.WriteString("> connected\n") } defer client.Close() consumer, err := sarama.NewConsumer(client, "received", 0, "", nil) if err != nil { panic(err) } else { os.Stderr.WriteString("> consumer ready\n") } defer consumer.Close() for { select { case event := <-consumer.Events(): if event.Err != nil { panic(event.Err) } fmt.Println(utf8.FullRune(event.Value)) } } }
func (d *cmapEncoder) Transform(dst, src []byte, atEOF bool) (int, int, error) { var e error var ndst, nsrc int for nsrc < len(src) { if ndst >= len(dst) { e = transform.ErrShortDst break } r, sz := utf8.DecodeRune(src[nsrc:]) if r == utf8.RuneError && sz == 1 { // If its inconclusive due to insufficient data in // in the source, report it if !atEOF && !utf8.FullRune(src[nsrc:]) { e = transform.ErrShortSrc break } } if c, ok := d.bytes[r]; ok { dst[ndst] = c } else { dst[ndst] = d.replace } nsrc += sz ndst++ } return ndst, nsrc, e }
func (t replaceTransformer) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) { var runeBytes [utf8.UTFMax]byte for r, sz := rune(0), 0; len(src) > 0; src = src[sz:] { if r = rune(src[0]); r < utf8.RuneSelf { sz = 1 } else { r, sz = utf8.DecodeRune(src) if sz == 1 { // Invalid rune. if !atEOF && !utf8.FullRune(src) { err = transform.ErrShortSrc break } } } dsz := utf8.EncodeRune(runeBytes[:], t(r)) if nDst+dsz > len(dst) { err = transform.ErrShortDst break } nDst += copy(dst[nDst:], runeBytes[:dsz]) nSrc += sz } return }
func (m charmapEncoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) { r, size := rune(0), 0 loop: for nSrc < len(src) { if nDst >= len(dst) { err = transform.ErrShortDst break } r = rune(src[nSrc]) // Decode a 1-byte rune. if r < utf8.RuneSelf { if m.charmap.asciiSuperset { nSrc++ dst[nDst] = uint8(r) nDst++ continue } size = 1 } else { // Decode a multi-byte rune. r, size = utf8.DecodeRune(src[nSrc:]) if size == 1 { // All valid runes of size 1 (those below utf8.RuneSelf) were // handled above. We have invalid UTF-8 or we haven't seen the // full character yet. if !atEOF && !utf8.FullRune(src[nSrc:]) { err = transform.ErrShortSrc } else { err = internal.RepertoireError(m.charmap.replacement) } break } } // Binary search in [low, high) for that rune in the m.charmap.encode table. for low, high := int(m.charmap.low), 0x100; ; { if low >= high { err = internal.RepertoireError(m.charmap.replacement) break loop } mid := (low + high) / 2 got := m.charmap.encode[mid] gotRune := rune(got & (1<<24 - 1)) if gotRune < r { low = mid + 1 } else if gotRune > r { high = mid } else { dst[nDst] = byte(got >> 24) nDst++ break } } nSrc += size } return nDst, nSrc, err }
// fullRuneBuffered reports whether the bytes currently buffered in br begin
// with a full UTF-8 encoding of a rune. It never reads from the underlying
// source; an empty buffer or a Peek error yields false.
func fullRuneBuffered(br *bufio.Reader) bool {
	pending, err := br.Peek(br.Buffered())
	return err == nil && utf8.FullRune(pending)
}
// next reads and returns the next Unicode character. It is designed such // that only a minimal amount of work needs to be done in the common ASCII // case (one test to check for both ASCII and end-of-buffer, and one test // to check for newlines). func (s *Scanner) next() rune { ch := rune(s.srcBuf[s.srcPos]) if ch >= utf8.RuneSelf { // uncommon case: not ASCII or not enough bytes for s.srcPos+utf8.UTFMax > s.srcEnd && !utf8.FullRune(s.srcBuf[s.srcPos:s.srcEnd]) { // not enough bytes: read some more, but first // save away token text if any if s.tokPos >= 0 { s.tokBuf.Write(s.srcBuf[s.tokPos:s.srcPos]) s.tokPos = 0 } // move unread bytes to beginning of buffer copy(s.srcBuf[0:], s.srcBuf[s.srcPos:s.srcEnd]) s.srcBufOffset += s.srcPos // read more bytes i := s.srcEnd - s.srcPos n, err := s.src.Read(s.srcBuf[i:bufLen]) s.srcEnd = i + n s.srcPos = 0 s.srcBuf[s.srcEnd] = utf8.RuneSelf // sentinel if err != nil { if s.srcEnd == 0 { return EOF } if err != io.EOF { s.error(err.Error()) break } } } // at least one byte ch = rune(s.srcBuf[s.srcPos]) if ch >= utf8.RuneSelf { // uncommon case: not ASCII var width int ch, width = utf8.DecodeRune(s.srcBuf[s.srcPos:s.srcEnd]) if ch == utf8.RuneError && width == 1 { s.error("illegal UTF-8 encoding") } s.srcPos += width - 1 } } s.srcPos++ s.column++ switch ch { case 0: // implementation restriction for compatibility with other tools s.error("illegal character NUL") case '\n': s.line++ s.column = 0 } return ch }
// ReadRune implements io.RuneReader interface. func (s *ByteSlice) ReadRune() (r rune, size int, err error) { if !utf8.FullRune(*s) { return utf8.RuneError, 0, io.ErrUnexpectedEOF } r, size = utf8.DecodeRune(*s) *s = (*s)[size:] return r, size, err }
// getr returns the next rune from the source buffer, refilling the buffer
// as needed, or -1 at end of input (after reporting s.err through s.error
// unless it is io.EOF).
//
// NUL bytes and invalid UTF-8 are reported through s.error and skipped. A
// BOM is skipped as well and is reported as an error unless s.r0 is 0.
// Before each attempt the current read position and line are saved in
// s.r0 and s.line0.
func (s *source) getr() rune {
redo:
	s.r0, s.line0 = s.r, s.line

	// We could avoid at least one test that is always taken in the
	// for loop below by duplicating the common case code (ASCII)
	// here since we always have at least the sentinel (utf8.RuneSelf)
	// in the buffer. Measure and optimize if necessary.

	// make sure we have at least one rune in buffer, or we are at EOF
	for s.r+utf8.UTFMax > s.w && !utf8.FullRune(s.buf[s.r:s.w]) && s.err == nil && s.w-s.r < len(s.buf) {
		s.fill() // s.w-s.r < len(s.buf) => buffer is not full
	}

	// common case: ASCII and enough bytes
	// (invariant: s.buf[s.w] == utf8.RuneSelf)
	if b := s.buf[s.r]; b < utf8.RuneSelf {
		s.r++
		if b == 0 {
			// NUL is reported and dropped; try the next byte.
			s.error("invalid NUL character")
			goto redo
		}
		if b == '\n' {
			s.line++
		}
		return rune(b)
	}

	// EOF
	if s.r == s.w {
		// Only errors other than a plain end of input are reported.
		if s.err != io.EOF {
			s.error(s.err.Error())
		}
		return -1
	}

	// uncommon case: not ASCII
	r, w := utf8.DecodeRune(s.buf[s.r:s.w])
	s.r += w

	if r == utf8.RuneError && w == 1 {
		// The offending byte has already been consumed above.
		s.error("invalid UTF-8 encoding")
		goto redo
	}

	// BOM's are only allowed as the first character in a file
	const BOM = 0xfeff
	if r == BOM {
		if s.r0 > 0 { // s.r0 is always > 0 after 1st character (fill will set it to 1)
			s.error("invalid BOM in the middle of the file")
		}
		// A BOM is never returned to the caller, valid or not.
		goto redo
	}

	return r
}
func (buf *buffer) ReadRune() (r rune, size int, err error) { l := buf.b.Len() chunk := make([]byte, utf8.UTFMax) if l > 0 { n, err := buf.b.Read(chunk) if err != nil { return 0, 0, err } if utf8.FullRune(chunk) { r, rL := utf8.DecodeRune(chunk) if n > rL { buf.PutBack(chunk[rL:n]) } if buf.collect { buf.collection.WriteRune(r) } return r, rL, nil } } // else add bytes from the file, then try that for l < utf8.UTFMax { fn, err := buf.f.Read(chunk[l : l+1]) if err != nil { return 0, 0, err } l = l + fn if utf8.FullRune(chunk) { r, rL := utf8.DecodeRune(chunk) if buf.collect { buf.collection.WriteRune(r) } if fn > 0 { if _, err := buf.output.Write(chunk[l : l+fn]); err != nil { return r, rL, err } } return r, rL, nil } } return 0, 0, errors.New("File is not a valid UTF=8 encoding") }
func (b *runeBuffer) ReadRune() (ru rune, size int, err error) { for !utf8.FullRune(b.buf[b.r:]) { if err := b.fill(); err != nil { return 0, 0, err } } ru, size = utf8.DecodeRune(b.buf[b.r:]) b.r += size return ru, size, nil }
// Utf8 converts a byte slice to a string after checking that it is valid
// UTF-8.
//
// It returns an error, and an empty string, when bs contains any invalid
// or truncated encoding. An empty or nil slice is valid and yields "".
func Utf8(bs []byte) (str string, err error) {
	// utf8.Valid examines every byte; utf8.FullRune only looks at the first
	// rune and reports true for invalid bytes, so it cannot validate input.
	if !utf8.Valid(bs) {
		return "", errors.New("fail to decode to UTF8")
	}
	return string(bs), nil
}
// main prints every rune of a sample string together with its encoded
// width, then demonstrates utf8.FullRune on a complete and on a truncated
// encoding.
func main() {
	text := []byte("Helloł, 世界")
	for pos := 0; pos < len(text); {
		r, width := utf8.DecodeRune(text[pos:])
		fmt.Printf("%c %v\n", r, width)
		pos += width
	}

	// FullRune example
	fmt.Println()
	fmt.Println()
	complete := []byte{228, 184, 150} // all three bytes of 世
	partial := []byte{228}            // only the first byte of 世
	fmt.Println(utf8.FullRune(complete))
	fmt.Println(utf8.FullRune(partial))
}
// Write encodes and writes the data from p. func (w *Writer) Write(p []byte) (n int, err error) { n = len(p) if len(w.inbuf) > 0 { w.inbuf = append(w.inbuf, p...) p = w.inbuf } if len(w.outbuf) < len(p) { w.outbuf = make([]byte, len(p)+10) } outpos := 0 for len(p) > 0 { rune, size := utf8.DecodeRune(p) if rune == 0xfffd && !utf8.FullRune(p) { break } p = p[size:] retry: size, status := w.encode(w.outbuf[outpos:], rune) if status == NO_ROOM { newDest := make([]byte, len(w.outbuf)*2) copy(newDest, w.outbuf) w.outbuf = newDest goto retry } if status == STATE_ONLY { outpos += size goto retry } outpos += size } w.inbuf = w.inbuf[:0] if len(p) > 0 { w.inbuf = append(w.inbuf, p...) } n1, err := w.wr.Write(w.outbuf[0:outpos]) if err != nil && n1 < n { n = n1 } return }
// DataToString converts data bytes to a readable string.
//
// b is returned as text only when it is non-empty, valid UTF-8 and made up
// entirely of printable runes. In every other case the Go-syntax
// representation of the byte slice (fmt's %#v) is returned instead.
func DataToString(b []byte) string {
	// utf8.Valid checks the whole slice; utf8.FullRune would accept invalid
	// bytes and anything following a complete first rune. Empty input keeps
	// its previous Go-syntax rendering.
	if len(b) == 0 || !utf8.Valid(b) {
		return fmt.Sprintf("%#v", b)
	}
	s := string(b)
	for _, r := range s {
		if !unicode.IsPrint(r) {
			return fmt.Sprintf("%#v", b)
		}
	}
	return s
}
//nextChar read next utf-8 character func (lexer *Lexer) nextChar() error { c, err := lexer.reader.ReadByte() if err != nil { if err == io.EOF { lexer.curr = rune(TokenEOF) return nil } return err } lexer.offset++ //not ASCII if c >= utf8.RuneSelf { lexer.buff[0] = c lexer.buffPos = 1 for !utf8.FullRune(lexer.buff[0:lexer.buffPos]) { //continue read rest utf8 char bytes c, err = lexer.reader.ReadByte() if err != nil { if err == io.EOF { lexer.curr = rune(TokenEOF) return nil } return err } lexer.buff[lexer.buffPos] = c lexer.buffPos++ gserrors.Assert( lexer.buffPos < len(lexer.buff), "utf8.UTFMax must << len(lexer.buff)", ) } c, width := utf8.DecodeRune(lexer.buff[0:lexer.buffPos]) if c == utf8.RuneError && width == 1 { return lexer.newerror("illegal utf8 character") } lexer.curr = c } else { lexer.curr = rune(c) } lexer.position.Column++ return nil }
func (t *tScreen) parseRune(buf *bytes.Buffer) (bool, bool) { b := buf.Bytes() if b[0] >= ' ' && b[0] <= 0x7F { // printable ASCII easy to deal with -- no encodings ev := NewEventKey(KeyRune, rune(b[0]), ModNone) t.PostEvent(ev) buf.ReadByte() return true, true } if b[0] < 0x80 { // No encodings start with low numbered values return false, false } switch t.charset { case "UTF-8": if utf8.FullRune(b) { r, _, e := buf.ReadRune() if e == nil { ev := NewEventKey(KeyRune, r, ModNone) t.PostEvent(ev) return true, true } } case "US-ASCII": // ASCII cannot generate this, so most likely it was // entered as an Alt sequence ev := NewEventKey(KeyRune, rune(b[0]-128), ModAlt) t.PostEvent(ev) buf.ReadByte() return true, true default: utfb := make([]byte, 12) for l := 1; l <= len(b); l++ { t.decoder.Reset() nout, nin, _ := t.decoder.Transform(utfb, b[:l], true) if nout != 0 { if r, _ := utf8.DecodeRune(utfb[:nout]); r != utf8.RuneError { ev := NewEventKey(KeyRune, r, ModNone) t.PostEvent(ev) } for eat := 0; eat < nin; eat++ { buf.ReadByte() } return true, true } } } // Looks like potential escape return true, false }
func (u *utf32Encoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) { if u.currentBOMPolicy&writeBOM != 0 { if len(dst) < 4 { return 0, 0, transform.ErrShortDst } dst[0], dst[1], dst[2], dst[3] = 0x00, 0x00, 0xfe, 0xff u.currentBOMPolicy = IgnoreBOM nDst = 4 } r, size := rune(0), 0 for nSrc < len(src) { r = rune(src[nSrc]) // Decode a 1-byte rune. if r < utf8.RuneSelf { size = 1 } else { // Decode a multi-byte rune. r, size = utf8.DecodeRune(src[nSrc:]) if size == 1 { // All valid runes of size 1 (those below utf8.RuneSelf) were // handled above. We have invalid UTF-8 or we haven't seen the // full character yet. if !atEOF && !utf8.FullRune(src[nSrc:]) { err = transform.ErrShortSrc break } } } if nDst+4 > len(dst) { err = transform.ErrShortDst break } dst[nDst+0] = uint8(r >> 24) dst[nDst+1] = uint8(r >> 16) dst[nDst+2] = uint8(r >> 8) dst[nDst+3] = uint8(r) nDst += 4 nSrc += size } if u.endianness == LittleEndian { for i := 0; i < nDst; i += 4 { dst[i], dst[i+1], dst[i+2], dst[i+3] = dst[i+3], dst[i+2], dst[i+1], dst[i] } } return nDst, nSrc, err }
// main feeds a string byte by byte into a small buffer and prints each
// character as soon as the buffered bytes form a full rune.
func main() {
	const text = "Étoilé"

	pending := make([]byte, 0, 4)
	for i := 0; i < len(text); i++ {
		pending = append(pending, text[i])
		if !utf8.FullRune(pending) {
			continue
		}
		r, _ := utf8.DecodeRune(pending)
		fmt.Printf("%c", r)
		pending = pending[:0]
	}
	fmt.Printf("\n")
}
func (o SFilePort) ReadRune() (r rune, size int, err error) { buf := []byte{} for !utf8.FullRune(buf) { var c byte c, err = o.ReadByte() if err != nil { r = 0 return } buf = append(buf, c) } ruf := []rune(string(buf)) return ruf[0], len(buf), nil }
// getr returns the next rune of the input.
//
// A pending unread rune (l.peekr1, then l.peekr2) is returned first.
// Otherwise bytes are fetched from l.bin with obj.Bgetc and decoded as
// UTF-8. A NUL byte and an illegal UTF-8 sequence are reported through
// yyerrorl; a BOM is reported and skipped. lexlineno is incremented for
// every '\n' returned while importpkg is nil.
func (l *lexer) getr() rune {
	// unread rune != 0 available
	if r := l.peekr1; r != 0 {
		l.peekr1 = l.peekr2
		l.peekr2 = 0
		if r == '\n' && importpkg == nil {
			lexlineno++
		}
		return r
	}

redo:
	// common case: 7bit ASCII
	c := obj.Bgetc(l.bin)
	if c < utf8.RuneSelf {
		if c == 0 {
			yyerrorl(int(lexlineno), "illegal NUL byte")
			return 0
		}
		if c == '\n' && importpkg == nil {
			lexlineno++
		}
		return rune(c)
	}
	// c >= utf8.RuneSelf

	// uncommon case: non-ASCII
	var buf [utf8.UTFMax]byte
	buf[0] = byte(c)
	buf[1] = byte(obj.Bgetc(l.bin))
	i := 2
	// Keep fetching bytes until buf holds a full rune or is full.
	for ; i < len(buf) && !utf8.FullRune(buf[:i]); i++ {
		buf[i] = byte(obj.Bgetc(l.bin))
	}

	r, w := utf8.DecodeRune(buf[:i])
	if r == utf8.RuneError && w == 1 {
		// The string conversion here makes a copy for passing
		// to fmt.Printf, so that buf itself does not escape and
		// can be allocated on the stack.
		yyerrorl(int(lexlineno), "illegal UTF-8 sequence % x", string(buf[:i]))
	}

	if r == BOM {
		// Report the BOM and read the next rune instead.
		yyerrorl(int(lexlineno), "Unicode (UTF-8) BOM in middle of file")
		goto redo
	}

	return r
}