// _peek_char returns the next character of a textual input port without
// consuming it, filling the port's lookahead buffer until a full UTF-8
// rune is available.
func _peek_char(port Obj) Obj {
	if is_immediate(port) {
		panic("bad type")
	}
	switch v := (*port).(type) {
	case *InputPort:
		if v.is_binary {
			panic("bad port type")
		}
		for !utf8.FullRune(v.lookahead[0:v.lookahead_valid]) {
			n, err := io.ReadFull(v.r, v.lookahead[v.lookahead_valid:v.lookahead_valid+1])
			v.lookahead_valid += n
			switch {
			case err == os.EOF:
				return Eof
			case err != nil:
				panic("I/O read error")
			}
		}
		cp, _ := utf8.DecodeRune(v.lookahead[0:v.lookahead_valid])
		return Make_char(cp)
	}
	panic("bad type")
}
// ReadRune returns the next UTF-8 encoded code point from the
// io.Reader inside r.
func (r *readRune) ReadRune() (rune int, size int, err os.Error) {
	r.buf[0], err = r.readByte()
	if err != nil {
		return 0, 0, err
	}
	if r.buf[0] < utf8.RuneSelf { // fast check for common ASCII case
		rune = int(r.buf[0])
		return
	}
	var n int
	for n = 1; !utf8.FullRune(r.buf[0:n]); n++ {
		r.buf[n], err = r.readByte()
		if err != nil {
			if err == os.EOF {
				err = nil
				break
			}
			return
		}
	}
	rune, size = utf8.DecodeRune(r.buf[0:n])
	if size < n { // an error
		r.unread(r.buf[size:n])
	}
	return
}
// next reads and returns the next Unicode character. It is designed such
// that only a minimal amount of work needs to be done in the common ASCII
// case (one test to check for both ASCII and end-of-buffer, and one test
// to check for newlines).
func (s *Scanner) next() int {
	ch := int(s.srcBuf[s.srcPos])
	if ch >= utf8.RuneSelf {
		// uncommon case: not ASCII or not enough bytes
		for s.srcPos+utf8.UTFMax > s.srcEnd && !utf8.FullRune(s.srcBuf[s.srcPos:s.srcEnd]) {
			// not enough bytes: read some more, but first
			// save away token text if any
			if s.tokPos >= 0 {
				s.tokBuf.Write(s.srcBuf[s.tokPos:s.srcPos])
				s.tokPos = 0
			}
			// move unread bytes to beginning of buffer
			copy(s.srcBuf[0:], s.srcBuf[s.srcPos:s.srcEnd])
			s.srcBufOffset += s.srcPos
			// read more bytes
			i := s.srcEnd - s.srcPos
			n, err := s.src.Read(s.srcBuf[i:bufLen])
			s.srcEnd = i + n
			s.srcPos = 0
			s.srcBuf[s.srcEnd] = utf8.RuneSelf // sentinel
			if err != nil {
				if s.srcEnd == 0 {
					return EOF
				}
				if err != os.EOF {
					s.error(err.String())
					break
				}
			}
		}
		// at least one byte
		ch = int(s.srcBuf[s.srcPos])
		if ch >= utf8.RuneSelf {
			// uncommon case: not ASCII
			var width int
			ch, width = utf8.DecodeRune(s.srcBuf[s.srcPos:s.srcEnd])
			if ch == utf8.RuneError && width == 1 {
				s.error("illegal UTF-8 encoding")
			}
			s.srcPos += width - 1
		}
	}
	s.srcPos++
	s.column++
	switch ch {
	case 0:
		// implementation restriction for compatibility with other tools
		s.error("illegal character NUL")
	case '\n':
		s.line++
		s.column = 0
	}
	return ch
}
// ReadRune reads a single UTF-8 encoded Unicode character and returns the
// rune and its size in bytes.
func (b *Reader) ReadRune() (rune int, size int, err os.Error) {
	for b.r+utf8.UTFMax > b.w && !utf8.FullRune(b.buf[b.r:b.w]) && b.err == nil {
		b.fill()
	}
	if b.r == b.w {
		return 0, 0, b.err
	}
	rune, size = int(b.buf[b.r]), 1
	if rune >= 0x80 {
		rune, size = utf8.DecodeRune(b.buf[b.r:b.w])
	}
	b.r += size
	b.lastbyte = int(b.buf[b.r-1])
	return rune, size, nil
}
// next reads and returns the next Unicode character. It is designed such
// that only a minimal amount of work needs to be done in the common ASCII
// case (one test to check for both ASCII and end-of-buffer, and one test
// to check for newlines).
func (s *Scanner) next() int {
	ch := int(s.srcBuf[s.srcPos])
	if ch >= utf8.RuneSelf {
		// uncommon case: not ASCII or not enough bytes
		for s.srcPos+utf8.UTFMax > s.srcEnd && !utf8.FullRune(s.srcBuf[s.srcPos:s.srcEnd]) {
			// not enough bytes: read some more, but first
			// save away token text if any
			if s.tokPos >= 0 {
				s.tokBuf.Write(s.srcBuf[s.tokPos:s.srcPos])
				s.tokPos = 0
			}
			// move unread bytes to beginning of buffer
			copy(s.srcBuf[0:], s.srcBuf[s.srcPos:s.srcEnd])
			s.srcBufOffset += s.srcPos
			// read more bytes
			i := s.srcEnd - s.srcPos
			n, err := s.src.Read(s.srcBuf[i:bufLen])
			s.srcEnd = i + n
			s.srcPos = 0
			s.srcBuf[s.srcEnd] = utf8.RuneSelf // sentinel
			if err != nil {
				if s.srcEnd == 0 {
					return EOF
				}
				s.error(err.String())
				break
			}
		}
		// at least one byte
		ch = int(s.srcBuf[s.srcPos])
		if ch >= utf8.RuneSelf {
			// uncommon case: not ASCII
			var width int
			ch, width = utf8.DecodeRune(s.srcBuf[s.srcPos:s.srcEnd])
			s.srcPos += width - 1
		}
	}
	s.srcPos++
	s.column++
	if ch == '\n' {
		s.line++
		s.column = 0
	}
	return ch
}
// ReadRune reads a single UTF-8 encoded Unicode character and returns the
// rune and its size in bytes.
func (b *Reader) ReadRune() (r rune, size int, err error) {
	for b.r+utf8.UTFMax > b.w && !utf8.FullRune(b.buf[b.r:b.w]) && b.err == nil {
		b.fill()
	}
	b.lastRuneSize = -1
	if b.r == b.w {
		return 0, 0, b.readErr()
	}
	r, size = rune(b.buf[b.r]), 1
	if r >= 0x80 {
		r, size = utf8.DecodeRune(b.buf[b.r:b.w])
	}
	b.r += size
	b.lastByte = int(b.buf[b.r-1])
	b.lastRuneSize = size
	return r, size, nil
}
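If these ReadRune variants are the ones from the standard bufio package, they are reached through the public bufio API. A minimal usage sketch, assuming the current bufio and using a strings.Reader as an illustrative stand-in input source:

package main

import (
	"bufio"
	"fmt"
	"io"
	"strings"
)

func main() {
	// Wrap any io.Reader; ReadRune decodes one UTF-8 code point per call.
	br := bufio.NewReader(strings.NewReader("héllo"))
	for {
		r, size, err := br.ReadRune()
		if err == io.EOF {
			break
		}
		if err != nil {
			panic(err)
		}
		fmt.Printf("%q uses %d byte(s)\n", r, size)
	}
}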
func isText(b []byte) bool {
	for len(b) > 0 && utf8.FullRune(b) {
		if rune, size := utf8.DecodeRune(b); size == 1 && rune == utf8.RuneError {
			return false
		} else {
			if 0x80 <= rune && rune <= 0x9F {
				return false
			}
			if rune < ' ' {
				switch rune {
				case '\n', '\r', '\t':
				default:
					return false
				}
			}
			b = b[size:]
		}
	}
	return true
}
// Heuristic: b is text if it is valid UTF-8 and doesn't
// contain any unprintable ASCII or Unicode characters.
func isText(b []byte) bool {
	for len(b) > 0 && utf8.FullRune(b) {
		rune, size := utf8.DecodeRune(b)
		if size == 1 && rune == utf8.RuneError {
			// decoding error
			return false
		}
		if 0x7F <= rune && rune <= 0x9F {
			return false
		}
		if rune < ' ' {
			switch rune {
			case '\n', '\r', '\t':
				// okay
			default:
				// binary garbage
				return false
			}
		}
		b = b[size:]
	}
	return true
}
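A minimal usage sketch for the isText heuristic above (it assumes the function is in scope in package main; the sample inputs are illustrative only):

package main

import "fmt"

func main() {
	// Printable UTF-8 with an allowed control character ('\n') is classified as text.
	fmt.Println(isText([]byte("héllo, world\n"))) // true
	// A NUL byte is not one of the allowed control characters, so this is not text.
	fmt.Println(isText([]byte{0x00, 0x01, 0xFF})) // false
}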
// next reads and returns the next Unicode character. It is designed such
// that only a minimal amount of work needs to be done in the common ASCII
// case (one test to check for both ASCII and end-of-buffer, and one test
// to check for newlines).
func (s *Scanner) next() rune {
	ch, width := rune(s.srcBuf[s.srcPos]), 1
	if ch >= utf8.RuneSelf {
		// uncommon case: not ASCII or not enough bytes
		for s.srcPos+utf8.UTFMax > s.srcEnd && !utf8.FullRune(s.srcBuf[s.srcPos:s.srcEnd]) {
			// not enough bytes: read some more, but first
			// save away token text if any
			if s.tokPos >= 0 {
				s.tokBuf.Write(s.srcBuf[s.tokPos:s.srcPos])
				s.tokPos = 0
				// s.tokEnd is set by Scan()
			}
			// move unread bytes to beginning of buffer
			copy(s.srcBuf[0:], s.srcBuf[s.srcPos:s.srcEnd])
			s.srcBufOffset += s.srcPos
			// read more bytes
			// (an io.Reader must return os.EOF when it reaches
			// the end of what it is reading - simply returning
			// n == 0 will make this loop retry forever; but the
			// error is in the reader implementation in that case)
			i := s.srcEnd - s.srcPos
			n, err := s.src.Read(s.srcBuf[i:bufLen])
			s.srcPos = 0
			s.srcEnd = i + n
			s.srcBuf[s.srcEnd] = utf8.RuneSelf // sentinel
			if err != nil {
				if s.srcEnd == 0 {
					if s.lastCharLen > 0 {
						// previous character was not EOF
						s.column++
					}
					s.lastCharLen = 0
					return EOF
				}
				if err != io.EOF {
					s.error(err.Error())
				}
				// If err == EOF, we won't be getting more
				// bytes; break to avoid infinite loop. If
				// err is something else, we don't know if
				// we can get more bytes; thus also break.
				break
			}
		}
		// at least one byte
		ch = rune(s.srcBuf[s.srcPos])
		if ch >= utf8.RuneSelf {
			// uncommon case: not ASCII
			ch, width = utf8.DecodeRune(s.srcBuf[s.srcPos:s.srcEnd])
			if ch == utf8.RuneError && width == 1 {
				// advance for correct error position
				s.srcPos += width
				s.lastCharLen = width
				s.column++
				s.error("illegal UTF-8 encoding")
				return ch
			}
		}
	}
	// advance
	s.srcPos += width
	s.lastCharLen = width
	s.column++
	// special situations
	switch ch {
	case 0:
		// implementation restriction for compatibility with other tools
		s.error("illegal character NUL")
	case '\n':
		s.line++
		s.lastLineLen = s.column
		s.column = 0
	}
	return ch
}
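If this next is the one from the standard text/scanner package, it is unexported and is driven through the package's public API; a small sketch using Scanner.Next (the input string is an arbitrary example):

package main

import (
	"fmt"
	"strings"
	"text/scanner"
)

func main() {
	var s scanner.Scanner
	s.Init(strings.NewReader("a\nπ"))
	// Next returns decoded runes one at a time until scanner.EOF.
	for ch := s.Next(); ch != scanner.EOF; ch = s.Next() {
		fmt.Printf("%q line %d column %d\n", ch, s.Pos().Line, s.Pos().Column)
	}
}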