func (in *input) skipContinuationBytes(p int) int { if in.bytes == nil { for ; p < len(in.str) && !utf8.RuneStart(in.str[p]); p++ { } } else { for ; p < len(in.bytes) && !utf8.RuneStart(in.bytes[p]); p++ { } } return p }
// ExampleRuneStart prints the result of utf8.RuneStart for each of the first
// three bytes of "a界": the ASCII byte, the lead byte of the multi-byte rune,
// and that rune's first continuation byte.
func ExampleRuneStart() {
	buf := []byte("a界")
	for _, b := range buf[:3] {
		fmt.Println(utf8.RuneStart(b))
	}
	// Output:
	// true
	// true
	// false
}
func (in *input) skipNonStarter(p int) int { if in.bytes == nil { for ; p < len(in.str) && !utf8.RuneStart(in.str[p]); p++ { } } else { for ; p < len(in.bytes) && !utf8.RuneStart(in.bytes[p]); p++ { } } return p }
func (rs reverseStrings) Less(i, j int) bool { for m, n := len(rs[i])-1, len(rs[j])-1; m >= 0 && n >= 0; m, n = m-1, n-1 { if rs[i][m] != rs[j][n] { // We want to compare runes, not bytes. So find the start of the // current runes and decode them. for ; m > 0 && !utf8.RuneStart(rs[i][m]); m-- { } for ; n > 0 && !utf8.RuneStart(rs[j][n]); n-- { } ri, _ := utf8.DecodeRuneInString(rs[i][m:]) rj, _ := utf8.DecodeRuneInString(rs[j][n:]) return ri < rj } } return len(rs[i]) < len(rs[j]) }
// truncate returns p cut down to at most size bytes, avoiding a split in the
// middle of a multi-byte UTF-8 sequence.
//
// If p already fits in size bytes it is returned unchanged. A size of zero
// or less yields an empty slice. Otherwise the first size bytes are kept;
// when the cut leaves an incomplete multi-byte rune at the end, that partial
// rune is dropped as well. If no rune start can be found within the last
// utf8.UTFMax bytes, the size-byte prefix is returned as is.
//
// The result shares its backing array with p.
func truncate(p []byte, size int) []byte {
	if len(p) <= size {
		return p
	}
	if size <= 0 {
		// Nothing can be kept; also avoids indexing p[size-1] below.
		return p[:0]
	}
	p = p[:size]
	if p[size-1] < utf8.RuneSelf {
		// The cut falls right after an ASCII byte, so nothing was split.
		return p
	}

	// Look for the start of the last character within the final
	// utf8.UTFMax bytes.
	lim := size - utf8.UTFMax
	if lim < 0 {
		lim = 0
	}
	start := size - 1
	for start >= lim && !utf8.RuneStart(p[start]) {
		start--
	}
	if start < lim {
		// No rune start in range: the tail is not something we can
		// interpret, so return the whole lot.
		return p
	}

	if _, n := utf8.DecodeRune(p[start:]); n > 1 {
		// The last rune is complete, so include it.
		return p
	}
	// The last rune was cut short (or is invalid), so lose it.
	return p[:start]
}
func (b *Buffer) Read(p []byte) (int, error) { n := 0 bl := len(b.buf) for { r, size := utf8.DecodeRune(p) if size == 0 { break } n += size p = p[size:] b.buf = append(b.buf, r) } err := b.feed(bl) if err != nil { return n, err } // Check if the bytes are utf8 encoded. This is difficult because we // can't tell if more runes are coming. E.g. p[0] could be a valid rune // start, but it could require another byte, which might never arrive. // Can we detect the end of file? if len(p) > 0 && !utf8.RuneStart(p[0]) { return n, fmt.Errorf("Not utf8 encoded. Invalid rune start %x.", p[0]) } return n, nil }
// Read valid UTF-8 content from provided io.Reader. // If underlying reader starts in the middle of a rune, an error is returned. // If reader ends in the middle of a rune, the last (invalid) rune is discarded. Note that the // underlying reader will now start reading from the middle of a rune. func runeLimitedRead(r io.Reader, p []byte) (int, error) { n, err := r.Read(p) if n == 0 { return n, err } // If first byte is not a valid rune starting byte, returned error if n > 0 && !utf8.RuneStart(p[0]) { return 0, errInvalidStartingRune } // The following code is a lightly modified version of utf8#Valid() for i := 0; i < n; { if p[i] < utf8.RuneSelf { // Skip single byte rune i++ continue } r, size := utf8.DecodeRune(p[i:]) if size == 1 && r == utf8.RuneError { return i, err } i += size } return n, err }
// highlightError reads up to pos bytes from f and describes the position
// reached.
//
// It returns the 1-based line number, the 1-based column (counted in runes,
// not bytes) and a printable highlight. The highlight consists of the
// previous line (if any) and the current line up to pos, each prefixed with
// its line number, followed by a caret line.
//
// The column starts at 1 on every line, including the first, so positions
// and the caret are reported consistently regardless of the line. Reading
// stops early, without error, if f runs out of data before pos bytes.
func highlightError(f io.Reader, pos int64) (line int, col int, highlight string) {
	line, col = 1, 1
	br := bufio.NewReader(f)
	var lastLine string
	var thisLine bytes.Buffer
	for n := int64(0); n < pos; n++ {
		b, err := br.ReadByte()
		if err != nil {
			break
		}
		if b == '\n' {
			lastLine = thisLine.String()
			thisLine.Reset()
			line++
			col = 1
			continue
		}
		// Count characters, not bytes: continuation bytes do not advance
		// the column.
		if utf8.RuneStart(b) {
			col++
		}
		thisLine.WriteByte(b)
	}

	var out strings.Builder
	if line > 1 {
		fmt.Fprintf(&out, "%5d: %s\n", line-1, lastLine)
	}
	fmt.Fprintf(&out, "%5d: %s\n", line, thisLine.String())
	out.WriteString(strings.Repeat(" ", col+5))
	out.WriteString("^\n")
	return line, col, out.String()
}
// redirToWs copies data read from file descriptor fd to the websocket
// connection ws until a read fails or returns zero bytes, or a write fails.
//
// Data is forwarded in chunks that end on a rune boundary: bytes of a rune
// that is still incomplete at the end of a read are held back at the front
// of the buffer and sent together with the next read. Each chunk is passed
// through bytes.Runes and converted back to bytes before writing, so invalid
// sequences are written as the UTF-8 encoding of utf8.RuneError.
//
// Diagnostics go to os.Stderr. A panic inside the function is recovered,
// reported, and terminates the calling goroutine via runtime.Goexit.
func redirToWs(fd int, ws *websocket.Conn) {
	defer func() {
		if r := recover(); r != nil {
			fmt.Fprintf(os.Stderr, "Error occured: %s\n", r)
			runtime.Goexit()
		}
	}()

	var buf [8192]byte
	// start:  number of held-back bytes at the front of buf (read offset).
	// end:    end of the portion of buf that is sent in this iteration.
	// buflen: total number of meaningful bytes in buf after the read.
	start, end, buflen := 0, 0, 0
	for {
		switch nr, er := syscall.Read(fd, buf[start:]); {
		case nr < 0:
			// NOTE(review): the message says "websocket" but the read is
			// from fd; er is an error formatted with %d — confirm intent.
			fmt.Fprintf(os.Stderr, "error reading from websocket %d with code %d\n", fd, er)
			return
		case nr == 0: // EOF
			// NOTE(review): a zero-length read is also what results when
			// buf[start:] is empty (start == len(buf)).
			return
		case nr > 0:
			buflen = start + nr
			// Scan backwards for the start of the last rune in the buffer.
			for end = buflen - 1; end >= 0; end-- {
				if utf8.RuneStart(buf[end]) {
					ch, width := utf8.DecodeRune(buf[end:buflen])
					// A rune that decodes cleanly is included in this
					// chunk; otherwise end stays at its first byte and the
					// tail is held back for the next read.
					if ch != utf8.RuneError {
						end += width
					}
					break
				}
				// Six or more trailing bytes without a rune start cannot be
				// the tail of a single rune.
				if buflen-end >= 6 {
					fmt.Fprintf(os.Stderr, "Invalid UTF-8 sequence in output")
					// NOTE(review): this uses nr rather than buflen, which
					// differ whenever start > 0 — confirm which is intended.
					end = nr
					break
				}
			}
			// NOTE(review): if the scan above finds no rune start and never
			// reaches the six-byte limit, end is -1 here and the slice
			// expression panics; the deferred recover then ends the goroutine.
			runes := bytes.Runes(buf[0:end])
			buf_clean := []byte(string(runes))
			nw, ew := ws.Write(buf_clean[:])
			if ew != nil {
				fmt.Fprintf(os.Stderr, "error writing to websocket with code %s\n", ew)
				return
			}
			if nw != len(buf_clean) {
				// NOTE(review): the comparison uses len(buf_clean) but the
				// message reports end as the expected count.
				fmt.Fprintf(os.Stderr, "Written %d instead of expected %d\n", nw, end)
			}
			start = buflen - end
			if start > 0 {
				// Copy the remaining (unsent) bytes from the end to the
				// beginning of the buffer so the next read appends to them
				// and completes the rune.
				for i := 0; i < start; i++ {
					buf[i] = buf[end+i]
				}
			}
		}
	}
}
// runeToByteOffset converts a rune (character) offset into s to a byte
// offset.
//
// It returns the number of bytes occupied by the first offset_c runes of s,
// i.e. the byte index at which rune number offset_c begins. The result is
// always on a rune boundary, also when the preceding rune is multi-byte.
// If s holds fewer than offset_c runes, len(s) is returned; a non-positive
// offset_c yields 0. Each invalid byte counts as one rune, matching
// utf8.DecodeRune.
func runeToByteOffset(s []byte, offset_c int) (offset_b int) {
	for offset_c > 0 && offset_b < len(s) {
		// Advance by the full width of the rune, not just its lead byte.
		_, size := utf8.DecodeRune(s[offset_b:])
		offset_b += size
		offset_c--
	}
	return offset_b
}
// char_to_byte_offset converts a character (rune) offset into s to a byte
// offset.
//
// The result is the byte index at which character number offset_c begins,
// which equals the total encoded width of the first offset_c characters. It
// always falls on a rune boundary. When s contains fewer than offset_c
// characters the result is len(s); when offset_c is zero or negative it is
// 0. Invalid bytes are counted as one character each, as utf8.DecodeRune
// does.
func char_to_byte_offset(s []byte, offset_c int) (offset_b int) {
	for ; offset_c > 0 && offset_b < len(s); offset_c-- {
		// Step over the whole encoded rune so that a multi-byte character
		// is never split.
		_, width := utf8.DecodeRune(s[offset_b:])
		offset_b += width
	}
	return offset_b
}
// getRuneSize returns the number of bytes from index i up to, but not
// including, the next byte of s that can start a rune (or the end of s).
// The byte at i itself is always counted, so the result is at least 1.
func getRuneSize(s string, i int) int {
	end := i + 1
	for end < len(s) {
		if utf8.RuneStart(s[end]) {
			break
		}
		end++
	}
	return end - i
}
// move cursor backwards to the next valid utf8 rune start, or 0 func (this *bytes_iterator) move_backwards() { for this.cursor != 0 { this.cursor-- if utf8.RuneStart(this.char()) { return } } }
// charToByteOffset translates a character (rune) offset within s into the
// corresponding byte offset.
//
// It walks s rune by rune and returns the byte index just past the first
// offsetC runes, so the result never points into the middle of a multi-byte
// rune. The result is capped at len(s), and is 0 when offsetC <= 0. Bytes
// that are not valid UTF-8 are treated as single-byte runes, following
// utf8.DecodeRune.
func charToByteOffset(s []byte, offsetC int) (offsetB int) {
	for remaining := offsetC; remaining > 0 && offsetB < len(s); remaining-- {
		// Consume the complete rune rather than only its first byte.
		_, size := utf8.DecodeRune(s[offsetB:])
		offsetB += size
	}
	return offsetB
}
// lastRuneStart returns the runeInfo and position of the last // rune in buf or the zero runeInfo and -1 if no rune was found. func lastRuneStart(fd *formInfo, buf []byte) (runeInfo, int) { p := len(buf) - 1 for ; p >= 0 && !utf8.RuneStart(buf[p]); p-- { } if p < 0 { return runeInfo{0, 0, 0, 0}, -1 } return fd.info(inputBytes(buf), p), p }
// lastRuneStart returns the runeInfo and position of the last // rune in buf or the zero runeInfo and -1 if no rune was found. func lastRuneStart(fd *formInfo, buf []byte) (Properties, int) { p := len(buf) - 1 for ; p >= 0 && !utf8.RuneStart(buf[p]); p-- { } if p < 0 { return Properties{}, -1 } return fd.info(inputBytes(buf), p), p }
// pidx locates where a and b start to differ and backs up a little so that
// some context precedes the difference.
//
// When the common prefix is shorter than 8 bytes it returns 0 and an empty
// prefix. Otherwise it returns an index a few bytes before the difference,
// moved back onto a rune start of a (searching at most 7 bytes), together
// with the ellipsis "...".
func pidx(a, b string) (i int, prefix string) {
	// Length of the common byte prefix.
	for i < len(a) && i < len(b) && a[i] == b[i] {
		i++
	}
	if i < 8 {
		return 0, ""
	}

	// Step back for context, then make sure we land on a full rune.
	i -= 3
	limit := i - 7
	for i > limit {
		if utf8.RuneStart(a[i]) {
			break
		}
		i--
	}
	return i, "..."
}
// utf8MoveBackwards returns the position of the closest rune start in file
// that lies strictly before cursor, or 0 if there is none.
//
// Position 0 is returned without inspecting file[0]. A cursor beyond the end
// of file is treated as len(file), so the function never indexes out of
// range.
func utf8MoveBackwards(file []byte, cursor int) int {
	if cursor > len(file) {
		cursor = len(file)
	}
	for cursor--; cursor > 0; cursor-- {
		if utf8.RuneStart(file[cursor]) {
			return cursor
		}
	}
	return 0
}
// DetermineEncoding determines the encoding of an HTML document by examining // up to the first 1024 bytes of content and the declared Content-Type. // // See http://www.whatwg.org/specs/web-apps/current-work/multipage/parsing.html#determining-the-character-encoding func DetermineEncoding(content []byte, contentType string) (e encoding.Encoding, name string, certain bool) { if len(content) > 1024 { content = content[:1024] } for _, b := range boms { if bytes.HasPrefix(content, b.bom) { e, name = Lookup(b.enc) return e, name, true } } if _, params, err := mime.ParseMediaType(contentType); err == nil { if cs, ok := params["charset"]; ok { if e, name = Lookup(cs); e != nil { return e, name, true } } } if len(content) > 0 { e, name = prescan(content) if e != nil { return e, name, false } } // Try to detect UTF-8. // First eliminate any partial rune at the end. for i := len(content) - 1; i >= 0 && i > len(content)-4; i-- { b := content[i] if b < 0x80 { break } if utf8.RuneStart(b) { content = content[:i] break } } hasHighBit := false for _, c := range content { if c >= 0x80 { hasHighBit = true break } } if hasHighBit && utf8.Valid(content) { return encoding.Nop, "utf-8", false } // TODO: change default depending on user's locale? return charmap.Windows1252, "windows-1252", false }
// main passes a float64 to several calls that take a different parameter
// type. Each such call carries a trailing ERROR comment naming a type.
//
// NOTE(review): the ERROR comments look like expected-diagnostic annotations
// for a compiler error-check test, in which case this function is not meant
// to compile and each annotation must stay on the line of its call — confirm
// against the test harness. f and g are declared outside this block.
func main() {
	var x float64
	f(x) // ERROR "byte"
	g(x) // ERROR "uint8"

	// Test across imports.

	var ff fmt.Formatter
	var fs fmt.State
	// fmt.Formatter.Format takes a rune as its second argument.
	ff.Format(fs, x) // ERROR "rune"
	// utf8.RuneStart takes a byte.
	utf8.RuneStart(x) // ERROR "byte"
}
// Update location information (counting lines and columns) from a byte slice. func (location *Location) updateFromBytes(bytes []byte) { for _, c := range bytes { switch { case c == '\r': location.Column = 0 case c == '\n': location.Column = 0 location.Line++ case utf8.RuneStart(c): location.Column++ } } }
// encodeBase64LimitChars base64-encodes a leading portion of source such
// that the encoded text is at most limit characters long.
//
// At most limit/4*3 source bytes are taken. If source is longer than that,
// the cut is moved back to the nearest rune start so no multi-byte rune is
// split. It returns the encoded text and the number of source bytes that
// were encoded. When no bytes fit, encoded is empty and the returned count
// is the (non-positive) byte budget.
func encodeBase64LimitChars(source string, limit int) (encoded string, numOfSourceChars int) {
	budget := limit / 4 * 3
	if len(source) <= budget {
		// Everything fits.
		return base64.StdEncoding.EncodeToString([]byte(source)), len(source)
	}

	// Back up until the cut sits on a rune boundary.
	for budget > 0 && !utf8.RuneStart(source[budget]) {
		budget--
	}
	if budget <= 0 {
		return "", budget
	}
	return base64.StdEncoding.EncodeToString([]byte(source[:budget])), budget
}
// trimIncompleteRune returns b without a trailing incomplete rune.
//
// Only the last utf8.UTFMax bytes are examined. Scanning them front to back,
// the cut position is reset to the end of b whenever a position decodes to a
// valid rune, and set to a position that fails to decode but holds a
// rune-start byte. The result therefore ends just before the last such
// undecodable rune start that is not followed by a valid rune.
func trimIncompleteRune(b []byte) []byte {
	n := len(b)
	from := n - utf8.UTFMax
	if from < 0 {
		from = 0
	}

	cut := n
	for i := from; i < n; i++ {
		r, size := utf8.DecodeRune(b[i:])
		switch {
		case r != utf8.RuneError || size > 1:
			// A valid rune starts here; anything flagged before is not
			// trailing.
			cut = n
		case utf8.RuneStart(b[i]):
			cut = i
		}
	}
	return b[:cut]
}
// nextMulti is used for iterating over multi-segment decompositions // for decomposing normal forms. func nextMulti(i *Iter) []byte { j := 0 d := i.multiSeg // skip first rune for j = 1; j < len(d) && !utf8.RuneStart(d[j]); j++ { } for j < len(d) { info := i.rb.f.info(input{bytes: d}, j) if info.BoundaryBefore() { i.multiSeg = d[j:] return d[:j] } j += int(info.size) } // treat last segment as normal decomposition i.next = i.rb.f.nextMain return i.next(i) }
// findWordFollowedBy looks for the first occurrence of the rune by in
// data[from:] and returns the word that directly precedes it.
//
// The word extends backwards from by to just after the nearest preceding
// whitespace rune (as defined by unicode.IsSpace), or to from if there is
// none. start and end are byte offsets into data such that data[start:end]
// is the word and end is the offset of by. Whitespace runes of any encoded
// width are handled, so start always lies on a rune boundary.
//
// found is false when by does not occur (start and end are then -1), or
// when the word is empty and allowEmptyKey is false.
func findWordFollowedBy(by rune, data []byte, from int, allowEmptyKey bool) (start int, end int, found bool) {
	rel := bytes.IndexRune(data[from:], by)
	if rel == -1 {
		return -1, -1, false
	}
	end = from + rel

	// Walk backwards over the runes before by, stopping at the first space.
	for j := end - 1; j >= from; j-- {
		if !utf8.RuneStart(data[j]) {
			continue
		}
		r, size := utf8.DecodeRune(data[j:])
		if unicode.IsSpace(r) {
			// The word begins after the whole space rune, which may be
			// wider than one byte (e.g. U+00A0).
			start = j + size
			return start, end, allowEmptyKey || start < end
		}
	}
	return from, end, allowEmptyKey || from < end
}
// validUTF8IgnoringPartialTrailingRune reports whether p contains valid
// UTF-8 encoded string data. The buffer is assumed to be a prefix of a
// larger buffer, so a rune that is merely cut off at the end of p is still
// considered valid.
//
// A trailing sequence is accepted only if it is a genuine incomplete
// encoding, i.e. more bytes could still turn it into a valid rune. Bytes
// that can never be part of valid UTF-8, or a lead byte followed by a
// non-continuation byte, make the buffer invalid wherever they occur.
//
// Basic logic copied from https://golang.org/pkg/unicode/utf8/#Valid
func validUTF8IgnoringPartialTrailingRune(p []byte) bool {
	for i := 0; i < len(p); {
		if p[i] < utf8.RuneSelf {
			i++
			continue
		}
		_, size := utf8.DecodeRune(p[i:])
		if size == 1 {
			// All valid runes of size 1 (those below RuneSelf) were handled
			// above, so this is a decoding error. utf8.FullRune treats
			// invalid encodings as "full", so it reports false only when
			// the remaining bytes are the start of a rune that was cut off.
			return !utf8.FullRune(p[i:])
		}
		i += size
	}
	return true
}
// encode takes a string and position in that string and encodes one utf-8 // character. It then returns the encoded string and number of runes in the // character. func encode(text []byte, i int) (encodedString string, runeLength int) { started := false for ; i < len(text) && (!utf8.RuneStart(text[i]) || !started); i++ { switch c := text[i]; { case c == ' ': encodedString += "_" case isVchar(c) && c != '=' && c != '?' && c != '_': encodedString += string(c) default: encodedString += fmt.Sprintf("=%02X", c) } runeLength++ started = true } return }
// scan matches the longest suffix at the current location in the input
// and returns the number of bytes consumed.
//
// Matching starts at byte offset p of s.s and walks the state table in
// s.states, of which the first s.n entries form the current block. Whenever
// an entry carries an index (e.i != noIndex), s.index and s.pindex record it
// together with the offset just past the matched byte. s.done is set once a
// final entry has matched. If the table is exhausted without reaching a
// final entry, the offset of the last rune boundary at which the scanner
// state was saved is returned.
func (s *ctScanner) scan(p int) int {
	pr := p // the p at the rune start
	str := s.s
	states, n := s.states, s.n
	// i indexes the entries of the current block; it is only advanced when
	// the entry at i does not match the current byte.
	for i := 0; i < n && p < len(str); {
		e := states[i]
		c := str[p]
		// TODO: a significant number of contractions are of a form that
		// cannot match discontiguous UTF-8 in a normalized string. We could let
		// a negative value of e.n mean that we can set s.done = true and avoid
		// the need for additional matches.
		if c >= e.l {
			if e.l == c {
				// Exact match on the entry's byte: consume it.
				p++
				if e.i != noIndex {
					s.index = int(e.i)
					s.pindex = p
				}
				if e.n != final {
					// Descend: the next block starts e.h entries past the
					// end of the current block and holds e.n entries.
					i, states, n = 0, states[int(e.h)+n:], int(e.n)
					// Only commit the new state at a rune boundary (or at
					// the end of the input), so a later call resumes on a
					// whole rune.
					if p >= len(str) || utf8.RuneStart(str[p]) {
						s.states, s.n, pr = states, n, p
					}
				} else {
					s.done = true
					return p
				}
				continue
			} else if e.n == final && c <= e.h {
				// Final entry covering the byte range [e.l, e.h]: the index
				// is offset by the position of c within the range.
				p++
				s.done = true
				s.index = int(c-e.l) + int(e.i)
				s.pindex = p
				return p
			}
		}
		i++
	}
	return pr
}
// nextMultiNorm is used for iterating over multi-segment decompositions // for composing normal forms. func nextMultiNorm(i *Iter) []byte { j := 0 d := i.multiSeg // skip first rune for j = 1; j < len(d) && !utf8.RuneStart(d[j]); j++ { } for j < len(d) { info := i.rb.f.info(input{bytes: d}, j) if info.ccc == 0 { i.multiSeg = d[j:] return d[:j] } j += int(info.size) } i.multiSeg = nil i.next = nextComposed i.p++ // restore old valud of i.p. See nextComposed. if i.p >= i.rb.nsrc { i.setDone() } return d }
// Convert a UTF-8 byte sequence into a ISO 8859 byte sequence. The errors returned // by this function are either UnicodeError, which means that a partial UTF-8 symbol // or an illegal UTF-8 sequence was found, i.e. either latinx.ILLEGAL, or latinx.PARTIAL. // When a UnicodeError is returned, success < len(utf_8), and success indicates how // many bytes that was successfully converted into UTF-8 bytes. // If this function returns an UnknownRuneError, it means that the charset of the // Converter has no mapping for a rune (UTF-8 letter) found in the utf_8 array. func (c *Converter) Encode(utf_8 []byte) (latin []byte, success int, err error) { var ok bool var latinByte byte var offset, size int var rne rune var errmsg string var buf *bytes.Buffer buf = bytes.NewBuffer(make([]byte, len(utf_8))) buf.Reset() for offset < len(utf_8) { rne, size = utf8.DecodeRune(utf_8[offset:]) if rne == utf8.RuneError { if utf8.RuneStart(utf_8[offset]) && len(utf_8)-offset < utf8.UTFMax { return buf.Bytes(), offset, PARTIAL // UnicodeError } else { return buf.Bytes(), offset, ILLEGAL // UnicodeError } } else if rne < utf8.RuneSelf { buf.WriteByte(utf_8[offset]) offset++ } else { latinByte, ok = c.utf8ToLatin[int(rne)] if !ok { errmsg = fmt.Sprintf("undefined: 0x%X in %s", rne, c.id) err = UnknownRuneError(errmsg) return buf.Bytes(), offset, err } buf.WriteByte(latinByte) offset += size } } return buf.Bytes(), offset, err }