Example #1
func (in *input) skipContinuationBytes(p int) int {
	if in.bytes == nil {
		for ; p < len(in.str) && !utf8.RuneStart(in.str[p]); p++ {
	} else {
		for ; p < len(in.bytes) && !utf8.RuneStart(in.bytes[p]); p++ {
	return p
Example #2
// ExampleRuneStart reports, for each of the first three bytes of "a界",
// whether that byte could begin an encoded rune. The third byte is a
// continuation byte of the multi-byte rune, so it prints false.
func ExampleRuneStart() {
	buf := []byte("a界")
	fmt.Println(utf8.RuneStart(buf[0]))
	fmt.Println(utf8.RuneStart(buf[1]))
	fmt.Println(utf8.RuneStart(buf[2]))
	// Output:
	// true
	// true
	// false
}
Example #3
func (in *input) skipNonStarter(p int) int {
	if in.bytes == nil {
		for ; p < len(in.str) && !utf8.RuneStart(in.str[p]); p++ {
	} else {
		for ; p < len(in.bytes) && !utf8.RuneStart(in.bytes[p]); p++ {
	return p
Example #4
func (rs reverseStrings) Less(i, j int) bool {
	for m, n := len(rs[i])-1, len(rs[j])-1; m >= 0 && n >= 0; m, n = m-1, n-1 {
		if rs[i][m] != rs[j][n] {
			// We want to compare runes, not bytes. So find the start of the
			// current runes and decode them.
			for ; m > 0 && !utf8.RuneStart(rs[i][m]); m-- {
			for ; n > 0 && !utf8.RuneStart(rs[j][n]); n-- {
			ri, _ := utf8.DecodeRuneInString(rs[i][m:])
			rj, _ := utf8.DecodeRuneInString(rs[j][n:])
			return ri < rj
	return len(rs[i]) < len(rs[j])
Example #5
// truncate returns p truncated to at most size bytes, avoiding splitting a
// multibyte UTF-8 sequence: if the cut would leave an incomplete or invalid
// rune at the end, that rune's bytes are dropped as well. A size of zero or
// less yields an empty slice.
func truncate(p []byte, size int) []byte {
	if len(p) <= size {
		return p
	}
	if size <= 0 {
		// Nothing fits; indexing p[size-1] below would panic.
		return p[:0]
	}
	p = p[:size]
	start := size - 1
	if p[start] < utf8.RuneSelf {
		// The last byte is ASCII, so no rune was split.
		return p
	}
	// Find the start of the last character and check
	// whether it's valid.
	lim := size - utf8.UTFMax
	if lim < 0 {
		lim = 0
	}
	for ; start >= lim; start-- {
		if utf8.RuneStart(p[start]) {
			break
		}
	}
	// If we can't find the start of the last character within UTFMax
	// bytes, return the whole lot. The loop stops at lim-1, not -1, so
	// compare against lim.
	if start < lim {
		return p
	}
	// The last rune was valid, so include it.
	if _, rsize := utf8.DecodeRune(p[start:size]); rsize > 1 {
		return p
	}
	// The last rune was invalid, so lose it.
	return p[:start]
}
Example #6
func (b *Buffer) Read(p []byte) (int, error) {
	n := 0
	bl := len(b.buf)
	for {
		r, size := utf8.DecodeRune(p)
		if size == 0 {
		n += size
		p = p[size:]
		b.buf = append(b.buf, r)
	err := b.feed(bl)
	if err != nil {
		return n, err
	// Check if the bytes are utf8 encoded. This is difficult because we
	// can't tell if more runes are coming. E.g. p[0] could be a valid rune
	// start, but it could require another byte, which might never arrive.
	// Can we detect the end of file?
	if len(p) > 0 && !utf8.RuneStart(p[0]) {
		return n, fmt.Errorf("Not utf8 encoded. Invalid rune start %x.", p[0])
	return n, nil
// Read valid UTF-8 content from provided io.Reader.
// If underlying reader starts in the middle of a rune, an error is returned.
// If reader ends in the middle of a rune, the last (invalid) rune is discarded. Note that the
// underlying reader will now start reading from the middle of a rune.
func runeLimitedRead(r io.Reader, p []byte) (int, error) {
	n, err := r.Read(p)
	if n == 0 {
		return n, err

	// If first byte is not a valid rune starting byte, returned error
	if n > 0 && !utf8.RuneStart(p[0]) {
		return 0, errInvalidStartingRune

	// The following code is a lightly modified version of utf8#Valid()
	for i := 0; i < n; {
		if p[i] < utf8.RuneSelf {
			// Skip single byte rune

		r, size := utf8.DecodeRune(p[i:])
		if size == 1 && r == utf8.RuneError {
			return i, err
		i += size

	return n, err
// highlightError reads up to pos bytes from f and returns the line number
// and column reached, plus a printable excerpt: the previous line (if any),
// the current line up to pos, and a caret line pointing at the column.
// Columns advance once per rune-start byte, so a multi-byte rune counts as
// one column. Reading stops early if f runs out of data or fails.
//
// NOTE(review): the statements inside the loop were reconstructed from a
// truncated listing; confirm against the original source.
func highlightError(f io.Reader, pos int64) (line int, col int, highlight string) {
	line = 1
	br := bufio.NewReader(f)
	lastLine := ""
	thisLine := new(bytes.Buffer)
	for n := int64(0); n < pos; n++ {
		b, err := br.ReadByte()
		if err != nil {
			break
		}
		if b == '\n' {
			lastLine = thisLine.String()
			thisLine.Reset()
			line++
			col = 1
		} else {
			thisLine.WriteByte(b)
			if utf8.RuneStart(b) {
				col++
			}
		}
	}
	if line > 1 {
		highlight += fmt.Sprintf("%5d: %s\n", line-1, lastLine)
	}
	highlight += fmt.Sprintf("%5d: %s\n", line, thisLine.String())
	highlight += fmt.Sprintf("%s^\n", strings.Repeat(" ", col+5))
	return line, col, highlight
}
Example #9
File: ws.go Project: Jacke/WebTerm
// redirToWs repeatedly reads from the file descriptor fd and writes what it
// read to the websocket connection ws, holding back any bytes at the end of
// a read that do not yet form a complete rune so they can be joined with
// the next read. Diagnostics go to os.Stderr.
//
// NOTE(review): this excerpt is truncated — closing braces and
// single-statement lines (e.g. whatever follows each error report) are not
// visible here, so the exact exit paths cannot be determined from this view.
func redirToWs(fd int, ws *websocket.Conn) {
	// Report a panic on stderr instead of letting it propagate.
	defer func() {
		if r := recover(); r != nil {
			fmt.Fprintf(os.Stderr, "Error occured: %s\n", r)

	// buf[0:start] holds bytes carried over from the previous iteration;
	// new data is read in after them.
	var buf [8192]byte
	start, end, buflen := 0, 0, 0
	for {
		switch nr, er := syscall.Read(fd, buf[start:]); {
		case nr < 0:
			// NOTE(review): the message says "websocket" but the failed read
			// was on fd — confirm the intended wording.
			fmt.Fprintf(os.Stderr, "error reading from websocket %d with code %d\n", fd, er)
		case nr == 0: // EOF
		case nr > 0:
			// Total valid bytes now in buf: carried-over prefix plus new data.
			buflen = start + nr
			// Walk backwards from the last byte to find the start of the final
			// rune; if that rune decodes cleanly, move end past it.
			for end = buflen - 1; end >= 0; end-- {
				if utf8.RuneStart(buf[end]) {
					ch, width := utf8.DecodeRune(buf[end:buflen])
					if ch != utf8.RuneError {
						end += width

				// Six or more trailing bytes without a rune start is treated
				// as invalid input rather than a split rune.
				if buflen-end >= 6 {
					fmt.Fprintf(os.Stderr, "Invalid UTF-8 sequence in output")
					end = nr

			// Round-trip through runes so that what is written is valid UTF-8.
			runes := bytes.Runes(buf[0:end])
			buf_clean := []byte(string(runes))

			nw, ew := ws.Write(buf_clean[:])
			if ew != nil {
				fmt.Fprintf(os.Stderr, "error writing to websocket with code %s\n", ew)

			// NOTE(review): the comparison uses len(buf_clean) but the message
			// prints end — these can differ after the rune round-trip.
			if nw != len(buf_clean) {
				fmt.Fprintf(os.Stderr, "Written %d instead of expected %d\n", nw, end)

			// Number of unsent trailing bytes to carry into the next read.
			start = buflen - end

			if start > 0 {
				// copy the remaining read bytes from the end to the beginning of
				// the buffer so that the next read appends to them
				for i := 0; i < start; i++ {
					buf[i] = buf[end+i]
Example #10
// runeToByteOffset converts an offset measured in runes into the matching
// byte offset within s. It advances one whole rune at a time, so the result
// always falls on a rune boundary; each invalid byte counts as one rune.
// If s holds fewer than offset_c runes, len(s) is returned. A non-positive
// offset_c yields 0.
func runeToByteOffset(s []byte, offset_c int) (offset_b int) {
	for offset_c > 0 && offset_b < len(s) {
		// Step over the complete encoding, not just its first byte, so the
		// returned offset never lands inside a multi-byte rune.
		_, size := utf8.DecodeRune(s[offset_b:])
		offset_b += size
		offset_c--
	}
	return offset_b
}
Example #11
// char_to_byte_offset converts an offset measured in characters (runes)
// into the matching byte offset within s. It advances one whole rune at a
// time, so the result always falls on a rune boundary; each invalid byte
// counts as one character. If s holds fewer than offset_c runes, len(s) is
// returned. A non-positive offset_c yields 0.
func char_to_byte_offset(s []byte, offset_c int) (offset_b int) {
	for offset_c > 0 && offset_b < len(s) {
		// Step over the complete encoding, not just its first byte, so the
		// returned offset never lands inside a multi-byte rune.
		_, size := utf8.DecodeRune(s[offset_b:])
		offset_b += size
		offset_c--
	}
	return offset_b
}
Example #12
// getRuneSize returns the number of bytes from index i up to, but not
// including, the next rune-start byte in s (or the end of s). The result is
// always at least 1, even when i is at or past the end of s.
func getRuneSize(s string, i int) int {
	runeSize := 1
	for i+runeSize < len(s) && !utf8.RuneStart(s[i+runeSize]) {
		runeSize++
	}

	return runeSize
}
Example #13
// move cursor backwards to the next valid utf8 rune start, or 0
func (this *bytes_iterator) move_backwards() {
	for this.cursor != 0 {
		if utf8.RuneStart(this.char()) {
Example #14
// charToByteOffset converts an offset measured in characters (runes) into
// the matching byte offset within s. It advances one whole rune at a time,
// so the result always falls on a rune boundary; each invalid byte counts
// as one character. If s holds fewer than offsetC runes, len(s) is
// returned. A non-positive offsetC yields 0.
func charToByteOffset(s []byte, offsetC int) (offsetB int) {
	for offsetC > 0 && offsetB < len(s) {
		// Step over the complete encoding, not just its first byte, so the
		// returned offset never lands inside a multi-byte rune.
		_, size := utf8.DecodeRune(s[offsetB:])
		offsetB += size
		offsetC--
	}
	return offsetB
}
Example #15
// lastRuneStart returns the runeInfo and position of the last
// rune in buf or the zero runeInfo and -1 if no rune was found.
func lastRuneStart(fd *formInfo, buf []byte) (runeInfo, int) {
	p := len(buf) - 1
	for ; p >= 0 && !utf8.RuneStart(buf[p]); p-- {
	if p < 0 {
		return runeInfo{0, 0, 0, 0}, -1
	return fd.info(inputBytes(buf), p), p
Example #16
// lastRuneStart returns the runeInfo and position of the last
// rune in buf or the zero runeInfo and -1 if no rune was found.
func lastRuneStart(fd *formInfo, buf []byte) (Properties, int) {
	p := len(buf) - 1
	for ; p >= 0 && !utf8.RuneStart(buf[p]); p-- {
	if p < 0 {
		return Properties{}, -1
	return fd.info(inputBytes(buf), p), p
Example #17
// pidx finds the index from which two strings start to differ, plus context.
// It returns the index and ellipsis if the index is greater than 0.
// When the common prefix is shorter than 8 bytes it returns 0 and "".
// Otherwise it backs up 3 bytes from the first difference and then further,
// by at most 7 more bytes, until it reaches a rune-start byte of a.
func pidx(a, b string) (i int, prefix string) {
	for i < len(a) && i < len(b) && a[i] == b[i] {
		i++
	}
	if i < 8 {
		return 0, ""
	}
	i -= 3 // ensure taking at least one full rune before the difference.
	for k := i - 7; i > k && !utf8.RuneStart(a[i]); i-- {
	}
	return i, "..."
}
Example #18
// utf8MoveBackwards returns the index of the nearest rune-start byte in
// file strictly before cursor, or 0 if the search reaches the beginning.
func utf8MoveBackwards(file []byte, cursor int) int {
	for {
		cursor--
		if cursor <= 0 {
			return 0
		}
		if utf8.RuneStart(file[cursor]) {
			return cursor
		}
	}
}
Example #19
// DetermineEncoding determines the encoding of an HTML document by examining
// up to the first 1024 bytes of content and the declared Content-Type.
// See http://www.whatwg.org/specs/web-apps/current-work/multipage/parsing.html#determining-the-character-encoding
func DetermineEncoding(content []byte, contentType string) (e encoding.Encoding, name string, certain bool) {
	if len(content) > 1024 {
		content = content[:1024]

	for _, b := range boms {
		if bytes.HasPrefix(content, b.bom) {
			e, name = Lookup(b.enc)
			return e, name, true

	if _, params, err := mime.ParseMediaType(contentType); err == nil {
		if cs, ok := params["charset"]; ok {
			if e, name = Lookup(cs); e != nil {
				return e, name, true

	if len(content) > 0 {
		e, name = prescan(content)
		if e != nil {
			return e, name, false

	// Try to detect UTF-8.
	// First eliminate any partial rune at the end.
	for i := len(content) - 1; i >= 0 && i > len(content)-4; i-- {
		b := content[i]
		if b < 0x80 {
		if utf8.RuneStart(b) {
			content = content[:i]
	hasHighBit := false
	for _, c := range content {
		if c >= 0x80 {
			hasHighBit = true
	if hasHighBit && utf8.Valid(content) {
		return encoding.Nop, "utf-8", false

	// TODO: change default depending on user's locale?
	return charmap.Windows1252, "windows-1252", false
Example #20
func main() {
	var x float64
	f(x) // ERROR "byte"
	g(x) // ERROR "uint8"

	// Test across imports.

	var ff fmt.Formatter
	var fs fmt.State
	ff.Format(fs, x) // ERROR "rune"

	utf8.RuneStart(x) // ERROR "byte"
Example #21
// Update location information (counting lines and columns) from a byte slice.
func (location *Location) updateFromBytes(bytes []byte) {
	for _, c := range bytes {
		switch {
		case c == '\r':
			location.Column = 0
		case c == '\n':
			location.Column = 0
		case utf8.RuneStart(c):
Example #22
// encodeBase64LimitChars base64-encodes as much of the start of source as
// fits in limit encoded characters, without splitting a UTF-8 rune. It
// returns the encoded text and the number of source bytes that were
// encoded. If source does not fit and not even one whole rune does, encoded
// is empty and numOfSourceChars is 0.
func encodeBase64LimitChars(source string, limit int) (encoded string, numOfSourceChars int) {
	// Every 4 encoded characters carry 3 source bytes.
	numOfSourceChars = limit / 4 * 3
	if len(source) <= numOfSourceChars {
		return base64.StdEncoding.EncodeToString([]byte(source)), len(source)
	}
	// Back up to a rune boundary so the cut does not fall inside a rune.
	for numOfSourceChars > 0 && !utf8.RuneStart(source[numOfSourceChars]) {
		numOfSourceChars--
	}
	if numOfSourceChars > 0 {
		encoded = base64.StdEncoding.EncodeToString([]byte(source[:numOfSourceChars]))
	}
	return encoded, numOfSourceChars
}
Example #23
// trimIncompleteRune returns b with any trailing
// incomplete rune sliced off. Only the last utf8.UTFMax bytes are examined.
func trimIncompleteRune(b []byte) []byte {
	i := len(b) - utf8.UTFMax
	if i < 0 {
		i = 0
	}
	lastStart := len(b)
	for ; i < len(b); i++ {
		// A rune that decodes successfully from here means nothing before it
		// needs trimming. A literal U+FFFD decodes with n > 1 and is valid.
		if r, n := utf8.DecodeRune(b[i:]); r != utf8.RuneError || n > 1 {
			lastStart = len(b)
			continue
		}
		// Decoding failed; if this byte could begin a rune, remember it as
		// the candidate cut point.
		if utf8.RuneStart(b[i]) {
			lastStart = i
		}
	}
	return b[0:lastStart]
}
Example #24
// nextMulti is used for iterating over multi-segment decompositions
// for decomposing normal forms.
func nextMulti(i *Iter) []byte {
	j := 0
	d := i.multiSeg
	// skip first rune
	for j = 1; j < len(d) && !utf8.RuneStart(d[j]); j++ {
	for j < len(d) {
		info := i.rb.f.info(input{bytes: d}, j)
		if info.BoundaryBefore() {
			i.multiSeg = d[j:]
			return d[:j]
		j += int(info.size)
	// treat last segment as normal decomposition
	i.next = i.rb.f.nextMain
	return i.next(i)
Example #25
// findWordFollowedBy looks in data[from:] for the first occurrence of the
// rune by and then scans backwards from it for the nearest whitespace rune.
//
// end is the byte index of by. start is the index of the whitespace rune
// found before it, or from if there is none. found is false when by does
// not occur (start and end are then -1), or when nothing lies between from
// and by and allowEmptyKey is false.
func findWordFollowedBy(by rune, data []byte, from int, allowEmptyKey bool) (start int, end int, found bool) {
	i := bytes.IndexRune(data[from:], by)
	if i == -1 {
		return i, i, false
	}
	i += from
	// loop for all letters before the `by`, stop at the first space
	for j := i - 1; j >= from; j-- {
		if !utf8.RuneStart(data[j]) {
			// Continuation byte: keep going back to the start of its rune.
			continue
		}
		r, _ := utf8.DecodeRune(data[j:])
		if unicode.IsSpace(r) {
			return j, i, allowEmptyKey || j < i
		}
	}
	return from, i, allowEmptyKey || from < i
}
Example #26
// validUTF8IgnoringPartialTrailingRune determines if the buffer contains
// valid UTF8 encoded string data. The buffer is assumed to be a prefix of a
// larger buffer so if the buffer ends with the start of a rune, it is still
// considered valid. Any other invalid byte, including one near the end that
// cannot be the beginning of a longer encoding, makes the buffer invalid.
// Basic logic copied from https://golang.org/pkg/unicode/utf8/#Valid
func validUTF8IgnoringPartialTrailingRune(p []byte) bool {
	i := 0
	n := len(p)

	for i < n {
		if p[i] < utf8.RuneSelf {
			i++
			continue
		}
		_, size := utf8.DecodeRune(p[i:])
		if size == 1 {
			// All valid runes of size 1 (those below RuneSelf) were handled
			// above. This must be a RuneError. It is acceptable only when the
			// remaining bytes are a truncated encoding that more input could
			// complete; utf8.FullRune reports false exactly in that case and
			// true for encodings that are invalid outright.
			return !utf8.FullRune(p[i:])
		}
		i += size
	}
	return true
}
Example #27
// encode takes a string and position in that string and encodes one utf-8
// character. It then returns the encoded string and number of runes in the
// character.
func encode(text []byte, i int) (encodedString string, runeLength int) {
	started := false

	for ; i < len(text) && (!utf8.RuneStart(text[i]) || !started); i++ {
		switch c := text[i]; {
		case c == ' ':
			encodedString += "_"
		case isVchar(c) && c != '=' && c != '?' && c != '_':
			encodedString += string(c)
			encodedString += fmt.Sprintf("=%02X", c)


		started = true

Example #28
// scan matches the longest suffix at the current location in the input
// and returns the number of bytes consumed.
func (s *ctScanner) scan(p int) int {
	pr := p // the p at the rune start
	str := s.s
	states, n := s.states, s.n
	for i := 0; i < n && p < len(str); {
		e := states[i]
		c := str[p]
		// TODO: a significant number of contractions are of a form that
		// cannot match discontiguous UTF-8 in a normalized string. We could let
		// a negative value of e.n mean that we can set s.done = true and avoid
		// the need for additional matches.
		if c >= e.l {
			if e.l == c {
				if e.i != noIndex {
					s.index = int(e.i)
					s.pindex = p
				if e.n != final {
					i, states, n = 0, states[int(e.h)+n:], int(e.n)
					if p >= len(str) || utf8.RuneStart(str[p]) {
						s.states, s.n, pr = states, n, p
				} else {
					s.done = true
					return p
			} else if e.n == final && c <= e.h {
				s.done = true
				s.index = int(c-e.l) + int(e.i)
				s.pindex = p
				return p
	return pr
Example #29
// nextMultiNorm is used for iterating over multi-segment decompositions
// for composing normal forms.
func nextMultiNorm(i *Iter) []byte {
	j := 0
	d := i.multiSeg
	// skip first rune
	for j = 1; j < len(d) && !utf8.RuneStart(d[j]); j++ {
	for j < len(d) {
		info := i.rb.f.info(input{bytes: d}, j)
		if info.ccc == 0 {
			i.multiSeg = d[j:]
			return d[:j]
		j += int(info.size)
	i.multiSeg = nil
	i.next = nextComposed
	i.p++ // restore old valud of i.p. See nextComposed.
	if i.p >= i.rb.nsrc {
	return d
Example #30
// Convert a UTF-8 byte sequence into a ISO 8859 byte sequence. The errors returned
// by this function are either UnicodeError, which means that a partial UTF-8 symbol
// or an illegal UTF-8 sequence was found, i.e. either latinx.ILLEGAL, or latinx.PARTIAL.
// When a UnicodeError is returned, success < len(utf_8), and success indicates how
// many bytes that was successfully converted into UTF-8 bytes.
// If this function returns an UnknownRuneError, it means that the charset of the
// Converter has no mapping for a rune (UTF-8 letter) found in the utf_8 array.
func (c *Converter) Encode(utf_8 []byte) (latin []byte, success int, err error) {

	var ok bool
	var latinByte byte
	var offset, size int
	var rne rune
	var errmsg string
	var buf *bytes.Buffer

	buf = bytes.NewBuffer(make([]byte, len(utf_8)))

	for offset < len(utf_8) {

		rne, size = utf8.DecodeRune(utf_8[offset:])

		if rne == utf8.RuneError {
			if utf8.RuneStart(utf_8[offset]) && len(utf_8)-offset < utf8.UTFMax {
				return buf.Bytes(), offset, PARTIAL // UnicodeError
			} else {
				return buf.Bytes(), offset, ILLEGAL // UnicodeError
		} else if rne < utf8.RuneSelf {
		} else {
			latinByte, ok = c.utf8ToLatin[int(rne)]
			if !ok {
				errmsg = fmt.Sprintf("undefined: 0x%X in %s", rne, c.id)
				err = UnknownRuneError(errmsg)
				return buf.Bytes(), offset, err
			offset += size
	return buf.Bytes(), offset, err