Exemple #1
// TrimSpace returns a subslice of s with all leading and trailing white space
// removed, as defined by Unicode (unicode.IsSpace). The result is s[start:end],
// so it shares storage with s rather than copying.
// NOTE(review): closing braces and the loop-exit statements are not visible in
// this listing; the comments below describe only what the visible lines show.
func TrimSpace(s []byte) []byte {
	start, end := 0, len(s)
	// Forward scan: examine one rune at a time from the front, advancing
	// start by the rune's width in bytes.
	for start < end {
		wid := 1
		rune := int(s[start])
		// Bytes below utf8.RuneSelf are taken as one-byte runes; anything
		// else is decoded to obtain the rune and its width.
		if rune >= utf8.RuneSelf {
			rune, wid = utf8.DecodeRune(s[start:end])
		if !unicode.IsSpace(rune) {
		start += wid
	// Backward scan: examine one rune at a time from the back, moving end
	// down by the rune's width in bytes.
	for start < end {
		wid := 1
		rune := int(s[end-1])
		if rune >= utf8.RuneSelf {
			// Back up carefully looking for beginning of rune. Mustn't pass start.
			for wid = 2; start <= end-wid && !utf8.RuneStart(s[end-wid]); wid++ {
			if start > end-wid { // invalid UTF-8 sequence; stop processing
				return s[start:end]
			rune, wid = utf8.DecodeRune(s[end-wid : end])
		if !unicode.IsSpace(rune) {
		end -= wid
	return s[start:end]
// FieldsFunc interprets s as a sequence of UTF-8-encoded Unicode code points.
// It splits the array s at each run of code points c satisfying f(c) and
// returns a slice of subarrays of s. Code points for which f returns true act
// as separators; each returned element is a subslice s[fieldStart:i] of s.
// NOTE(review): the previous comment said an empty slice is returned when no
// code points in s satisfy f(c). Since a field is a run where f is false, the
// empty result presumably occurs when every code point satisfies f(c) or s is
// empty — confirm against the complete source.
func FieldsFunc(s []byte, f func(int) bool) [][]byte {
	// First pass: walk s rune by rune tracking whether we are inside a field.
	// n is later used as the length of the result slice.
	n := 0
	inField := false
	for i := 0; i < len(s); {
		rune, size := utf8.DecodeRune(s[i:])
		wasInField := inField
		inField = !f(rune)
		// A transition from "not in field" to "in field" marks a field start.
		if inField && !wasInField {
		i += size

	// Second pass: record each field as a subslice of s.
	a := make([][]byte, n)
	na := 0
	// fieldStart is the byte index where the current field began, or -1 when
	// no field is open.
	fieldStart := -1
	// The loop runs to i == len(s) inclusive so that a field ending at the end
	// of s is closed; size == 0 is the signal used for that case below.
	for i := 0; i <= len(s) && na < n; {
		rune, size := utf8.DecodeRune(s[i:])
		if fieldStart < 0 && size > 0 && !f(rune) {
			fieldStart = i
			i += size
		if fieldStart >= 0 && (size == 0 || f(rune)) {
			a[na] = s[fieldStart:i]
			fieldStart = -1
		if size == 0 {
		i += size
	return a[0:na]
Exemple #3
// nextWord scans one token from self.content starting at self.index and
// returns it as a tok built from the token text and its type.
// When self.index is already at or past the end of the content, err is set to
// an os.Error with the text "EOF".
// NOTE(review): closing braces and several short statements (loop exits, the
// switch default, the incode updates) are not visible in this listing.
func (self *scanner) nextWord() (word tok, err os.Error) {
	if self.index >= len(self.content) {
		err = os.NewError("EOF")

	// Leading-whitespace loop: the condition below singles out runes that are
	// not white space, or that are a newline (newline is a token of its own).
	for self.index < len(self.content) {
		r, l := utf8.DecodeRune(self.content[self.index:])
		if !unicode.IsSpace(r) || r == '\n' {
		self.index += l
	// j is the byte offset where the token starts; ttype starts as other;
	// inchar toggles on each single quote; incode is a counter tested against
	// '{' and '}' below.
	j, ttype, inchar, incode := self.index, other, false, 0
	for self.index < len(self.content) {
		r, l := utf8.DecodeRune(self.content[self.index:])
		if r == '\'' {
			inchar = !inchar
		// The first rune of the token (self.index == j) selects the token type.
		if self.index == j {
			switch {
			case unicode.IsUpper(r):
				ttype = nonterm
			case r == '\n':
				ttype = newline
			case r == ':':
				ttype = begindef
			case r == ';':
				ttype = enddef
			case r == '|':
				ttype = alternate
			case r == '{' && memorizeTerms:
				ttype = code
				ttype = term
		} else if incode > 0 && r == '{' {
		} else if incode > 0 && r == '}' {
		// White space is tested only outside code blocks (incode == 0) and
		// outside quoted characters (!inchar).
		if incode == 0 && !inchar && unicode.IsSpace(r) {
		self.index += l
	token := string(self.content[j:self.index])
	// Newline tokens carry no text; only their type is kept.
	if ttype == newline {
		token = ""
	word = tok{token, ttype}
Exemple #4
// EqualFold reports whether s and t, interpreted as UTF-8 strings,
// are equal under Unicode case-folding.
// The slices are consumed rune by rune; the final result compares the
// remaining lengths, so both inputs must be exhausted together.
func EqualFold(s, t []byte) bool {
	for len(s) != 0 && len(t) != 0 {
		// Extract first rune from each.
		// Single bytes below utf8.RuneSelf are taken directly; otherwise the
		// rune is decoded and the slice advanced by its encoded size.
		var sr, tr rune
		if s[0] < utf8.RuneSelf {
			sr, s = rune(s[0]), s[1:]
		} else {
			r, size := utf8.DecodeRune(s)
			sr, s = r, s[size:]
		if t[0] < utf8.RuneSelf {
			tr, t = rune(t[0]), t[1:]
		} else {
			r, size := utf8.DecodeRune(t)
			tr, t = r, t[size:]

		// If they match, keep going; if not, return false.

		// Easy case.
		if tr == sr {

		// Make sr < tr to simplify what follows.
		if tr < sr {
			tr, sr = sr, tr
		// Fast check for ASCII.
		if tr < utf8.RuneSelf && 'A' <= sr && sr <= 'Z' {
			// ASCII, and sr is upper case.  tr must be lower case.
			if tr == sr+'a'-'A' {
			return false

		// General case.  SimpleFold(x) returns the next equivalent rune > x
		// or wraps around to smaller values.
		// Step through sr's fold orbit until it wraps back to sr or reaches
		// a rune that is not below tr.
		r := unicode.SimpleFold(sr)
		for r != sr && r < tr {
			r = unicode.SimpleFold(r)
		if r == tr {
		return false

	// One string is empty.  Are both?
	return len(s) == len(t)
Exemple #5
// hangul returns the rune encoded at s[p:] as a uint32 when isHangul reports
// true for s[p:], and 0 otherwise.
func (s inputBytes) hangul(p int) uint32 {
	if !isHangul(s[p:]) {
		return 0
	// The decoded width is not needed; only the rune value is returned.
	rune, _ := utf8.DecodeRune(s[p:])
	return uint32(rune)
Exemple #6
// insert inserts the given rune in the buffer ordered by CCC.
// It returns true if the buffer was large enough to hold the decomposed rune.
// src holds the encoded rune and info describes it (size and flags are the
// fields read here).
func (rb *reorderBuffer) insert(src []byte, info runeInfo) bool {
	// Three-byte runes that isHangul accepts are handed to decomposeHangul,
	// whose result is returned directly.
	if info.size == 3 && isHangul(src) {
		rune, _ := utf8.DecodeRune(src)
		return rb.decomposeHangul(uint32(rune))
	if info.flags.hasDecomposition() {
		// Insert each rune of the decomposition in turn.
		dcomp := rb.f.decompose(src)
		for i := 0; i < len(dcomp); {
			info = rb.f.info(dcomp[i:])
			// pos is captured before insertOrdered runs; the rune's bytes
			// are copied to rb.byte[pos:] afterwards.
			pos := rb.nbyte
			if !rb.insertOrdered(info) {
				return false
			end := i + int(info.size)
			copy(rb.byte[pos:], dcomp[i:end])
			i = end
	} else {
		// No decomposition: insert the rune's own bytes.
		pos := rb.nbyte
		if !rb.insertOrdered(info) {
			return false
		copy(rb.byte[pos:], src[:info.size])
	return true
Exemple #7
// _peek_char returns the next character available on port, built with
// Make_char from the rune decoded out of the port's lookahead buffer.
// It panics with "bad type" when port is immediate or not an *InputPort, and
// with "bad port type" when the port is binary. It returns Eof when the read
// reports os.EOF, and panics with "I/O read error" on any other read error.
// NOTE(review): the visible code only ever grows v.lookahead_valid, so the
// decoded bytes appear to stay buffered for a later read — confirm.
func _peek_char(port Obj) Obj {
	if is_immediate(port) {
		panic("bad type")
	switch v := (*port).(type) {
	case *InputPort:
		if v.is_binary {
			panic("bad port type")
		// Keep reading until the valid part of the lookahead buffer holds a
		// complete UTF-8 sequence.
		for !utf8.FullRune(v.lookahead[0:v.lookahead_valid]) {
			n, err := io.ReadFull(v.r,
			v.lookahead_valid += n
			switch {
			case err == os.EOF:
				return Eof
			case err != nil:
				panic("I/O read error")
		cp, _ := utf8.DecodeRune(v.lookahead[0:v.lookahead_valid])
		return Make_char(cp)
	panic("bad type")
Exemple #8
// Read the next Unicode char into S.ch.
// S.ch < 0 means end-of-file.
// S.offset is set to the byte offset of the character just read and
// S.rdOffset is advanced past it. S.lineOffset is updated to S.offset when
// the previous character was a newline.
func (S *Scanner) next() {
	if S.rdOffset < len(S.src) {
		S.offset = S.rdOffset
		if S.ch == '\n' {
			S.lineOffset = S.offset
		// Assume a one-byte character; the cases below report NUL and decode
		// multi-byte sequences.
		r, w := int(S.src[S.rdOffset]), 1
		switch {
		case r == 0:
			S.error(S.offset, "illegal character NUL")
		case r >= 0x80:
			// not ASCII
			r, w = utf8.DecodeRune(S.src[S.rdOffset:])
			// RuneError with width 1 is how DecodeRune reports a bad encoding.
			if r == utf8.RuneError && w == 1 {
				S.error(S.offset, "illegal UTF-8 encoding")
		S.rdOffset += w
		S.ch = r
	} else {
		S.offset = len(S.src)
		if S.ch == '\n' {
			S.lineOffset = S.offset
		S.ch = -1 // eof
// Map returns a copy of the byte array s with all its characters modified
// according to the mapping function. If mapping returns a negative value, the character is
// dropped from the string with no replacement.  The characters in s and the
// output are interpreted as UTF-8-encoded Unicode code points.
// The result is a freshly allocated slice, b[0:nbytes].
func Map(mapping func(rune int) int, s []byte) []byte {
	// In the worst case, the array can grow when mapped, making
	// things unpleasant.  But it's so rare we barge in assuming it's
	// fine.  It could also shrink but that falls out naturally.
	maxbytes := len(s) // length of b
	nbytes := 0        // number of bytes encoded in b
	b := make([]byte, maxbytes)
	for i := 0; i < len(s); {
		// Bytes below utf8.RuneSelf are taken as one-byte runes; others are
		// decoded to get the rune and its width.
		wid := 1
		rune := int(s[i])
		if rune >= utf8.RuneSelf {
			rune, wid = utf8.DecodeRune(s[i:])
		rune = mapping(rune)
		if rune >= 0 {
			if nbytes+utf8.RuneLen(rune) > maxbytes {
				// Grow the buffer.
				// Doubling plus utf8.UTFMax guarantees room for at least
				// one more encoded rune even when maxbytes is 0.
				maxbytes = maxbytes*2 + utf8.UTFMax
				nb := make([]byte, maxbytes)
				copy(nb, b[0:nbytes])
				b = nb
			nbytes += utf8.EncodeRune(b[nbytes:maxbytes], rune)
		i += wid
	return b[0:nbytes]
Exemple #10
// deduceDecl inspects the text of file just before byte offset cursor and
// returns a *DeclApropos for it. It returns nil for a negative cursor and an
// empty DeclApropos (nil, "") for cursor == 0 or when the preceding character
// is neither '.' nor an identifier character.
func (c *AutoCompleteContext) deduceDecl(file []byte, cursor int) *DeclApropos {
	// orig remembers the caller's cursor; it is the end of the partial text.
	orig := cursor

	if cursor < 0 {
		return nil
	if cursor == 0 {
		return &DeclApropos{nil, ""}

	// figure out what is just before the cursor
	cursor = utf8MoveBackwards(file, cursor)
	if file[cursor] == '.' {
		// we're '<whatever>.'
		// figure out decl, Partial is ""
		return c.deduceExpr(file[:cursor], "")
	} else {
		letter, _ := utf8.DecodeRune(file[cursor:])
		if isIdent(letter) {
			// we're '<whatever>.<ident>'
			// parse <ident> as Partial and figure out decl
			cursor = skipIdent(file, cursor)
			// After skipIdent, cursor is treated as the index of the byte
			// just before the identifier, hence cursor+1 below.
			partial := string(file[cursor+1 : orig])
			if file[cursor] == '.' {
				return c.deduceExpr(file[:cursor], partial)
			} else {
				return &DeclApropos{nil, partial}

	return &DeclApropos{nil, ""}
Exemple #11
// Read the next Unicode char into S.ch.
// S.ch < 0 means end-of-file.
// S.pos.Offset records the byte offset of the character being read and
// S.offset is advanced past it.
func (S *Scanner) next() {
	if S.offset < len(S.src) {
		S.pos.Offset = S.offset
		if S.ch == '\n' {
			// next character starts a new line
			S.pos.Column = 1
		// Assume a one-byte character; the cases below report NUL and decode
		// multi-byte sequences.
		r, w := int(S.src[S.offset]), 1
		switch {
		case r == 0:
			S.error(S.pos, "illegal character NUL")
		case r >= 0x80:
			// not ASCII
			r, w = utf8.DecodeRune(S.src[S.offset:])
			// RuneError with width 1 is how DecodeRune reports a bad encoding.
			if r == utf8.RuneError && w == 1 {
				S.error(S.pos, "illegal UTF-8 encoding")
		S.offset += w
		S.ch = r
	} else {
		S.pos.Offset = len(S.src)
		S.ch = -1 // eof
Exemple #12
// findExpr scans backwards from the end of file and returns the trailing
// part of file, file[cursor+1:], where cursor is the position at which the
// backward scan stopped. The scan steps over '.', over bracketed groups that
// end in ')' or ']' (via skipToPair), and over identifiers (via skipIdent).
// NOTE(review): the remaining LAST_* constants, the loop label and the switch
// default are not visible in this listing.
func findExpr(file []byte) []byte {
	const (
		LAST_NONE = iota
	// last records what kind of element was consumed most recently.
	last := LAST_NONE
	cursor := len(file)
	cursor = utf8MoveBackwards(file, cursor)
	for {
		c := file[cursor]
		letter, _ := utf8.DecodeRune(file[cursor:])
		switch c {
		case '.':
			cursor = utf8MoveBackwards(file, cursor)
			last = LAST_DOT
		case ')', ']':
			// A closing bracket directly before an identifier ends the scan.
			if last == LAST_IDENT {
				break loop
			cursor = utf8MoveBackwards(file, skipToPair(file, cursor))
			last = LAST_PAREN
			if isIdent(letter) {
				cursor = skipIdent(file, cursor)
				last = LAST_IDENT
			} else {
				break loop
	return file[cursor+1:]
// Replace returns a copy of the slice s with the first n
// non-overlapping instances of old replaced by new.
// If n < 0, there is no limit on the number of replacements.
// Note that when n == 0 or old does not occur in s, s itself is returned
// rather than a copy.
func Replace(s, old, new []byte, n int) []byte {
	if n == 0 {
		return s // avoid allocation
	// Compute number of replacements.
	// m is the number of occurrences; n is clamped to it when n is
	// non-positive or larger than m.
	if m := Count(s, old); m == 0 {
		return s // avoid allocation
	} else if n <= 0 || m < n {
		n = m

	// Apply replacements to buffer.
	// Each replacement changes the length by len(new)-len(old).
	t := make([]byte, len(s)+n*(len(new)-len(old)))
	w := 0
	start := 0
	for i := 0; i < n; i++ {
		// j is the index in s where the next match begins.
		j := start
		if len(old) == 0 {
			// An empty old matches at the start and then after each rune.
			if i > 0 {
				_, wid := utf8.DecodeRune(s[start:])
				j += wid
		} else {
			j += Index(s[start:], old)
		w += copy(t[w:], s[start:j])
		w += copy(t[w:], new)
		start = j + len(old)
	// Copy whatever follows the last replacement.
	w += copy(t[w:], s[start:])
	return t[0:w]
Exemple #14
// ReadRune returns the next UTF-8 encoded code point from the
// io.Reader inside r.
// Bytes are accumulated in r.buf one at a time via r.readByte until
// utf8.FullRune reports a complete sequence. An error on the very first byte
// is returned as (0, 0, err).
// NOTE(review): the returns for the ASCII fast path and for the error branches
// are not visible in this listing.
func (r *readRune) ReadRune() (rune int, size int, err os.Error) {
	r.buf[0], err = r.readByte()
	if err != nil {
		return 0, 0, err
	if r.buf[0] < utf8.RuneSelf { // fast check for common ASCII case
		rune = int(r.buf[0])
	var n int
	for n = 1; !utf8.FullRune(r.buf[0:n]); n++ {
		r.buf[n], err = r.readByte()
		if err != nil {
			// os.EOF in the middle of a sequence is cleared here rather than
			// reported as an error.
			if err == os.EOF {
				err = nil
	rune, size = utf8.DecodeRune(r.buf[0:n])
	if size < n { // an error
// next reads and returns the next Unicode character. It is designed such
// that only a minimal amount of work needs to be done in the common ASCII
// case (one test to check for both ASCII and end-of-buffer, and one test
// to check for newlines).
// The single test works because a sentinel byte equal to utf8.RuneSelf is
// stored at s.srcBuf[s.srcEnd] after each refill, so hitting the end of the
// buffered data takes the same branch as a non-ASCII byte.
// NOTE(review): several short statements (position updates, token-text
// saving, the error call for non-EOF read errors) are not visible here.
func (s *Scanner) next() int {
	ch := int(s.srcBuf[s.srcPos])

	if ch >= utf8.RuneSelf {
		// uncommon case: not ASCII or not enough bytes
		// Refill while fewer than utf8.UTFMax bytes remain and they do not
		// yet form a complete rune.
		for s.srcPos+utf8.UTFMax > s.srcEnd && !utf8.FullRune(s.srcBuf[s.srcPos:s.srcEnd]) {
			// not enough bytes: read some more, but first
			// save away token text if any
			if s.tokPos >= 0 {
				s.tokPos = 0
			// move unread bytes to beginning of buffer
			copy(s.srcBuf[0:], s.srcBuf[s.srcPos:s.srcEnd])
			s.srcBufOffset += s.srcPos
			// read more bytes
			// i is the count of unread bytes now at the front of the buffer.
			i := s.srcEnd - s.srcPos
			n, err := s.src.Read(s.srcBuf[i:bufLen])
			s.srcEnd = i + n
			s.srcPos = 0
			s.srcBuf[s.srcEnd] = utf8.RuneSelf // sentinel
			if err != nil {
				// A read error with nothing buffered is reported as EOF.
				if s.srcEnd == 0 {
					return EOF
				if err != os.EOF {
		// at least one byte
		ch = int(s.srcBuf[s.srcPos])
		if ch >= utf8.RuneSelf {
			// uncommon case: not ASCII
			var width int
			ch, width = utf8.DecodeRune(s.srcBuf[s.srcPos:s.srcEnd])
			// RuneError with width 1 is how DecodeRune reports a bad encoding.
			if ch == utf8.RuneError && width == 1 {
				s.error("illegal UTF-8 encoding")
			// Advance by width-1 here; presumably the remaining byte is
			// accounted for by a common increment not visible in this listing.
			s.srcPos += width - 1

	switch ch {
	case 0:
		// implementation restriction for compatibility with other tools
		s.error("illegal character NUL")
	case '\n':
		s.column = 0

	return ch
Exemple #16
// step returns the rune starting at byte offset pos in i.str together with
// its width in bytes. When pos is at or beyond the end of i.str it returns
// (endOfText, 0).
func (i *inputBytes) step(pos int) (rune, int) {
	if pos < len(i.str) {
		c := i.str[pos]
		// Fast path: a byte below utf8.RuneSelf is a complete one-byte rune.
		if c < utf8.RuneSelf {
			return rune(c), 1
		return utf8.DecodeRune(i.str[pos:])
	return endOfText, 0
Exemple #17
// context returns the syntax.EmptyOp flags for position pos, computed by
// syntax.EmptyOpContext from the rune ending just before pos (r1) and the
// rune starting at pos (r2). A side with no rune available is reported as
// endOfText.
func (i *inputBytes) context(pos int) syntax.EmptyOp {
	r1, r2 := endOfText, endOfText
	if pos > 0 && pos <= len(i.str) {
		r1, _ = utf8.DecodeLastRune(i.str[:pos])
	if pos < len(i.str) {
		r2, _ = utf8.DecodeRune(i.str[pos:])
	return syntax.EmptyOpContext(r1, r2)
// TrimRightFunc returns a subslice of s by slicing off all trailing UTF-8
// encoded Unicode code points c that satisfy f(c).
// The result is s[0:i], so it shares storage with s.
func TrimRightFunc(s []byte, f func(r int) bool) []byte {
	// lastIndexFunc is called with truth == false, i.e. it looks for the last
	// code point that does NOT satisfy f.
	i := lastIndexFunc(s, f, false)
	if i >= 0 && s[i] >= utf8.RuneSelf {
		// i is the start of a multi-byte rune that must be kept: step past
		// its full encoded width.
		_, wid := utf8.DecodeRune(s[i:])
		i += wid
	} else {
	return s[0:i]
// IndexRune interprets s as a sequence of UTF-8-encoded Unicode code points.
// It returns the byte index of the first occurrence in s of the given rune.
// It returns -1 if rune is not present in s.
func IndexRune(s []byte, rune int) int {
	// Walk s one decoded rune at a time; i is always the byte offset of the
	// rune being compared.
	for i := 0; i < len(s); {
		r, size := utf8.DecodeRune(s[i:])
		if r == rune {
			return i
		i += size
	return -1
// Runes returns a slice of runes (Unicode code points) equivalent to s.
// The result is allocated with length utf8.RuneCount(s).
func Runes(s []byte) []int {
	t := make([]int, utf8.RuneCount(s))
	i := 0
	// Consume s from the front, one decoded rune per iteration.
	// NOTE(review): no increment of i is visible in this listing; without one
	// every rune would be written to t[0] — confirm against the full source.
	for len(s) > 0 {
		r, l := utf8.DecodeRune(s)
		t[i] = r
		s = s[l:]
	return t
Exemple #21
// main is a self-checking program: it builds a string from six code points
// (three ASCII, three CJK), decodes it back with utf8.DecodeRuneInString,
// then decodes the same text from a hand-written byte slice with
// utf8.DecodeRune. It panics on a zero width or a mismatched value.
func main() {
	var chars [6]int
	chars[0] = 'a'
	chars[1] = 'b'
	chars[2] = 'c'
	chars[3] = '\u65e5'
	chars[4] = '\u672c'
	chars[5] = '\u8a9e'
	s := ""
	for i := 0; i < 6; i++ {
		s += string(chars[i])
	var l = len(s)
	// Decode from the string form; w is the width of the rune just decoded
	// and i the byte offset. j indexes the expected code point in chars.
	for w, i, j := 0, 0, 0; i < l; i += w {
		var r int
		r, w = utf8.DecodeRuneInString(s[i:len(s)])
		if w == 0 {
			panic("zero width in string")
		if r != chars[j] {
			panic("wrong value from string")
	// encoded as bytes:  'a' 'b' 'c' e6 97 a5 e6 9c ac e8 aa 9e
	// 3 one-byte characters plus 3 three-byte characters gives 12 bytes.
	const L = 12
	if L != l {
		panic("wrong length constructing array")
	a := make([]byte, L)
	a[0] = 'a'
	a[1] = 'b'
	a[2] = 'c'
	a[3] = 0xe6
	a[4] = 0x97
	a[5] = 0xa5
	a[6] = 0xe6
	a[7] = 0x9c
	a[8] = 0xac
	a[9] = 0xe8
	a[10] = 0xaa
	a[11] = 0x9e
	// Decode from the byte-slice form and apply the same checks.
	for w, i, j := 0, 0, 0; i < L; i += w {
		var r int
		r, w = utf8.DecodeRune(a[i:L])
		if w == 0 {
			panic("zero width in bytes")
		if r != chars[j] {
			panic("wrong value from bytes")
Exemple #22
// getChar reads the character at S.input[S.readOffset] and returns it as ch
// together with its width w in bytes. It reports "illegal 0" for a NUL byte
// and "illegal utf" for a malformed multi-byte sequence via S.error.
// No update of S.readOffset is visible in this listing.
func (S *Lexer) getChar() (ch int, w int) {
	// Assume a one-byte character; the cases below refine that.
	ch, w = int(S.input[S.readOffset]), 1
	switch {
	case ch == 0:
		S.error("illegal 0")
	case ch >= 0x80:
		ch, w = utf8.DecodeRune(S.input[S.readOffset:])
		// RuneError with width 1 is how DecodeRune reports a bad encoding.
		if ch == utf8.RuneError && w == 1 {
			S.error("illegal utf")
Exemple #23
// skipIdent walks backwards through file from cursor while the rune at
// cursor is an identifier character (per isIdent). It returns the position of
// the first rune that is not an identifier character, or 0 if the walk
// reaches the start of file.
func skipIdent(file []byte, cursor int) int {
	for {
		letter, _ := utf8.DecodeRune(file[cursor:])
		if !isIdent(letter) {
			return cursor
		cursor = utf8MoveBackwards(file, cursor)
		// Reaching offset 0 ends the walk; the rune at offset 0 itself is not
		// tested once the cursor has landed there.
		if cursor <= 0 {
			return 0
	return 0
Exemple #24
Capitalizes the first character of the value.



If value is "neste", the output will be "Neste".
// CapFirstFormatter upper-cases the first rune of the bytes obtained from
// data via getBytes. The upper-cased rune is encoded into a new buffer capb
// sized with utf8.RuneLen, since its encoded length may differ from that of
// the original rune.
// NOTE(review): the code that writes capb and the remainder of b to w is not
// visible in this listing; the formatter parameter is unused in what is shown.
func CapFirstFormatter(w io.Writer, formatter string, data ...interface{}) {
	b := getBytes(data...)

	// Nothing is decoded for empty input.
	if len(b) > 0 {
		rune, size := utf8.DecodeRune(b)
		rune = unicode.ToUpper(rune)
		capSize := utf8.RuneLen(rune)
		capb := make([]byte, capSize)
		utf8.EncodeRune(capb, rune)
Exemple #25
// pRange parses the next rune of src at the index referred to by i and checks
// whether it is in one of the given ranges. On a match it stores the rune's
// text in *result, advances *i by the rune's encoded size and returns true;
// otherwise it returns false and leaves *result and *i untouched.
// NOTE(review): i is declared as *int but is used directly as a slice index
// on the DecodeRune line (only the later update dereferences it) — this looks
// like it should be *i; confirm. The slice end i+utf8.UTF8Max is also not
// checked against len(src) in the visible code.
func pRange(ranges []unicode.Range, result *string, src []byte, i *int) bool {
	rune, size := utf8.DecodeRune(src[i : i+utf8.UTF8Max])
	if unicode.Is(ranges, rune) {
		// Re-encode the rune into a buffer of exactly its decoded size.
		buf := make([]byte, size)
		utf8.EncodeRune(rune, buf)
		*result = string(buf) // return resulting rune
		*i += size            // Update index
		//src = src[size:len(src)]; // Update slice
		return true
	// No match
	return false
Exemple #26
// Find matches in slice b if b is non-nil, otherwise find matches in string s.
// At most n matches are processed (the loop runs while i < n), and deliver is
// the callback for reporting them.
// NOTE(review): the loop exit on "no match", the increment of i and the call
// to deliver are not visible in this listing.
func (re *Regexp) allMatches(s string, b []byte, n int, deliver func([]int)) {
	// end is the length of whichever input is in use.
	var end int
	if b == nil {
		end = len(s)
	} else {
		end = len(b)

	// pos is the offset at which the next search starts; prevMatchEnd is the
	// end offset of the previous match, or -1 before the first one.
	for pos, i, prevMatchEnd := 0, 0, -1; i < n && pos <= end; {
		var in input
		if b == nil {
			in = newInputString(s)
		} else {
			in = newInputBytes(b)
		matches := re.doExecute(in, pos, re.prog.NumCap)
		if len(matches) == 0 {

		accept := true
		if matches[1] == pos {
			// We've found an empty match.
			if matches[0] == prevMatchEnd {
				// We don't allow an empty match right
				// after a previous match, so ignore it.
				accept = false
			// Step forward by one rune so the search makes progress after a
			// match that ends where it started searching.
			var width int
			// TODO: use step()
			if b == nil {
				_, width = utf8.DecodeRuneInString(s[pos:end])
			} else {
				_, width = utf8.DecodeRune(b[pos:end])
			if width > 0 {
				pos += width
			} else {
				// Zero width means nothing is left to decode; moving pos
				// past end makes the loop condition fail.
				pos = end + 1
		} else {
			pos = matches[1]
		prevMatchEnd = matches[1]

		if accept {
Exemple #27
// Specialized function for TeX-style hyphenation patterns.  Accepts strings of the form '.hy2p'.
// The value it stores is of type vector.IntVector
// Digits in s are collected into the vector v as hyphenation values, and the
// pattern with its digits removed is added to the trie with p.addRunes; v is
// stored as the value of the resulting leaf.
// NOTE(review): the pushes of the implied zeros, the continue after a digit
// and the handling of a nil leaf are not visible in this listing.
func (p *Trie) AddPatternString(s string) {
	v := new(vector.IntVector)

	// precompute the Unicode rune for the character '0'
	rune0, _ := utf8.DecodeRune([]byte{'0'})

	strLen := len(s)

	// Using the range keyword will give us each Unicode rune.
	for pos, rune := range s {
		if unicode.IsDigit(rune) {
			if pos == 0 {
				// This is a prefix number
				v.Push(rune - rune0)

			// this is a number referring to the previous character, and has
			// already been handled

		if pos < strLen-1 {
			// look ahead to see if it's followed by a number
			// Note that the lookahead reads the single byte s[pos+1], not a
			// decoded rune.
			next := int(s[pos+1])
			if unicode.IsDigit(next) {
				// next char is the hyphenation value for this char
				v.Push(next - rune0)
			} else {
				// hyphenation for this char is an implied zero
		} else {
			// last character gets an implied zero

	// pure is s with every digit dropped (the mapping returns -1 for digits).
	pure := strings.Map(func(rune int) int {
		if unicode.IsDigit(rune) {
			return -1
		return rune
	leaf := p.addRunes(strings.NewReader(pure))
	if leaf == nil {

	leaf.value = v
Exemple #28
// ReadRune reads a single UTF-8 encoded Unicode character and returns the
// rune and its size in bytes.
// If no bytes are buffered (b.r == b.w) it returns (0, 0, b.err). The last
// byte consumed is recorded in b.lastbyte.
func (b *Reader) ReadRune() (rune int, size int, err os.Error) {
	// Loop while fewer than utf8.UTFMax bytes are buffered, they do not yet
	// form a full rune, and no error has been recorded. The loop body (the
	// refill) is not visible in this listing.
	for b.r+utf8.UTFMax > b.w && !utf8.FullRune(b.buf[b.r:b.w]) && b.err == nil {
	if b.r == b.w {
		return 0, 0, b.err
	// Assume a one-byte rune; decode only for bytes >= 0x80.
	rune, size = int(b.buf[b.r]), 1
	if rune >= 0x80 {
		rune, size = utf8.DecodeRune(b.buf[b.r:b.w])
	b.r += size
	b.lastbyte = int(b.buf[b.r-1])
	return rune, size, nil
// indexFunc is the same as IndexFunc except that if
// truth==false, the sense of the predicate function is
// inverted.
// It returns the byte index of the first code point r in s for which
// f(r) == truth, or -1 if there is none.
func indexFunc(s []byte, f func(r int) bool, truth bool) int {
	start := 0
	for start < len(s) {
		// Bytes below utf8.RuneSelf are taken as one-byte runes; others are
		// decoded to get the rune and its width.
		wid := 1
		rune := int(s[start])
		if rune >= utf8.RuneSelf {
			rune, wid = utf8.DecodeRune(s[start:])
		if f(rune) == truth {
			return start
		start += wid
	return -1
Exemple #30
// ReadRune reads and returns the next UTF-8-encoded
// Unicode code point from the buffer.
// If no bytes are available, the error returned is os.EOF.
// If the bytes are an erroneous UTF-8 encoding, it
// consumes one byte and returns U+FFFD, 1.
func (b *Buffer) ReadRune() (r int, size int, err os.Error) {
	if b.off >= len(b.buf) {
		// Buffer is empty, reset to recover space.
		return 0, 0, os.EOF
	c := b.buf[b.off]
	// Fast path for a byte below utf8.RuneSelf.
	// NOTE(review): no advance of b.off is visible on this path in this
	// listing — confirm against the full source.
	if c < utf8.RuneSelf {
		return int(c), 1, nil
	// Multi-byte (or invalid) sequence: decode it and advance the read offset
	// by the number of bytes consumed.
	r, n := utf8.DecodeRune(b.buf[b.off:])
	b.off += n
	return r, n, nil