// Trim returns a slice of the string s, with all leading and trailing white space // removed, as defined by Unicode. func TrimSpace(s []byte) []byte { start, end := 0, len(s) for start < end { wid := 1 rune := int(s[start]) if rune >= utf8.RuneSelf { rune, wid = utf8.DecodeRune(s[start:end]) } if !unicode.IsSpace(rune) { break } start += wid } for start < end { wid := 1 rune := int(s[end-1]) if rune >= utf8.RuneSelf { // Back up carefully looking for beginning of rune. Mustn't pass start. for wid = 2; start <= end-wid && !utf8.RuneStart(s[end-wid]); wid++ { } if start > end-wid { // invalid UTF-8 sequence; stop processing return s[start:end] } rune, wid = utf8.DecodeRune(s[end-wid : end]) } if !unicode.IsSpace(rune) { break } end -= wid } return s[start:end] }
// FieldsFunc interprets s as a sequence of UTF-8-encoded Unicode code points. // It splits the array s at each run of code points c satisfying f(c) and // returns a slice of subarrays of s. If no code points in s satisfy f(c), an // empty slice is returned. func FieldsFunc(s []byte, f func(int) bool) [][]byte { n := 0 inField := false for i := 0; i < len(s); { rune, size := utf8.DecodeRune(s[i:]) wasInField := inField inField = !f(rune) if inField && !wasInField { n++ } i += size } a := make([][]byte, n) na := 0 fieldStart := -1 for i := 0; i <= len(s) && na < n; { rune, size := utf8.DecodeRune(s[i:]) if fieldStart < 0 && size > 0 && !f(rune) { fieldStart = i i += size continue } if fieldStart >= 0 && (size == 0 || f(rune)) { a[na] = s[fieldStart:i] na++ fieldStart = -1 } if size == 0 { break } i += size } return a[0:na] }
func (self *scanner) nextWord() (word tok, err os.Error) { if self.index >= len(self.content) { err = os.NewError("EOF") return } for self.index < len(self.content) { r, l := utf8.DecodeRune(self.content[self.index:]) if !unicode.IsSpace(r) || r == '\n' { break } self.index += l } j, ttype, inchar, incode := self.index, other, false, 0 for self.index < len(self.content) { r, l := utf8.DecodeRune(self.content[self.index:]) if r == '\'' { inchar = !inchar } if self.index == j { switch { case unicode.IsUpper(r): ttype = nonterm case r == '\n': self.index++ ttype = newline break case r == ':': ttype = begindef case r == ';': ttype = enddef case r == '|': ttype = alternate case r == '{' && memorizeTerms: incode++ ttype = code default: ttype = term } } else if incode > 0 && r == '{' { incode++ } else if incode > 0 && r == '}' { incode-- } if incode == 0 && !inchar && unicode.IsSpace(r) { break } self.index += l } token := string(self.content[j:self.index]) if ttype == newline { token = "" } word = tok{token, ttype} return }
// EqualFold reports whether s and t, interpreted as UTF-8 strings,
// are equal under Unicode case-folding.
func EqualFold(s, t []byte) bool {
	for len(s) > 0 && len(t) > 0 {
		// Pull the leading rune off each slice, with a fast path for ASCII.
		var a, b rune
		if c := s[0]; c < utf8.RuneSelf {
			a, s = rune(c), s[1:]
		} else {
			r, n := utf8.DecodeRune(s)
			a, s = r, s[n:]
		}
		if c := t[0]; c < utf8.RuneSelf {
			b, t = rune(c), t[1:]
		} else {
			r, n := utf8.DecodeRune(t)
			b, t = r, t[n:]
		}

		if a == b {
			continue
		}

		// Order the pair so that lo < hi.
		lo, hi := a, b
		if hi < lo {
			lo, hi = hi, lo
		}

		// Both ASCII with lo upper case: hi must be its lower-case form.
		if hi < utf8.RuneSelf && 'A' <= lo && lo <= 'Z' {
			if hi != lo+'a'-'A' {
				return false
			}
			continue
		}

		// General case: walk lo's fold orbit. SimpleFold yields the next
		// larger equivalent rune, or wraps around to the smallest one.
		f := unicode.SimpleFold(lo)
		for f != lo && f < hi {
			f = unicode.SimpleFold(f)
		}
		if f != hi {
			return false
		}
	}

	// At least one input is exhausted; they are equal only if both are.
	return len(s) == len(t)
}
func (s inputBytes) hangul(p int) uint32 { if !isHangul(s[p:]) { return 0 } rune, _ := utf8.DecodeRune(s[p:]) return uint32(rune) }
// insert inserts the given rune in the buffer ordered by CCC. // It returns true if the buffer was large enough to hold the decomposed rune. func (rb *reorderBuffer) insert(src []byte, info runeInfo) bool { if info.size == 3 && isHangul(src) { rune, _ := utf8.DecodeRune(src) return rb.decomposeHangul(uint32(rune)) } if info.flags.hasDecomposition() { dcomp := rb.f.decompose(src) for i := 0; i < len(dcomp); { info = rb.f.info(dcomp[i:]) pos := rb.nbyte if !rb.insertOrdered(info) { return false } end := i + int(info.size) copy(rb.byte[pos:], dcomp[i:end]) i = end } } else { pos := rb.nbyte if !rb.insertOrdered(info) { return false } copy(rb.byte[pos:], src[:info.size]) } return true }
func _peek_char(port Obj) Obj { if is_immediate(port) { panic("bad type") } switch v := (*port).(type) { case *InputPort: if v.is_binary { panic("bad port type") } for !utf8.FullRune(v.lookahead[0:v.lookahead_valid]) { n, err := io.ReadFull(v.r, v.lookahead[v.lookahead_valid:v.lookahead_valid+1]) v.lookahead_valid += n switch { case err == os.EOF: return Eof case err != nil: panic("I/O read error") } } cp, _ := utf8.DecodeRune(v.lookahead[0:v.lookahead_valid]) return Make_char(cp) } panic("bad type") }
// Read the next Unicode char into S.ch. // S.ch < 0 means end-of-file. // func (S *Scanner) next() { if S.rdOffset < len(S.src) { S.offset = S.rdOffset if S.ch == '\n' { S.lineOffset = S.offset S.file.AddLine(S.offset) } r, w := int(S.src[S.rdOffset]), 1 switch { case r == 0: S.error(S.offset, "illegal character NUL") case r >= 0x80: // not ASCII r, w = utf8.DecodeRune(S.src[S.rdOffset:]) if r == utf8.RuneError && w == 1 { S.error(S.offset, "illegal UTF-8 encoding") } } S.rdOffset += w S.ch = r } else { S.offset = len(S.src) if S.ch == '\n' { S.lineOffset = S.offset S.file.AddLine(S.offset) } S.ch = -1 // eof } }
// Map returns a copy of the byte array s with all its characters modified // according to the mapping function. If mapping returns a negative value, the character is // dropped from the string with no replacement. The characters in s and the // output are interpreted as UTF-8-encoded Unicode code points. func Map(mapping func(rune int) int, s []byte) []byte { // In the worst case, the array can grow when mapped, making // things unpleasant. But it's so rare we barge in assuming it's // fine. It could also shrink but that falls out naturally. maxbytes := len(s) // length of b nbytes := 0 // number of bytes encoded in b b := make([]byte, maxbytes) for i := 0; i < len(s); { wid := 1 rune := int(s[i]) if rune >= utf8.RuneSelf { rune, wid = utf8.DecodeRune(s[i:]) } rune = mapping(rune) if rune >= 0 { if nbytes+utf8.RuneLen(rune) > maxbytes { // Grow the buffer. maxbytes = maxbytes*2 + utf8.UTFMax nb := make([]byte, maxbytes) copy(nb, b[0:nbytes]) b = nb } nbytes += utf8.EncodeRune(b[nbytes:maxbytes], rune) } i += wid } return b[0:nbytes] }
func (c *AutoCompleteContext) deduceDecl(file []byte, cursor int) *DeclApropos { orig := cursor if cursor < 0 { return nil } if cursor == 0 { return &DeclApropos{nil, ""} } // figure out what is just before the cursor cursor = utf8MoveBackwards(file, cursor) if file[cursor] == '.' { // we're '<whatever>.' // figure out decl, Parital is "" return c.deduceExpr(file[:cursor], "") } else { letter, _ := utf8.DecodeRune(file[cursor:]) if isIdent(letter) { // we're '<whatever>.<ident>' // parse <ident> as Partial and figure out decl cursor = skipIdent(file, cursor) partial := string(file[cursor+1 : orig]) if file[cursor] == '.' { return c.deduceExpr(file[:cursor], partial) } else { return &DeclApropos{nil, partial} } } } return &DeclApropos{nil, ""} }
// Read the next Unicode char into S.ch. // S.ch < 0 means end-of-file. // func (S *Scanner) next() { if S.offset < len(S.src) { S.pos.Offset = S.offset S.pos.Column++ if S.ch == '\n' { // next character starts a new line S.pos.Line++ S.pos.Column = 1 } r, w := int(S.src[S.offset]), 1 switch { case r == 0: S.error(S.pos, "illegal character NUL") case r >= 0x80: // not ASCII r, w = utf8.DecodeRune(S.src[S.offset:]) if r == utf8.RuneError && w == 1 { S.error(S.pos, "illegal UTF-8 encoding") } } S.offset += w S.ch = r } else { S.pos.Offset = len(S.src) S.ch = -1 // eof } }
func findExpr(file []byte) []byte { const ( LAST_NONE = iota LAST_DOT LAST_PAREN LAST_IDENT ) last := LAST_NONE cursor := len(file) cursor = utf8MoveBackwards(file, cursor) loop: for { c := file[cursor] letter, _ := utf8.DecodeRune(file[cursor:]) switch c { case '.': cursor = utf8MoveBackwards(file, cursor) last = LAST_DOT case ')', ']': if last == LAST_IDENT { break loop } cursor = utf8MoveBackwards(file, skipToPair(file, cursor)) last = LAST_PAREN default: if isIdent(letter) { cursor = skipIdent(file, cursor) last = LAST_IDENT } else { break loop } } } return file[cursor+1:] }
// Replace returns a copy of the slice s with the first n // non-overlapping instances of old replaced by new. // If n < 0, there is no limit on the number of replacements. func Replace(s, old, new []byte, n int) []byte { if n == 0 { return s // avoid allocation } // Compute number of replacements. if m := Count(s, old); m == 0 { return s // avoid allocation } else if n <= 0 || m < n { n = m } // Apply replacements to buffer. t := make([]byte, len(s)+n*(len(new)-len(old))) w := 0 start := 0 for i := 0; i < n; i++ { j := start if len(old) == 0 { if i > 0 { _, wid := utf8.DecodeRune(s[start:]) j += wid } } else { j += Index(s[start:], old) } w += copy(t[w:], s[start:j]) w += copy(t[w:], new) start = j + len(old) } w += copy(t[w:], s[start:]) return t[0:w] }
// ReadRune returns the next UTF-8 encoded code point from the // io.Reader inside r. func (r *readRune) ReadRune() (rune int, size int, err os.Error) { r.buf[0], err = r.readByte() if err != nil { return 0, 0, err } if r.buf[0] < utf8.RuneSelf { // fast check for common ASCII case rune = int(r.buf[0]) return } var n int for n = 1; !utf8.FullRune(r.buf[0:n]); n++ { r.buf[n], err = r.readByte() if err != nil { if err == os.EOF { err = nil break } return } } rune, size = utf8.DecodeRune(r.buf[0:n]) if size < n { // an error r.unread(r.buf[size:n]) } return }
// next reads and returns the next Unicode character. It is designed such
// that only a minimal amount of work needs to be done in the common ASCII
// case (one test to check for both ASCII and end-of-buffer, and one test
// to check for newlines).
//
// The byte at srcBuf[srcEnd] is kept equal to utf8.RuneSelf as a sentinel,
// so running off the end of the buffered data takes the same slow path as a
// non-ASCII byte. next returns EOF when a read reports an error and no
// unread bytes remain. Read errors other than os.EOF, invalid UTF-8
// encodings and NUL characters are reported through s.error.
func (s *Scanner) next() int {
	ch := int(s.srcBuf[s.srcPos])
	if ch >= utf8.RuneSelf {
		// uncommon case: not ASCII or not enough bytes
		for s.srcPos+utf8.UTFMax > s.srcEnd && !utf8.FullRune(s.srcBuf[s.srcPos:s.srcEnd]) {
			// not enough bytes: read some more, but first
			// save away token text if any
			if s.tokPos >= 0 {
				s.tokBuf.Write(s.srcBuf[s.tokPos:s.srcPos])
				// the rest of the token will start at the front of the
				// buffer once the unread bytes have been moved there
				s.tokPos = 0
			}
			// move unread bytes to beginning of buffer
			copy(s.srcBuf[0:], s.srcBuf[s.srcPos:s.srcEnd])
			s.srcBufOffset += s.srcPos
			// read more bytes
			i := s.srcEnd - s.srcPos // number of unread bytes kept
			n, err := s.src.Read(s.srcBuf[i:bufLen])
			s.srcEnd = i + n
			s.srcPos = 0
			s.srcBuf[s.srcEnd] = utf8.RuneSelf // sentinel
			if err != nil {
				if s.srcEnd == 0 {
					// nothing buffered at all: end of input
					return EOF
				}
				if err != os.EOF {
					s.error(err.String())
					break
				}
				// os.EOF with bytes still buffered: the loop condition is
				// evaluated again with what is left
			}
		}
		// at least one byte
		ch = int(s.srcBuf[s.srcPos])
		if ch >= utf8.RuneSelf {
			// uncommon case: not ASCII
			var width int
			ch, width = utf8.DecodeRune(s.srcBuf[s.srcPos:s.srcEnd])
			if ch == utf8.RuneError && width == 1 {
				s.error("illegal UTF-8 encoding")
			}
			// together with the srcPos++ below this advances by width
			s.srcPos += width - 1
		}
	}
	s.srcPos++
	s.column++
	switch ch {
	case 0:
		// implementation restriction for compatibility with other tools
		s.error("illegal character NUL")
	case '\n':
		s.line++
		s.column = 0
	}
	return ch
}
func (i *inputBytes) step(pos int) (rune, int) { if pos < len(i.str) { c := i.str[pos] if c < utf8.RuneSelf { return rune(c), 1 } return utf8.DecodeRune(i.str[pos:]) } return endOfText, 0 }
func (i *inputBytes) context(pos int) syntax.EmptyOp { r1, r2 := endOfText, endOfText if pos > 0 && pos <= len(i.str) { r1, _ = utf8.DecodeLastRune(i.str[:pos]) } if pos < len(i.str) { r2, _ = utf8.DecodeRune(i.str[pos:]) } return syntax.EmptyOpContext(r1, r2) }
// TrimRightFunc returns a subslice of s by slicing off all trailing UTF-8 // encoded Unicode code points c that satisfy f(c). func TrimRightFunc(s []byte, f func(r int) bool) []byte { i := lastIndexFunc(s, f, false) if i >= 0 && s[i] >= utf8.RuneSelf { _, wid := utf8.DecodeRune(s[i:]) i += wid } else { i++ } return s[0:i] }
// IndexRune interprets s as a sequence of UTF-8-encoded Unicode code points. // It returns the byte index of the first occurrence in s of the given rune. // It returns -1 if rune is not present in s. func IndexRune(s []byte, rune int) int { for i := 0; i < len(s); { r, size := utf8.DecodeRune(s[i:]) if r == rune { return i } i += size } return -1 }
// Runes returns a slice of runes (Unicode code points) equivalent to s. func Runes(s []byte) []int { t := make([]int, utf8.RuneCount(s)) i := 0 for len(s) > 0 { r, l := utf8.DecodeRune(s) t[i] = r i++ s = s[l:] } return t }
func main() { var chars [6]int chars[0] = 'a' chars[1] = 'b' chars[2] = 'c' chars[3] = '\u65e5' chars[4] = '\u672c' chars[5] = '\u8a9e' s := "" for i := 0; i < 6; i++ { s += string(chars[i]) } var l = len(s) for w, i, j := 0, 0, 0; i < l; i += w { var r int r, w = utf8.DecodeRuneInString(s[i:len(s)]) if w == 0 { panic("zero width in string") } if r != chars[j] { panic("wrong value from string") } j++ } // encoded as bytes: 'a' 'b' 'c' e6 97 a5 e6 9c ac e8 aa 9e const L = 12 if L != l { panic("wrong length constructing array") } a := make([]byte, L) a[0] = 'a' a[1] = 'b' a[2] = 'c' a[3] = 0xe6 a[4] = 0x97 a[5] = 0xa5 a[6] = 0xe6 a[7] = 0x9c a[8] = 0xac a[9] = 0xe8 a[10] = 0xaa a[11] = 0x9e for w, i, j := 0, 0, 0; i < L; i += w { var r int r, w = utf8.DecodeRune(a[i:L]) if w == 0 { panic("zero width in bytes") } if r != chars[j] { panic("wrong value from bytes") } j++ } }
func (S *Lexer) getChar() (ch int, w int) { ch, w = int(S.input[S.readOffset]), 1 switch { case ch == 0: S.error("illegal 0") case ch >= 0x80: ch, w = utf8.DecodeRune(S.input[S.readOffset:]) if ch == utf8.RuneError && w == 1 { S.error("illegal utf") } } return }
func skipIdent(file []byte, cursor int) int { for { letter, _ := utf8.DecodeRune(file[cursor:]) if !isIdent(letter) { return cursor } cursor = utf8MoveBackwards(file, cursor) if cursor <= 0 { return 0 } } return 0 }
/* Capitalizes the first character of the value. Example: {value|capfirst} If value is "neste", the output will be "Neste". */ func CapFirstFormatter(w io.Writer, formatter string, data ...interface{}) { b := getBytes(data...) if len(b) > 0 { rune, size := utf8.DecodeRune(b) rune = unicode.ToUpper(rune) capSize := utf8.RuneLen(rune) capb := make([]byte, capSize) utf8.EncodeRune(capb, rune) w.Write(capb) w.Write(b[size:]) } }
// Parses the next rune and checks to see if its in a given range func pRange(ranges []unicode.Range, result *string, src []byte, i *int) bool { rune, size := utf8.DecodeRune(src[i : i+utf8.UTF8Max]) if unicode.Is(ranges, rune) { buf := make([]byte, size) utf8.EncodeRune(rune, buf) *result = string(buf) // return resulting rune *i += size // Update index //src = src[size:len(src)]; // Update slice return true } // No match return false }
// Find matches in slice b if b is non-nil, otherwise find matches in string s. func (re *Regexp) allMatches(s string, b []byte, n int, deliver func([]int)) { var end int if b == nil { end = len(s) } else { end = len(b) } for pos, i, prevMatchEnd := 0, 0, -1; i < n && pos <= end; { var in input if b == nil { in = newInputString(s) } else { in = newInputBytes(b) } matches := re.doExecute(in, pos, re.prog.NumCap) if len(matches) == 0 { break } accept := true if matches[1] == pos { // We've found an empty match. if matches[0] == prevMatchEnd { // We don't allow an empty match right // after a previous match, so ignore it. accept = false } var width int // TODO: use step() if b == nil { _, width = utf8.DecodeRuneInString(s[pos:end]) } else { _, width = utf8.DecodeRune(b[pos:end]) } if width > 0 { pos += width } else { pos = end + 1 } } else { pos = matches[1] } prevMatchEnd = matches[1] if accept { deliver(re.pad(matches)) i++ } } }
// Specialized function for TeX-style hyphenation patterns. Accepts strings of the form '.hy2p'. // The value it stores is of type vector.IntVector func (p *Trie) AddPatternString(s string) { v := new(vector.IntVector) // precompute the Unicode rune for the character '0' rune0, _ := utf8.DecodeRune([]byte{'0'}) strLen := len(s) // Using the range keyword will give us each Unicode rune. for pos, rune := range s { if unicode.IsDigit(rune) { if pos == 0 { // This is a prefix number v.Push(rune - rune0) } // this is a number referring to the previous character, and has // already been handled continue } if pos < strLen-1 { // look ahead to see if it's followed by a number next := int(s[pos+1]) if unicode.IsDigit(next) { // next char is the hyphenation value for this char v.Push(next - rune0) } else { // hyphenation for this char is an implied zero v.Push(0) } } else { // last character gets an implied zero v.Push(0) } } pure := strings.Map(func(rune int) int { if unicode.IsDigit(rune) { return -1 } return rune }, s) leaf := p.addRunes(strings.NewReader(pure)) if leaf == nil { return } leaf.value = v }
// ReadRune reads a single UTF-8 encoded Unicode character and returns the // rune and its size in bytes. func (b *Reader) ReadRune() (rune int, size int, err os.Error) { for b.r+utf8.UTFMax > b.w && !utf8.FullRune(b.buf[b.r:b.w]) && b.err == nil { b.fill() } if b.r == b.w { return 0, 0, b.err } rune, size = int(b.buf[b.r]), 1 if rune >= 0x80 { rune, size = utf8.DecodeRune(b.buf[b.r:b.w]) } b.r += size b.lastbyte = int(b.buf[b.r-1]) return rune, size, nil }
// indexFunc is the same as IndexFunc except that if // truth==false, the sense of the predicate function is // inverted. func indexFunc(s []byte, f func(r int) bool, truth bool) int { start := 0 for start < len(s) { wid := 1 rune := int(s[start]) if rune >= utf8.RuneSelf { rune, wid = utf8.DecodeRune(s[start:]) } if f(rune) == truth { return start } start += wid } return -1 }
// ReadRune reads and returns the next UTF-8-encoded // Unicode code point from the buffer. // If no bytes are available, the error returned is os.EOF. // If the bytes are an erroneous UTF-8 encoding, it // consumes one byte and returns U+FFFD, 1. func (b *Buffer) ReadRune() (r int, size int, err os.Error) { if b.off >= len(b.buf) { // Buffer is empty, reset to recover space. b.Truncate(0) return 0, 0, os.EOF } c := b.buf[b.off] if c < utf8.RuneSelf { b.off++ return int(c), 1, nil } r, n := utf8.DecodeRune(b.buf[b.off:]) b.off += n return r, n, nil }