// FieldsFunc interprets s as a sequence of UTF-8-encoded Unicode code points.
// It splits the array s at each run of code points c satisfying f(c) and
// returns a slice of subarrays of s. If no code points in s satisfy f(c), an
// empty slice is returned.
func FieldsFunc(s []byte, f func(rune) bool) [][]byte {
	// First pass: count the fields so the result can be allocated exactly once.
	count := 0
	inside := false
	for off := 0; off < len(s); {
		r, w := utf8.DecodeRune(s[off:])
		was := inside
		inside = !f(r)
		if inside && !was {
			count++
		}
		off += w
	}

	// Second pass: slice out each field. A decode width of zero marks the
	// end of the input (DecodeRune on an empty slice).
	fields := make([][]byte, count)
	nf := 0
	begin := -1 // byte offset of the current field, or -1 when between fields
	for off := 0; off <= len(s) && nf < count; {
		r, w := utf8.DecodeRune(s[off:])
		if begin < 0 && w > 0 && !f(r) {
			begin = off
			off += w
			continue
		}
		if begin >= 0 && (w == 0 || f(r)) {
			fields[nf] = s[begin:off]
			nf++
			begin = -1
		}
		if w == 0 {
			break
		}
		off += w
	}
	return fields[0:nf]
}
func (t *Trie) Add(name string, data []int) { nm := []byte(name) if t.head == nil { t.init() } cur := t.head i := 0 for i < len(nm) { r, size := utf8.DecodeRune(nm[i:]) if _, ok := cur.children[r]; ok { cur = cur.children[r] i += size } else { break } } for i < len(nm) { r, size := utf8.DecodeRune(nm[i:]) if i+size <= len(nm) { if _, ok := cur.children[r]; !ok && cur.name != name { cur.children[r] = &node{name[:i+size], nil, make(map[rune]*node)} cur = cur.children[r] i += size } } } cur.val = data }
// scanStmts is a bufio.SplitFunc that returns semicolon-terminated
// statements with leading and trailing whitespace removed. The semicolon
// is consumed but not included in the token.
func scanStmts(data []byte, atEOF bool) (advance int, token []byte, err error) {
	// Skip leading spaces.
	start := 0
	for width := 0; start < len(data); start += width {
		var r rune
		r, width = utf8.DecodeRune(data[start:])
		if !unicode.IsSpace(r) {
			break
		}
	}
	if atEOF && len(data) == 0 {
		return 0, nil, nil
	}
	// end tracks one past the last non-space byte, so trailing whitespace
	// can be trimmed from an unterminated final statement.
	end := start
	// Scan until semicolon, marking end of statement.
	for width, i := 0, start; i < len(data); i += width {
		var r rune
		r, width = utf8.DecodeRune(data[i:])
		if r == ';' {
			return i + width, data[start:i], nil
		} else if !unicode.IsSpace(r) {
			// BUG FIX: advance end by the rune's full width, not by one
			// byte — the old `end = i + 1` chopped a trailing multi-byte
			// rune in half.
			end = i + width
		}
	}
	// If we're at EOF, we have a final, non-empty, non-terminated statement. Return it.
	if atEOF && len(data) > start {
		return len(data), data[start:end], nil
	}
	// Request more data.
	return 0, nil, nil
}
// bufio.Scanner split function that yields space-separated words, treating
// double-quoted spans as single tokens; the quotes themselves remain part
// of the returned token.
func scanStrings(data []byte, atEOF bool) (advance int, token []byte, err error) {
	// Step over any leading whitespace.
	start := 0
	for start < len(data) {
		r, w := utf8.DecodeRune(data[start:])
		if !unicode.IsSpace(r) {
			break
		}
		start += w
	}
	if atEOF && len(data) == 0 {
		return 0, nil, nil
	}
	// Advance until whitespace outside of quotes ends the token.
	quoted := false
	for i := start; i < len(data); {
		r, w := utf8.DecodeRune(data[i:])
		switch {
		case r == '"':
			quoted = !quoted
		case unicode.IsSpace(r) && !quoted:
			return i + w, data[start:i], nil
		}
		i += w
	}
	// At EOF with a pending, non-empty word (possibly unterminated): emit it.
	if atEOF && len(data) > start {
		return len(data), data[start:], nil
	}
	// Need more input.
	return 0, nil, nil
}
// scanWordsKeepPrefix is a split function for a Scanner that returns each // space-separated word of text, with prefixing spaces included. It will never // return an empty string. The definition of space is set by unicode.IsSpace. // // Adapted from bufio.ScanWords(). func scanTokensKeepPrefix(data []byte, atEOF bool) (advance int, token []byte, err error) { // Skip leading spaces. start := 0 for width := 0; start < len(data); start += width { var r rune r, width = utf8.DecodeRune(data[start:]) if !unicode.IsSpace(r) { break } } if atEOF && len(data) == 0 || start == len(data) { return len(data), data, nil } if len(data) > start && data[start] == '#' { return scanLinesKeepPrefix(data, atEOF) } // Scan until space, marking end of word. for width, i := 0, start; i < len(data); i += width { var r rune r, width = utf8.DecodeRune(data[i:]) if unicode.IsSpace(r) { return i, data[:i], nil } } // If we're at EOF, we have a final, non-empty, non-terminated word. Return it. if atEOF && len(data) > start { return len(data), data, nil } // Request more data. return 0, nil, nil }
// scanWords is a split function for a Scanner that returns each
// space-separated word of text, with surrounding spaces deleted. A word
// whose first rune is a double quote is returned as the quoted span with
// both quotes stripped. It will never return an empty string. The
// definition of space is set by unicode.IsSpace.
func scanWords(data []byte, atEOF bool) (advance int, token []byte, err error) {
	// Skip leading spaces.
	start := 0
	for width := 0; start < len(data); start += width {
		var r rune
		r, width = utf8.DecodeRune(data[start:])
		if !unicode.IsSpace(r) {
			break
		}
	}
	quote := false
	// Scan until space, marking end of word.
	for width, i := 0, start; i < len(data); i += width {
		var r rune
		r, width = utf8.DecodeRune(data[i:])
		switch {
		// BUG FIX: the opening quote sits at the first rune of the word,
		// i.e. index start — not index 0, since leading spaces may have
		// been skipped. The old `i == 0` missed quoted words after spaces.
		case i == start && r == '"':
			quote = true
		case !quote && unicode.IsSpace(r):
			return i + width, data[start:i], nil
		case quote && r == '"':
			// Consume the closing quote; exclude both quotes from the token.
			return i + width, data[start+width : i], nil
		}
	}
	// If we're at EOF, we have a final, non-empty, non-terminated word. Return it.
	if atEOF && len(data) > start {
		return len(data), data[start:], nil
	}
	// Request more data (consuming the leading spaces).
	return start, nil, nil
}
func iter_words(data []byte, cb func(word []byte)) { for { if len(data) == 0 { return } r, rlen := utf8.DecodeRune(data) // skip non-word runes for !is_word(r) { data = data[rlen:] if len(data) == 0 { return } r, rlen = utf8.DecodeRune(data) } // must be on a word rune i := 0 for is_word(r) && i < len(data) { i += rlen r, rlen = utf8.DecodeRune(data[i:]) } cb(data[:i]) data = data[i:] } }
// EscapeNormalString expands backslash escape sequences (\n, \t, \f, \r,
// \v) in the input string; any other escaped rune is emitted literally
// (so `\\` yields a single backslash). Returns the unescaped result.
//
// Usage: pat = EscapeNormalString(pat)
func EscapeNormalString(in string) (rv string) {
	// Accumulate into a byte slice: append is amortized O(1), unlike the
	// previous repeated string concatenation, which was quadratic.
	out := make([]byte, 0, len(in))
	var c rune
	var sz int
	for i := 0; i < len(in); i += sz {
		// DecodeRuneInString avoids the per-iteration []byte(in[i:]) copy
		// the previous version allocated for every rune.
		c, sz = utf8.DecodeRuneInString(in[i:])
		if c == '\\' {
			i += sz
			c, sz = utf8.DecodeRuneInString(in[i:])
			switch c {
			case 'n':
				out = append(out, '\n')
			case 't':
				out = append(out, '\t')
			case 'f':
				out = append(out, '\f')
			case 'r':
				out = append(out, '\r')
			case 'v':
				out = append(out, '\v')
			default:
				// Unknown escape: keep the escaped rune itself. (A trailing
				// backslash decodes an empty tail to utf8.RuneError, matching
				// the original behavior.)
				out = append(out, string(c)...)
			}
		} else {
			out = append(out, string(c)...)
		}
	}
	return string(out)
}
// ScanWords is a split function for a Scanner that returns each // space-separated word of text, with surrounding spaces deleted. It will // never return an empty string. The definition of space is set by // unicode.IsSpace. func ScanWords(data []byte, atEOF bool) (advance int, token []byte, err error) { // Skip leading spaces. start := 0 for width := 0; start < len(data); start += width { var r rune r, width = utf8.DecodeRune(data[start:]) if !isSpace(r) { break } } // Scan until space, marking end of word. for width, i := 0, start; i < len(data); i += width { var r rune r, width = utf8.DecodeRune(data[i:]) if isSpace(r) { return i + width, data[start:i], nil } } // If we're at EOF, we have a final, non-empty, non-terminated word. Return it. if atEOF && len(data) > start { return len(data), data[start:], nil } // Request more data. return start, nil, nil }
func (self *WordDict) AddReplaceRule(rule []byte) { if utf8.RuneCount(rule) != 2 { self.Panic("rule format differs from '=xX'") } from, fromSize := utf8.DecodeRune(rule) to, _ := utf8.DecodeRune(rule[fromSize:]) self.runeMapping[from] = to }
func init() { // setup the required runes colon, _ = utf8.DecodeRune([]byte(":")) dash, _ = utf8.DecodeRune([]byte("-")) period, _ = utf8.DecodeRune([]byte(".")) slash, _ = utf8.DecodeRune([]byte("/")) underscore, _ = utf8.DecodeRune([]byte("_")) }
func (lr *lexlReader) ScanRune(read bool) (rune, error) { fmt.Println("SCAN RUNE") if lr.size < 4 { fmt.Println(" < 4 fill") err := lr.attemptFill() if err != nil { fmt.Println("SCAN ERR") return 0, err } fmt.Println(" < 4 fill done") } fmt.Printf("lr.size: %d\n", lr.size) if lr.size == 0 { return 0, io.EOF } if len(lr.buf)-lr.pos < 4 { fmt.Println("END BUFFERING") nbuf := make([]byte, 4) nlen := 4 if nlen > lr.size { nlen = lr.size } npos := lr.pos for i := 0; i < nlen; i++ { nbuf[i] = lr.buf[npos] npos++ if npos >= len(lr.buf) { npos -= len(lr.buf) } } r, ns := utf8.DecodeRune(nbuf) if r == utf8.RuneError { return 0, errors.New("stream does not decode a utf-8 character") } if read { lr.pos += ns lr.size -= ns if lr.pos >= len(lr.buf) { lr.pos -= len(lr.buf) } } return r, nil } fmt.Println("DECODING FROM BUFFER") r, ns := utf8.DecodeRune(lr.buf[lr.pos:]) if r == utf8.RuneError { return 0, errors.New("stream does not decode a utf-8 character") } if read { lr.pos += ns lr.size -= ns if lr.pos >= len(lr.buf) { lr.pos -= len(lr.buf) } } return r, nil }
// CompareChars prints, for each pair of adjacent runes in word, whether
// the two runes are equal ("true," or "false,"), followed by a newline.
func CompareChars(word string) {
	s := []byte(word)
	// Decode each rune once and compare it with its successor: a single
	// O(n) pass. The previous version called utf8.RuneCount on the
	// remaining bytes every iteration, which was O(n²) overall.
	r, size := utf8.DecodeRune(s)
	for size > 0 && len(s) > size {
		next, nextSize := utf8.DecodeRune(s[size:])
		if nextSize == 0 {
			break
		}
		fmt.Print(r == next, ",")
		s = s[size:]
		r, size = next, nextSize
	}
	fmt.Println()
}
func NewInput(in io.Reader) *Input { input := &Input{ "", bufio.NewScanner(in), } split := func(data []byte, atEOF bool) (advance int, token []byte, err error) { //fmt.Println("Input:", string(data)) // Skip leading spaces. start := 0 for width := 0; start < len(data); start += width { var r rune r, width = utf8.DecodeRune(data[start:]) if !IsWhitespace(r) { break } } if atEOF && len(data[start:]) == 0 { //fmt.Println("need more data 1") return 0, nil, nil } //fmt.Println("After WS Skip:", string(data[start:])) var r rune var width int r, width = utf8.DecodeRune(data[start:]) if r == '(' || r == ')' { //fmt.Println("returning token:", string(data[start:start+width])) return start + width, data[start : start+width], nil } //fmt.Println("After paren check:", string(data[start:])) // Scan until space, marking end of word. for width, i := 0, start; i < len(data); i += width { r, width = utf8.DecodeRune(data[i:]) //fmt.Printf("rune %d: %s\n", i, string(r)) if IsWhitespace(r) || r == '(' || r == ')' { //fmt.Println("returning token:", string(data[start:i])) return i, data[start:i], nil } } // If we're at EOF, we have a final, non-empty, non-terminated word. Return it. if atEOF && len(data) > start { //fmt.Println("returning token:", string(data[start:])) return len(data) - start, data[start:], nil } // Request more data. //fmt.Println("need more data 2") return 0, nil, nil } input.Split(split) return input }
func (src *Src) Consume(match ConsumeFunc) string { buf := src.Bytes() var m int for r, n := utf8.DecodeRune(buf); r != utf8.RuneError; r, n = utf8.DecodeRune(buf) { if !match(r) { break } buf = buf[n:] m += n } return src.SkipString(m) }
// EqualFold reports whether s and t, interpreted as UTF-8 strings,
// are equal under Unicode case-folding.
func EqualFold(s, t []byte) bool {
	for len(s) != 0 && len(t) != 0 {
		// Decode the leading rune of each slice, with a fast path for ASCII.
		var sr, tr rune
		if s[0] < utf8.RuneSelf {
			sr, s = rune(s[0]), s[1:]
		} else {
			r, n := utf8.DecodeRune(s)
			sr, s = r, s[n:]
		}
		if t[0] < utf8.RuneSelf {
			tr, t = rune(t[0]), t[1:]
		} else {
			r, n := utf8.DecodeRune(t)
			tr, t = r, t[n:]
		}

		// Identical runes trivially fold together.
		if sr == tr {
			continue
		}

		// Order the pair so sr <= tr; the logic below assumes it.
		if sr > tr {
			sr, tr = tr, sr
		}

		if tr < utf8.RuneSelf {
			// Both runes are ASCII: the only folding pairs are A-Z / a-z.
			if 'A' <= sr && sr <= 'Z' && tr == sr+'a'-'A' {
				continue
			}
			return false
		}

		// General case: walk sr's fold orbit looking for tr. SimpleFold
		// returns the next equivalent rune > x or wraps to smaller values.
		r := unicode.SimpleFold(sr)
		for r != sr && r < tr {
			r = unicode.SimpleFold(r)
		}
		if r != tr {
			return false
		}
	}

	// Equal only if both inputs were fully consumed.
	return len(s) == len(t)
}
// Creates a scanner that splits on words or quoted strings
func NewQuotedScanner(r io.Reader) *bufio.Scanner {
	scanner := bufio.NewScanner(r)
	split := func(data []byte, atEOF bool) (advance int, token []byte, err error) {
		// Skip leading spaces.
		start := 0
		for width := 0; start < len(data); start += width {
			var r rune
			r, width = utf8.DecodeRune(data[start:])
			if !unicode.IsSpace(r) {
				break
			}
		}
		// Does word start with a quote?
		// NOTE(review): if data is all spaces, start == len(data) and this
		// decodes an empty slice to (RuneError, 0); presumably IsQuote
		// rejects RuneError — confirm.
		quote, width := utf8.DecodeRune(data[start:])
		i := start
		if IsQuote(quote) {
			log.Debugf("Quote detected '%c'", quote)
			// Step past the opening quote so it is not mistaken for the
			// closing one; it stays in the returned token, which is sliced
			// from start.
			i = i + width
		} else {
			quote = 0
		}
		// Scan until space, marking end of word.
		for width := 0; i < len(data); i += width {
			var r rune
			r, width = utf8.DecodeRune(data[i:])
			if quote == 0 {
				if unicode.IsSpace(r) {
					return i + width, data[start:i], nil
				}
			} else {
				// Look for ending quote
				// BUG: need to implement escape handling
				if r == quote {
					log.Debugf("Found end quote %d chars after start", i)
					// After the closing quote, scanning continues in
					// unquoted mode until whitespace; both quotes remain in
					// the token.
					quote = 0
				}
			}
		}
		// If we're at EOF, we have a final, non-empty, non-terminated word. Return it.
		if atEOF && len(data) > start {
			return len(data), data[start:], nil
		}
		// Request more data.
		return start, nil, nil
	}
	scanner.Split(split)
	return scanner
}
func (f *commaSeparated) Open(r io.Reader) error { f.reader = r f.csvReader = csv.NewReader(r) if f.FieldDelim != "" { f.csvReader.Comma, _ = utf8.DecodeRune([]byte(f.FieldDelim)) } if f.Comment != "" { f.csvReader.Comment, _ = utf8.DecodeRune([]byte(f.Comment)) } f.csvReader.FieldsPerRecord = f.NumFields return nil }
// findUnescaped returns the byte index of the first occurrence of toFind
// at or after from in data, skipping occurrences immediately preceded by
// the escape rune. Returns -1 when no unescaped occurrence exists.
func findUnescaped(toFind, escape rune, data []byte, from int) int {
	pos := from
	for pos < len(data) {
		r, n := utf8.DecodeRune(data[pos:])
		pos += n
		switch r {
		case escape:
			// Consume the escaped rune that follows so it is never matched.
			_, n = utf8.DecodeRune(data[pos:])
			pos += n
		case toFind:
			// pos already passed the match; back up to its start.
			return pos - n
		}
	}
	return -1
}
// splitFunc is a wrapper around bufio.SplitFunc that calculates line and
// column information for tokens.
func (t *Tokenizer) splitFunc() bufio.SplitFunc {
	// line/column track the position just past the input consumed so far;
	// they persist across calls because the returned closure captures them.
	line, column := 1, 1
	// lastLine/lastColumn remember where the current token started;
	// -1 means "not yet recorded for the token in progress".
	lastLine, lastColumn := -1, -1
	return func(data []byte, atEOF bool) (advance int, token []byte, err error) {
		// Skip leading non-characters.
		start := 0
		for width := 0; start < len(data); start += width {
			var r rune
			r, width = utf8.DecodeRune(data[start:])
			if t.tokenizerFunc(r) {
				break
			}
			// Keep the position current while skipping separator runes.
			if r == '\n' {
				line += 1
				column = 1
			} else {
				column += 1
			}
		}
		if lastLine == -1 {
			// Record the start position of the token about to be scanned.
			lastLine, lastColumn = line, column
		}
		// Scan until first non-character, marking end of token.
		for width, i := 0, start; i < len(data); i += width {
			var r rune
			r, width = utf8.DecodeRune(data[i:])
			if r == '\n' {
				line += 1
				column = 1
			} else {
				column += 1
			}
			if !t.tokenizerFunc(r) {
				// Publish the token's start position and reset for the next token.
				t.line, t.column = lastLine, lastColumn
				lastLine, lastColumn = -1, -1
				return i + width, data[start:i], nil
			}
		}
		// If we're at EOF, we have a final, non-empty, non-terminated token. Return it.
		if atEOF && len(data) > start {
			t.line, t.column = lastLine, lastColumn
			lastLine, lastColumn = -1, -1
			return len(data), data[start:], nil
		}
		// Request more data (consuming the already-counted separators).
		return start, nil, nil
	}
}
// Test that the rune splitter returns same sequence of runes (not bytes) as for range string.
func TestScanRune(t *testing.T) {
	for n, test := range scanTests {
		buf := bytes.NewBufferString(test)
		s := NewScanner(buf)
		s.Split(ScanRunes)
		var i, runeCount int
		var expect rune
		// Use a string range loop to validate the sequence of runes.
		for i, expect = range string(test) {
			if !s.Scan() {
				break
			}
			runeCount++
			got, _ := utf8.DecodeRune(s.Bytes())
			if got != expect {
				t.Errorf("#%d: %d: expected %q got %q", n, i, expect, got)
			}
		}
		// The scanner must be exhausted exactly when the range loop is.
		if s.Scan() {
			t.Errorf("#%d: scan ran too long, got %q", n, s.Text())
		}
		// Every rune (including U+FFFD replacements for bad encodings)
		// must have been produced.
		testRuneCount := utf8.RuneCountInString(test)
		if runeCount != testRuneCount {
			t.Errorf("#%d: termination expected at %d; got %d", n, testRuneCount, runeCount)
		}
		err := s.Err()
		if err != nil {
			t.Errorf("#%d: %v", n, err)
		}
	}
}
// ScanRunes is a split function for a Scanner that returns each // UTF-8-encoded rune as a token. The sequence of runes returned is // equivalent to that from a range loop over the input as a string, which // means that erroneous UTF-8 encodings translate to U+FFFD = "\xef\xbf\xbd". // Because of the Scan interface, this makes it impossible for the client to // distinguish correctly encoded replacement runes from encoding errors. func ScanRunes(data []byte, atEOF bool) (advance int, token []byte, err error) { if atEOF && len(data) == 0 { return 0, nil, nil } // Fast path 1: ASCII. if data[0] < utf8.RuneSelf { return 1, data[0:1], nil } // Fast path 2: Correct UTF-8 decode without error. _, width := utf8.DecodeRune(data) if width > 1 { // It's a valid encoding. Width cannot be one for a correctly encoded // non-ASCII rune. return width, data[0:width], nil } // We know it's an error: we have width==1 and implicitly r==utf8.RuneError. // Is the error because there wasn't a full rune to be decoded? // FullRune distinguishes correctly between erroneous and incomplete encodings. if !atEOF && !utf8.FullRune(data) { // Incomplete; get more bytes. return 0, nil, nil } // We have a real UTF-8 encoding error. Return a properly encoded error rune // but advance only one byte. This matches the behavior of a range loop over // an incorrectly encoded string. return 1, errorRune, nil }
// utf7enc converts string s from UTF-8 to UTF-16-BE, encodes the result as
// Base64, removes the padding, and adds UTF-7 shifts.
func utf7enc(s []byte) []byte {
	// len(s) is sufficient for UTF-8 to UTF-16 conversion if there are no
	// control code points (see table below).
	b := make([]byte, 0, len(s)+4)
	for len(s) > 0 {
		r, size := utf8.DecodeRune(s)
		if r > utf8.MaxRune {
			r, size = utf8.RuneError, 1 // Bug fix (issue 3785)
		}
		s = s[size:]
		// Runes beyond the BMP become a surrogate pair; EncodeRune yields
		// uRepl for r1 when r fits in a single UTF-16 code unit, in which
		// case only the low half below is appended.
		if r1, r2 := utf16.EncodeRune(r); r1 != uRepl {
			b = append(b, byte(r1>>8), byte(r1))
			r = r2
		}
		// Append r (or the low surrogate) in big-endian order.
		b = append(b, byte(r>>8), byte(r))
	}
	// Encode as Base64, leaving room for the leading '&' and trailing '-' shifts.
	n := u7enc.EncodedLen(len(b)) + 2
	b64 := make([]byte, n)
	u7enc.Encode(b64[1:], b)
	// Strip padding
	n -= 2 - (len(b)+2)%3
	b64 = b64[:n]
	// Add UTF-7 shifts
	b64[0] = '&'
	b64[n-1] = '-'
	return b64
}
// toLowerDeferredCopy will function exactly like
// bytes.ToLower() only it will reuse (overwrite)
// the original byte array when possible
// NOTE: because its possible that the lower-case
// form of a rune has a different utf-8 encoded
// length, in these cases a new byte array is allocated
func toLowerDeferredCopy(s []byte) []byte {
	out := 0 // next write offset within s
	for in := 0; in < len(s); {
		width := 1
		r := rune(s[in])
		if r >= utf8.RuneSelf {
			r, width = utf8.DecodeRune(s[in:])
		}
		lower := unicode.ToLower(r)
		lowerWidth := utf8.RuneLen(lower)
		if lowerWidth > width {
			// The lowered rune encodes wider than the original, so the
			// in-place rewrite cannot continue. Lower the remainder with
			// bytes.ToLower and stitch the two halves into a fresh slice.
			// Only known to happen for:
			//   Rune Ⱥ(570) width 2 - Lower ⱥ(11365) width 3
			//   Rune Ⱦ(574) width 2 - Lower ⱦ(11366) width 3
			tail := bytes.ToLower(s[in:])
			merged := make([]byte, out+len(tail))
			copy(merged[:out], s[:out])
			copy(merged[out:], tail)
			return merged
		}
		// lowerWidth <= width, so the write position never outruns the
		// read position and overwriting s in place is safe.
		utf8.EncodeRune(s[out:], lower)
		in += width
		out += lowerWidth
	}
	return s[:out]
}
// replace scans l.content rune by rune, feeding runes to the prefix
// matcher and splicing in replacements when a full prefix match completes.
func (l *contentlexer) replace() {
	contentLength := len(l.content)
	var r rune
	for {
		if l.pos >= contentLength {
			// Input exhausted.
			l.width = 0
			break
		}
		var width int = 1
		r = rune(l.content[l.pos])
		if r >= utf8.RuneSelf {
			// Non-ASCII byte: decode the full UTF-8 rune.
			r, width = utf8.DecodeRune(l.content[l.pos:])
		}
		l.width = width
		l.pos += l.width
		if r == ' ' {
			// NOTE(review): only the ASCII space resets the matcher here —
			// not unicode.IsSpace; presumably intentional, verify.
			l.prefixLookup.ms = matchStateWhitespace
		} else if l.prefixLookup.ms != matchStateNone {
			l.match(r)
			if l.prefixLookup.ms == matchStateFull {
				checkCandidate(l)
			}
		}
	}
	// Done!
	if l.pos > l.start {
		// Flush any trailing content that produced no match.
		l.emit()
	}
}
// checkCandidate runs when the prefix matcher reports a full match ending
// at l.pos; it tries each configured matcher and, on a hit, emits the
// pending content and writes the replacement in place of the match.
func checkCandidate(l *contentlexer) {
	// First letter 's' distinguishes the "src"-style matchers from the rest.
	isSource := l.prefixLookup.first == 's'
	for _, m := range l.matchers {
		// Only consider matchers whose kind agrees with the matched prefix.
		if isSource && !m.isSourceType() || !isSource && m.isSourceType() {
			continue
		}
		if bytes.HasPrefix(l.content[l.pos:], m.match) {
			// check for schemaless URLs
			posAfter := l.pos + len(m.match)
			if posAfter >= len(l.content) {
				// Match runs to the very end of the content; nothing follows.
				return
			}
			r, _ := utf8.DecodeRune(l.content[posAfter:])
			if r == '/' {
				// schemaless: skip
				return
			}
			// Flush everything before the match, then substitute the
			// replacement and resume after the matched bytes.
			if l.pos > l.start {
				l.emit()
			}
			l.pos += len(m.match)
			l.w.Write(m.replacement)
			l.start = l.pos
			return
		}
	}
}
// read reads blocks of 4096 bytes from the File, sending lines to the
// channel as it encounters newlines. If EOF is encountered, the partial line
// is returned to be concatenated with on the next call.
func (t *Tailer) read(f afero.File, partialIn string) (partialOut string, err error) {
	partial := partialIn
	b := make([]byte, 0, 4096)
	for {
		n, err := f.Read(b[:cap(b)])
		b = b[:n]
		if err != nil {
			// Includes io.EOF: hand back whatever partial line accumulated.
			return partial, err
		}
		// NOTE(review): a multi-byte rune split across two Read calls will
		// decode as U+FFFD here, because decoding restarts at each block
		// boundary — confirm whether input is ASCII-only or carry-over is
		// needed.
		// (The `i < n` clause is redundant with `i < len(b)` after the
		// re-slice above.)
		for i, width := 0, 0; i < len(b) && i < n; i += width {
			// Local named "rune" shadows the builtin type within this loop.
			var rune rune
			rune, width = utf8.DecodeRune(b[i:])
			switch {
			case rune != '\n':
				partial += string(rune)
			default:
				// send off line for processing
				t.lines <- partial
				// reset accumulator
				partial = ""
			}
		}
	}
}
func (t *tokenizer) nextChar() bool { if t.readOffset < len(t.src) { t.offset = t.readOffset ch := t.src[t.readOffset] r, w := rune(ch), 1 switch { case r == 0: t.error("illegal character NUL") case r >= 0x80: // not ASCII r, w = utf8.DecodeRune(t.src[t.offset:]) if r == utf8.RuneError && w == 1 { t.error("illegal UTF-8 encoding") } else if r == bom && t.offset > 0 { t.error("illegal byte order mark") } } if ch == '\n' { t.lineno++ } t.r = r t.readOffset += w return true } t.r = eof t.offset = len(t.src) return false }
// Transform rewrites src into dst, mapping every rune through the
// transformer function t. Implements the
// golang.org/x/text/transform.Transformer contract: nDst/nSrc report the
// bytes written/consumed, and ErrShortSrc/ErrShortDst request another call
// with more input or output space.
func (t replaceTransformer) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
	// Scratch space for encoding one mapped rune before committing it.
	var runeBytes [utf8.UTFMax]byte
	for r, sz := rune(0), 0; len(src) > 0; src = src[sz:] {
		if r = rune(src[0]); r < utf8.RuneSelf {
			// ASCII fast path.
			sz = 1
		} else {
			r, sz = utf8.DecodeRune(src)
			if sz == 1 {
				// Invalid rune. A rune truncated at the end of src (and not
				// at EOF) asks the caller for more input; otherwise the
				// U+FFFD replacement falls through and is transformed.
				if !atEOF && !utf8.FullRune(src) {
					err = transform.ErrShortSrc
					break
				}
			}
		}
		// Encode into scratch first: the mapped rune may not fit in the
		// remaining destination space.
		dsz := utf8.EncodeRune(runeBytes[:], t(r))
		if nDst+dsz > len(dst) {
			err = transform.ErrShortDst
			break
		}
		nDst += copy(dst[nDst:], runeBytes[:dsz])
		nSrc += sz
	}
	return
}
// Map returns a copy of the byte array s with all its characters modified
// according to the mapping function. If mapping returns a negative value, the character is
// dropped from the string with no replacement. The characters in s and the
// output are interpreted as UTF-8-encoded Unicode code points.
func Map(mapping func(r rune) rune, s []byte) []byte {
	// Assume the output is no longer than the input and grow lazily in the
	// rare case a mapped rune needs a wider encoding.
	capacity := len(s)
	out := make([]byte, capacity)
	n := 0 // number of bytes written to out
	for i := 0; i < len(s); {
		width := 1
		r := rune(s[i])
		if r >= utf8.RuneSelf {
			r, width = utf8.DecodeRune(s[i:])
		}
		r = mapping(r)
		if r >= 0 {
			if n+utf8.RuneLen(r) > capacity {
				// Grow the buffer.
				capacity = capacity*2 + utf8.UTFMax
				grown := make([]byte, capacity)
				copy(grown, out[:n])
				out = grown
			}
			n += utf8.EncodeRune(out[n:], r)
		}
		i += width
	}
	return out[:n]
}