Example #1
0
// FieldsFunc interprets s as a sequence of UTF-8-encoded Unicode code points.
// It splits the array s at each run of code points c satisfying f(c) and
// returns a slice of subarrays of s. If no code points in s satisfy f(c), an
// empty slice is returned.
func FieldsFunc(s []byte, f func(rune) bool) [][]byte {
	// First pass: count the fields so the result can be allocated exactly.
	count := 0
	inField := false
	for i := 0; i < len(s); {
		r, size := utf8.DecodeRune(s[i:])
		i += size
		if f(r) {
			inField = false
			continue
		}
		if !inField {
			inField = true
			count++
		}
	}

	// Second pass: record each field as a subslice of s. Decoding the
	// empty tail yields size == 0, which closes a trailing field.
	fields := make([][]byte, count)
	nf := 0
	fieldStart := -1 // -1 means "not currently inside a field"
	for i := 0; i <= len(s) && nf < count; {
		r, size := utf8.DecodeRune(s[i:])
		if fieldStart < 0 && size > 0 && !f(r) {
			fieldStart = i
			i += size
			continue
		}
		if fieldStart >= 0 && (size == 0 || f(r)) {
			fields[nf] = s[fieldStart:i]
			nf++
			fieldStart = -1
		}
		if size == 0 {
			break
		}
		i += size
	}
	return fields[:nf]
}
Example #2
0
// Add inserts name into the trie, rune by rune, and stores data on the
// final node. Existing nodes along the path are reused; missing nodes are
// created with their cumulative name prefix.
func (t *Trie) Add(name string, data []int) {
	nm := []byte(name)
	if t.head == nil {
		t.init()
	}
	cur := t.head

	// Descend along children that already exist.
	i := 0
	for i < len(nm) {
		r, size := utf8.DecodeRune(nm[i:])
		next, ok := cur.children[r]
		if !ok {
			break
		}
		cur = next
		i += size
	}

	// Create nodes for the remaining runes.
	// BUG FIX: the original loop only advanced i inside a guard
	// (`!ok && cur.name != name`); when that guard was false the loop
	// made no progress and spun forever. Each freshly created node has
	// no children, so the child can simply be created unconditionally.
	for i < len(nm) {
		r, size := utf8.DecodeRune(nm[i:])
		child := &node{name[:i+size], nil, make(map[rune]*node)}
		cur.children[r] = child
		cur = child
		i += size
	}

	cur.val = data
}
Example #3
0
// scanStmts is a bufio.SplitFunc that splits input into semicolon-
// terminated statements. Leading and trailing whitespace is trimmed from
// each token; the semicolon itself is consumed but not returned. At EOF a
// final unterminated statement is returned (trimmed).
func scanStmts(data []byte, atEOF bool) (advance int, token []byte, err error) {
	// Skip leading spaces.
	start := 0
	for width := 0; start < len(data); start += width {
		var r rune
		r, width = utf8.DecodeRune(data[start:])
		if !unicode.IsSpace(r) {
			break
		}
	}
	if atEOF && len(data) == 0 {
		return 0, nil, nil
	}

	// end tracks one past the last non-space byte seen, so trailing
	// whitespace is excluded from the token.
	end := start
	// Scan until semicolon, marking end of statement.
	for width, i := 0, start; i < len(data); i += width {
		var r rune
		r, width = utf8.DecodeRune(data[i:])
		if r == ';' {
			// BUG FIX: trim trailing whitespace here too (was data[start:i]),
			// matching the EOF path below.
			return i + width, data[start:end], nil
		} else if !unicode.IsSpace(r) {
			// BUG FIX: advance end by the rune's full width (was i+1,
			// which truncated a trailing multi-byte rune).
			end = i + width
		}
	}
	// If we're at EOF, we have a final, non-empty, non-terminated statement. Return it.
	if atEOF && len(data) > start {
		return len(data), data[start:end], nil
	}
	// Request more data.
	return 0, nil, nil
}
Example #4
0
// scanStrings is a bufio.SplitFunc that yields space-separated words,
// except that double quotes suppress splitting: a token may contain
// spaces between a pair of '"' runes. Quotes are kept in the token.
func scanStrings(data []byte, atEOF bool) (advance int, token []byte, err error) {
	// Skip over leading whitespace.
	start := 0
	for start < len(data) {
		r, width := utf8.DecodeRune(data[start:])
		if !unicode.IsSpace(r) {
			break
		}
		start += width
	}

	if atEOF && len(data) == 0 {
		return 0, nil, nil
	}

	// Walk runes until an unquoted space terminates the token.
	quoted := false
	for i, width := start, 0; i < len(data); i += width {
		var r rune
		r, width = utf8.DecodeRune(data[i:])
		switch {
		case r == '"':
			quoted = !quoted
		case unicode.IsSpace(r) && !quoted:
			return i + width, data[start:i], nil
		}
	}
	// At EOF with a final, non-empty, non-terminated word: return it.
	if atEOF && len(data) > start {
		return len(data), data[start:], nil
	}
	// Request more data.
	return 0, nil, nil
}
Example #5
0
// scanTokensKeepPrefix is a split function for a Scanner that returns each
// space-separated word of text with its prefixing spaces included, and
// defers to scanLinesKeepPrefix for '#' comment lines. The definition of
// space is set by unicode.IsSpace.
//
// Adapted from bufio.ScanWords().
func scanTokensKeepPrefix(data []byte, atEOF bool) (advance int, token []byte, err error) {
	// Locate the first non-space rune; the spaces stay in the token.
	start := 0
	for start < len(data) {
		r, width := utf8.DecodeRune(data[start:])
		if !unicode.IsSpace(r) {
			break
		}
		start += width
	}
	// All spaces (or empty at EOF): hand back everything as one token.
	if atEOF && len(data) == 0 || start == len(data) {
		return len(data), data, nil
	}
	// Comment lines are delegated to the line-based splitter.
	if len(data) > start && data[start] == '#' {
		return scanLinesKeepPrefix(data, atEOF)
	}
	// The token runs (prefix included) up to the next space rune.
	for i, width := start, 0; i < len(data); i += width {
		var r rune
		r, width = utf8.DecodeRune(data[i:])
		if unicode.IsSpace(r) {
			return i, data[:i], nil
		}
	}
	// At EOF with a final, non-empty, non-terminated word: return it all.
	if atEOF && len(data) > start {
		return len(data), data, nil
	}
	// Request more data.
	return 0, nil, nil
}
Example #6
0
// scanWords is a split function for a Scanner that returns each
// space-separated word of text, with surrounding spaces deleted. A word
// that begins with '"' extends to the closing '"' (spaces included) and
// is returned without the quotes. It will never return an empty string.
// The definition of space is set by unicode.IsSpace.
func scanWords(data []byte, atEOF bool) (advance int, token []byte, err error) {
	// Skip leading spaces.
	start := 0
	for width := 0; start < len(data); start += width {
		var r rune
		r, width = utf8.DecodeRune(data[start:])
		if !unicode.IsSpace(r) {
			break
		}
	}
	quote := false
	// Scan until space, marking end of word.
	for width, i := 0, start; i < len(data); i += width {
		var r rune
		r, width = utf8.DecodeRune(data[i:])
		switch {
		// BUG FIX: the opening quote sits at i == start (the first byte of
		// the word), not i == 0 — the old test never fired when leading
		// spaces had been skipped.
		case i == start && r == '"':
			quote = true
		case !quote && unicode.IsSpace(r):
			return i + width, data[start:i], nil
		case quote && r == '"':
			// Strip the surrounding quotes from the token.
			return i + width, data[start+width : i], nil
		}
	}
	// If we're at EOF, we have a final, non-empty, non-terminated word. Return it.
	if atEOF && len(data) > start {
		return len(data), data[start:], nil
	}
	// Request more data.
	return start, nil, nil
}
Example #7
0
// iter_words calls cb once for every maximal run of word runes (as
// classified by is_word) in data, passing the run as a subslice.
func iter_words(data []byte, cb func(word []byte)) {
	for len(data) > 0 {
		// Discard leading non-word runes.
		r, rlen := utf8.DecodeRune(data)
		for !is_word(r) {
			data = data[rlen:]
			if len(data) == 0 {
				return
			}
			r, rlen = utf8.DecodeRune(data)
		}

		// Accumulate the run of word runes, then report it.
		end := 0
		for is_word(r) && end < len(data) {
			end += rlen
			r, rlen = utf8.DecodeRune(data[end:])
		}
		cb(data[:end])
		data = data[end:]
	}
}
Example #8
0
File: in.go Project: pschlump/lexie
// Usage: pat = EscapeNormalString(pat)
//
// EscapeNormalString expands backslash escape sequences in a string:
// \n, \t, \f, \r and \v become their control-character equivalents, and
// any other escaped character is kept with the backslash dropped. A lone
// trailing backslash is kept literally.
func EscapeNormalString(in string) (rv string) {
	for i := 0; i < len(in); {
		// DecodeRuneInString avoids the per-iteration []byte conversion
		// (and allocation) the old []byte(in[i:]) form paid.
		c, sz := utf8.DecodeRuneInString(in[i:])
		i += sz
		if c != '\\' {
			rv += string(c)
			continue
		}
		if i >= len(in) {
			// BUG FIX: a trailing backslash used to decode the empty tail
			// and emit U+FFFD; keep the backslash itself instead.
			rv += "\\"
			break
		}
		c, sz = utf8.DecodeRuneInString(in[i:])
		i += sz
		switch c {
		case 'n':
			rv += "\n"
		case 't':
			rv += "\t"
		case 'f':
			rv += "\f"
		case 'r':
			rv += "\r"
		case 'v':
			rv += "\v"
		default:
			// Unknown escape: emit the character, dropping the backslash.
			rv += string(c)
		}
	}
	return
}
Example #9
0
File: scan.go Project: Greentor/go
// ScanWords is a split function for a Scanner that returns each
// space-separated word of text, with surrounding spaces deleted. It will
// never return an empty string. The definition of space is set by
// isSpace.
func ScanWords(data []byte, atEOF bool) (advance int, token []byte, err error) {
	// Consume leading whitespace.
	start := 0
	for start < len(data) {
		r, width := utf8.DecodeRune(data[start:])
		if !isSpace(r) {
			break
		}
		start += width
	}
	// The word runs until the next space rune.
	for i, width := start, 0; i < len(data); i += width {
		var r rune
		r, width = utf8.DecodeRune(data[i:])
		if isSpace(r) {
			return i + width, data[start:i], nil
		}
	}
	// At EOF with a non-empty remainder, that remainder is the final word.
	if atEOF && len(data) > start {
		return len(data), data[start:], nil
	}
	// Not a complete word yet; consume the spaces seen so far.
	return start, nil, nil
}
// AddReplaceRule registers a one-rune-to-one-rune replacement. The rule
// must consist of exactly two runes: the rune to replace followed by its
// replacement; anything else panics via self.Panic.
func (self *WordDict) AddReplaceRule(rule []byte) {
	if utf8.RuneCount(rule) != 2 {
		self.Panic("rule format differs from '=xX'")
	}
	src, n := utf8.DecodeRune(rule)
	dst, _ := utf8.DecodeRune(rule[n:])
	self.runeMapping[src] = dst
}
Example #11
0
func init() {
	// setup the required runes
	colon, _ = utf8.DecodeRune([]byte(":"))
	dash, _ = utf8.DecodeRune([]byte("-"))
	period, _ = utf8.DecodeRune([]byte("."))
	slash, _ = utf8.DecodeRune([]byte("/"))
	underscore, _ = utf8.DecodeRune([]byte("_"))
}
Example #12
0
// ScanRune decodes the next UTF-8 rune from the circular buffer. When
// read is true the rune is consumed (pos/size advance); otherwise this is
// a peek. Returns io.EOF when the buffer is drained and cannot be
// refilled, and an error for bytes that do not decode as UTF-8.
func (lr *lexlReader) ScanRune(read bool) (rune, error) {
	// Ensure up to utf8.UTFMax (4) bytes are buffered before decoding.
	// (Leftover debug fmt.Println logging removed.)
	if lr.size < 4 {
		if err := lr.attemptFill(); err != nil {
			return 0, err
		}
	}
	if lr.size == 0 {
		return 0, io.EOF
	}

	var buf []byte
	if len(lr.buf)-lr.pos < 4 {
		// The next rune may wrap around the end of the circular buffer;
		// copy up to 4 contiguous bytes into a scratch slice first.
		scratch := make([]byte, 4)
		n := 4
		if n > lr.size {
			n = lr.size
		}
		pos := lr.pos
		for i := 0; i < n; i++ {
			scratch[i] = lr.buf[pos]
			pos++
			if pos >= len(lr.buf) {
				pos -= len(lr.buf)
			}
		}
		buf = scratch
	} else {
		buf = lr.buf[lr.pos:]
	}

	r, n := utf8.DecodeRune(buf)
	if r == utf8.RuneError {
		return 0, errors.New("stream does not decode a utf-8 character")
	}
	if read {
		// Consume the rune: advance the circular read position.
		lr.pos += n
		lr.size -= n
		if lr.pos >= len(lr.buf) {
			lr.pos -= len(lr.buf)
		}
	}
	return r, nil
}
Example #13
0
// CompareChars walks word one rune at a time and, for each adjacent pair
// of runes, prints whether the two are equal ("true," / "false,"),
// finishing with a newline.
func CompareChars(word string) {
	rest := []byte(word)
	for utf8.RuneCount(rest) > 1 {
		cur, n := utf8.DecodeRune(rest)
		rest = rest[n:]
		next, _ := utf8.DecodeRune(rest)
		fmt.Print(cur == next, ",")
	}
	fmt.Println()
}
Example #14
0
// NewInput wraps in in an Input whose scanner splits the stream into
// s-expression tokens: '(' and ')' are single-rune tokens, and any other
// run of runes up to whitespace or a paren is a token. Leading
// whitespace (per IsWhitespace) is skipped and never returned.
func NewInput(in io.Reader) *Input {
	input := &Input{
		"",
		bufio.NewScanner(in),
	}

	split := func(data []byte, atEOF bool) (advance int, token []byte, err error) {
		// Skip leading whitespace.
		start := 0
		for width := 0; start < len(data); start += width {
			var r rune
			r, width = utf8.DecodeRune(data[start:])
			if !IsWhitespace(r) {
				break
			}
		}
		if atEOF && len(data[start:]) == 0 {
			// Nothing but whitespace remains.
			return 0, nil, nil
		}

		// Parentheses are complete tokens on their own.
		r, width := utf8.DecodeRune(data[start:])
		if r == '(' || r == ')' {
			return start + width, data[start : start+width], nil
		}

		// Scan until whitespace or a paren, marking end of the token.
		for i := start; i < len(data); i += width {
			r, width = utf8.DecodeRune(data[i:])
			if IsWhitespace(r) || r == '(' || r == ')' {
				return i, data[start:i], nil
			}
		}
		// At EOF with a final, non-empty, non-terminated token: return it.
		// BUG FIX: advance counts from the beginning of data (skipped
		// whitespace included); the old `len(data) - start` under-advanced
		// and made the Scanner reprocess the leading spaces.
		if atEOF && len(data) > start {
			return len(data), data[start:], nil
		}
		// Request more data.
		return 0, nil, nil
	}
	input.Split(split)

	return input
}
Example #15
0
// Consume advances over the longest prefix of runes satisfying match and
// returns it via SkipString. Scanning stops at the first rune that fails
// match or at the first decode yielding utf8.RuneError (including the
// empty tail, which decodes as RuneError with size 0).
func (src *Src) Consume(match ConsumeFunc) string {
	buf := src.Bytes()
	skipped := 0
	for {
		r, n := utf8.DecodeRune(buf)
		if r == utf8.RuneError || !match(r) {
			break
		}
		buf = buf[n:]
		skipped += n
	}
	return src.SkipString(skipped)
}
Example #16
0
// EqualFold reports whether s and t, interpreted as UTF-8 strings,
// are equal under Unicode case-folding.
func EqualFold(s, t []byte) bool {
	for len(s) != 0 && len(t) != 0 {
		// Pull one rune off the front of each slice.
		var sr, tr rune
		sr, s = firstRune(s)
		tr, t = firstRune(t)

		// Identical runes always match.
		if sr == tr {
			continue
		}

		// Order the pair so sr <= tr, simplifying the checks below.
		if tr < sr {
			sr, tr = tr, sr
		}

		// ASCII fast path: sr is an upper-case letter, so tr must be
		// exactly its lower-case form.
		if tr < utf8.RuneSelf && 'A' <= sr && sr <= 'Z' {
			if tr != sr+'a'-'A' {
				return false
			}
			continue
		}

		// General case: walk sr's fold cycle (SimpleFold yields the next
		// equivalent rune, wrapping around to smaller values) looking
		// for tr.
		r := unicode.SimpleFold(sr)
		for r != sr && r < tr {
			r = unicode.SimpleFold(r)
		}
		if r != tr {
			return false
		}
	}

	// Equal only if both slices were fully consumed.
	return len(s) == len(t)
}

// firstRune decodes the leading rune of b and returns it along with the
// remainder of the slice; b must be non-empty.
func firstRune(b []byte) (rune, []byte) {
	if b[0] < utf8.RuneSelf {
		return rune(b[0]), b[1:]
	}
	r, size := utf8.DecodeRune(b)
	return r, b[size:]
}
Example #17
0
// NewQuotedScanner creates a bufio.Scanner that splits its input into
// words, where a word is either a run of non-space runes or a quoted
// string (quotes per IsQuote). Spaces inside quotes do not end the token,
// and the quotes themselves are kept in the returned token.
func NewQuotedScanner(r io.Reader) *bufio.Scanner {
	scanner := bufio.NewScanner(r)
	split := func(data []byte, atEOF bool) (advance int, token []byte, err error) {
		// Skip leading spaces.
		start := 0
		for width := 0; start < len(data); start += width {
			var r rune
			r, width = utf8.DecodeRune(data[start:])
			if !unicode.IsSpace(r) {
				break
			}
		}

		// Does word start with a quote?
		// NOTE(review): when start == len(data) this decodes an empty
		// slice, yielding (utf8.RuneError, 0) — harmless here, since
		// RuneError is not a quote and the scan loop below will not run.
		quote, width := utf8.DecodeRune(data[start:])
		i := start
		if IsQuote(quote) {
			log.Debugf("Quote detected '%c'", quote)
			i = i + width
		} else {
			// Not quoted: quote == 0 marks plain-word mode below.
			quote = 0

		}

		// Scan until space, marking end of word.
		for width := 0; i < len(data); i += width {
			var r rune
			r, width = utf8.DecodeRune(data[i:])
			if quote == 0 {
				if unicode.IsSpace(r) {
					return i + width, data[start:i], nil
				}
			} else {
				// Look for ending quote
				// BUG: need to implement escape handling
				if r == quote {
					// Back to plain-word mode: the token now ends at the
					// next space (closing quote stays in the token).
					log.Debugf("Found end quote %d chars after start", i)
					quote = 0
				}
			}
		}
		// If we're at EOF, we have a final, non-empty, non-terminated word. Return it.
		if atEOF && len(data) > start {
			return len(data), data[start:], nil
		}
		// Request more data.
		return start, nil, nil
	}
	scanner.Split(split)
	return scanner
}
Example #18
0
// Open prepares r for CSV parsing, applying the configured field
// delimiter, comment rune, and fields-per-record count to the csv.Reader.
// Always returns nil.
func (f *commaSeparated) Open(r io.Reader) error {
	f.reader = r
	f.csvReader = csv.NewReader(r)

	// Only the first rune of each configuration string is used.
	// DecodeRuneInString avoids the needless []byte conversion and copy
	// the old utf8.DecodeRune([]byte(...)) form performed.
	if f.FieldDelim != "" {
		f.csvReader.Comma, _ = utf8.DecodeRuneInString(f.FieldDelim)
	}
	if f.Comment != "" {
		f.csvReader.Comment, _ = utf8.DecodeRuneInString(f.Comment)
	}
	f.csvReader.FieldsPerRecord = f.NumFields

	return nil
}
Example #19
0
// findUnescaped returns the byte index of the first occurrence of toFind
// in data at or after from, skipping any occurrence immediately preceded
// by escape (the escape consumes the following rune). Returns -1 if no
// unescaped occurrence exists.
func findUnescaped(toFind, escape rune, data []byte, from int) int {
	i := from
	for i < len(data) {
		r, sz := utf8.DecodeRune(data[i:])
		switch r {
		case escape:
			// Skip the escape rune and whatever follows it. (Decoding an
			// empty tail yields size 0, which is safe.)
			i += sz
			_, nsz := utf8.DecodeRune(data[i:])
			i += nsz
		case toFind:
			return i
		default:
			i += sz
		}
	}
	return -1
}
Example #20
0
// splitFunc is a wrapper around bufio.SplitFunc that calculates line and
// column information for tokens.
func (t *Tokenizer) splitFunc() bufio.SplitFunc {
	// line/column track the cursor position across calls to the returned
	// closure; lastLine/lastColumn remember where the current token began
	// (-1 is the "not yet recorded" sentinel).
	line, column := 1, 1
	lastLine, lastColumn := -1, -1
	return func(data []byte, atEOF bool) (advance int, token []byte, err error) {
		// Skip leading non-token runes (t.tokenizerFunc reports true for
		// runes that belong to a token), updating line/column as we go.
		start := 0
		for width := 0; start < len(data); start += width {
			var r rune
			r, width = utf8.DecodeRune(data[start:])
			if t.tokenizerFunc(r) {
				break
			}
			if r == '\n' {
				line += 1
				column = 1
			} else {
				column += 1
			}
		}
		// Record the token's starting position once per token.
		if lastLine == -1 {
			lastLine, lastColumn = line, column
		}
		// Scan until first non-character, marking end of token.
		for width, i := 0, start; i < len(data); i += width {
			var r rune
			r, width = utf8.DecodeRune(data[i:])
			// NOTE(review): position is updated before the terminator test,
			// so the delimiting rune is counted even though it is not part
			// of the returned token — confirm this is intended.
			if r == '\n' {
				line += 1
				column = 1
			} else {
				column += 1
			}
			if !t.tokenizerFunc(r) {
				// Publish the token's start position and reset the sentinel.
				t.line, t.column = lastLine, lastColumn
				lastLine, lastColumn = -1, -1
				return i + width, data[start:i], nil
			}
		}
		// If we're at EOF, we have a final, non-empty, non-terminated token. Return it.
		if atEOF && len(data) > start {
			t.line, t.column = lastLine, lastColumn
			lastLine, lastColumn = -1, -1
			return len(data), data[start:], nil
		}
		// Request more data.
		return start, nil, nil
	}
}
Example #21
0
// TestScanRune verifies that the rune splitter yields exactly the same
// sequence of runes (not bytes) as a range loop over the input string.
func TestScanRune(t *testing.T) {
	for n, test := range scanTests {
		s := NewScanner(bytes.NewBufferString(test))
		s.Split(ScanRunes)
		runeCount := 0
		var i int
		var expect rune
		// Use a string range loop to validate the sequence of runes.
		for i, expect = range string(test) {
			if !s.Scan() {
				break
			}
			runeCount++
			got, _ := utf8.DecodeRune(s.Bytes())
			if got != expect {
				t.Errorf("#%d: %d: expected %q got %q", n, i, expect, got)
			}
		}
		// The scanner must be exhausted exactly when the string is.
		if s.Scan() {
			t.Errorf("#%d: scan ran too long, got %q", n, s.Text())
		}
		if want := utf8.RuneCountInString(test); runeCount != want {
			t.Errorf("#%d: termination expected at %d; got %d", n, want, runeCount)
		}
		if err := s.Err(); err != nil {
			t.Errorf("#%d: %v", n, err)
		}
	}
}
Example #22
0
File: scan.go Project: Greentor/go
// ScanRunes is a split function for a Scanner that returns each
// UTF-8-encoded rune as a token. The sequence of runes returned is
// equivalent to that from a range loop over the input as a string, which
// means that erroneous UTF-8 encodings translate to U+FFFD = "\xef\xbf\xbd".
// Because of the Scan interface, this makes it impossible for the client to
// distinguish correctly encoded replacement runes from encoding errors.
func ScanRunes(data []byte, atEOF bool) (advance int, token []byte, err error) {
	if atEOF && len(data) == 0 {
		return 0, nil, nil
	}

	// Fast path 1: an ASCII byte is its own rune.
	if data[0] < utf8.RuneSelf {
		return 1, data[:1], nil
	}

	// Fast path 2: a multi-byte decode that succeeds reports width > 1
	// (width cannot be 1 for a correctly encoded non-ASCII rune).
	if _, width := utf8.DecodeRune(data); width > 1 {
		return width, data[:width], nil
	}

	// width == 1 implies RuneError: either a truncated rune or a genuine
	// encoding error. FullRune distinguishes the two.
	if !atEOF && !utf8.FullRune(data) {
		// Incomplete rune; ask for more input.
		return 0, nil, nil
	}

	// Genuine UTF-8 encoding error: emit a properly encoded U+FFFD but
	// consume only one byte, mirroring a range loop over an invalid string.
	return 1, errorRune, nil
}
Example #23
0
// utf7enc converts string s from UTF-8 to UTF-16-BE, encodes the result as
// Base64, removes the padding, and adds UTF-7 shifts.
func utf7enc(s []byte) []byte {
	// len(s) is sufficient for UTF-8 to UTF-16 conversion if there are no
	// control code points (see table below).
	b := make([]byte, 0, len(s)+4)
	for len(s) > 0 {
		r, size := utf8.DecodeRune(s)
		if r > utf8.MaxRune {
			r, size = utf8.RuneError, 1 // Bug fix (issue 3785)
		}
		s = s[size:]
		// utf16.EncodeRune yields a surrogate pair for runes outside the
		// BMP; otherwise it returns a pair of replacement runes
		// (presumably uRepl == U+FFFD — confirm against its declaration).
		if r1, r2 := utf16.EncodeRune(r); r1 != uRepl {
			// Surrogate pair: emit the high surrogate here; the low
			// surrogate (r2) is emitted by the append below.
			b = append(b, byte(r1>>8), byte(r1))
			r = r2
		}
		// Big-endian 16-bit code unit.
		b = append(b, byte(r>>8), byte(r))
	}

	// Encode as Base64, reserving one byte on each side for the UTF-7
	// shift characters.
	n := u7enc.EncodedLen(len(b)) + 2
	b64 := make([]byte, n)
	u7enc.Encode(b64[1:], b)

	// Strip padding: drop the '=' bytes base64 appended to round the
	// input length up to a multiple of 3.
	n -= 2 - (len(b)+2)%3
	b64 = b64[:n]

	// Add UTF-7 shifts
	b64[0] = '&'
	b64[n-1] = '-'
	return b64
}
Example #24
0
// toLowerDeferredCopy lower-cases s like bytes.ToLower but reuses
// (overwrites) the input's backing array whenever the lower-case form of
// every rune is no wider than the original encoding. When a lower-case
// rune needs more bytes than its source rune, the remainder is delegated
// to bytes.ToLower and a freshly allocated slice is returned.
func toLowerDeferredCopy(s []byte) []byte {
	src, dst := 0, 0
	for src < len(s) {
		// ASCII fast path; otherwise decode the full rune.
		r, rw := rune(s[src]), 1
		if r >= utf8.RuneSelf {
			r, rw = utf8.DecodeRune(s[src:])
		}
		low := unicode.ToLower(r)
		lw := utf8.RuneLen(low)
		if lw > rw {
			// The lower-case encoding is wider (known cases:
			//   Ⱥ(570) width 2 → ⱥ(11365) width 3
			//   Ⱦ(574) width 2 → ⱦ(11366) width 3),
			// so the in-place strategy no longer fits. Build a new slice
			// from the already-converted prefix plus bytes.ToLower of the
			// remainder.
			tail := bytes.ToLower(s[src:])
			out := make([]byte, dst+len(tail))
			copy(out[:dst], s[:dst])
			copy(out[dst:], tail)
			return out
		}
		utf8.EncodeRune(s[dst:], low)
		src += rw
		dst += lw
	}
	return s[:dst]
}
Example #25
0
// replace walks l.content rune by rune, feeding runes into the prefix
// matcher and letting checkCandidate emit replacements on full matches.
func (l *contentlexer) replace() {
	contentLength := len(l.content)
	var r rune

	for {
		if l.pos >= contentLength {
			// All content consumed.
			l.width = 0
			break
		}

		// ASCII fast path; fall back to full UTF-8 decoding otherwise.
		var width int = 1
		r = rune(l.content[l.pos])
		if r >= utf8.RuneSelf {
			r, width = utf8.DecodeRune(l.content[l.pos:])
		}
		l.width = width
		l.pos += l.width
		if r == ' ' {
			// Whitespace (re)arms the prefix matcher.
			l.prefixLookup.ms = matchStateWhitespace
		} else if l.prefixLookup.ms != matchStateNone {
			// An in-progress match: advance it, and on a full prefix
			// match let checkCandidate decide whether to replace.
			l.match(r)
			if l.prefixLookup.ms == matchStateFull {
				checkCandidate(l)
			}
		}

	}

	// Done! Flush any content not yet emitted.
	if l.pos > l.start {
		l.emit()
	}
}
Example #26
0
// checkCandidate runs when the prefix matcher reports a full prefix: it
// tests each configured matcher against the upcoming content and, on a
// hit, emits the pending content followed by the matcher's replacement.
func checkCandidate(l *contentlexer) {
	// NOTE(review): the first prefix rune being 's' appears to select
	// "source"-type matchers — confirm against the matcher definitions.
	isSource := l.prefixLookup.first == 's'
	for _, m := range l.matchers {

		// Only consider matchers of the same kind as the current prefix.
		if isSource && !m.isSourceType() || !isSource && m.isSourceType() {
			continue
		}

		if bytes.HasPrefix(l.content[l.pos:], m.match) {
			// check for schemaless URLs
			posAfter := l.pos + len(m.match)
			if posAfter >= len(l.content) {
				return
			}
			r, _ := utf8.DecodeRune(l.content[posAfter:])
			if r == '/' {
				// schemaless: skip
				return
			}
			// Flush pending content, then substitute the replacement for
			// the matched bytes and restart accumulation after them.
			if l.pos > l.start {
				l.emit()
			}
			l.pos += len(m.match)
			l.w.Write(m.replacement)
			l.start = l.pos
			return

		}
	}
}
Example #27
0
// read reads blocks of up to 4096 bytes from f, decoding them as UTF-8 and
// sending each completed line (without its '\n') to t.lines. The trailing
// unterminated line is returned so the caller can pass it back in on the
// next call; the error from the underlying Read is returned as-is.
func (t *Tailer) read(f afero.File, partialIn string) (partialOut string, err error) {
	partial := partialIn
	b := make([]byte, 0, 4096)
	for {
		n, err := f.Read(b[:cap(b)])
		b = b[:n]

		// BUG FIX: process the bytes BEFORE checking the error. io.Reader
		// permits n > 0 together with a non-nil error (including io.EOF);
		// the old code returned immediately and dropped those bytes.
		for i := 0; i < len(b); {
			r, width := utf8.DecodeRune(b[i:])
			i += width
			if r == '\n' {
				// send off line for processing, then reset the accumulator
				t.lines <- partial
				partial = ""
			} else {
				partial += string(r)
			}
		}

		if err != nil {
			return partial, err
		}
	}
}
Example #28
0
// nextChar advances the tokenizer by one rune. It reports false at end of
// input (setting t.r to eof and t.offset to len(t.src)); otherwise it
// leaves the decoded rune in t.r, its start index in t.offset, and the
// index of the following byte in t.readOffset.
func (t *tokenizer) nextChar() bool {
	if t.readOffset < len(t.src) {
		t.offset = t.readOffset
		ch := t.src[t.readOffset]

		// Assume a single-byte ASCII rune, then fall back to UTF-8 decoding.
		r, w := rune(ch), 1
		switch {
		case r == 0:
			t.error("illegal character NUL")
		case r >= 0x80:
			// not ASCII
			r, w = utf8.DecodeRune(t.src[t.offset:])
			// DecodeRune signals invalid input as (RuneError, 1).
			if r == utf8.RuneError && w == 1 {
				t.error("illegal UTF-8 encoding")
			} else if r == bom && t.offset > 0 {
				// A byte order mark is only tolerated at the very start.
				t.error("illegal byte order mark")
			}
		}

		// Track line numbers for diagnostics.
		if ch == '\n' {
			t.lineno++
		}

		t.r = r
		t.readOffset += w
		return true
	}

	t.r = eof
	t.offset = len(t.src)
	return false
}
Example #29
0
// Transform implements transform.Transformer: it decodes src rune by rune,
// applies t to each rune, and writes the UTF-8 encoding of the result to
// dst. nDst/nSrc report the bytes produced/consumed; ErrShortSrc and
// ErrShortDst ask the framework to call again with more data/room.
func (t replaceTransformer) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
	var runeBytes [utf8.UTFMax]byte
	for r, sz := rune(0), 0; len(src) > 0; src = src[sz:] {

		// ASCII fast path; otherwise decode a full UTF-8 rune.
		if r = rune(src[0]); r < utf8.RuneSelf {
			sz = 1
		} else {
			r, sz = utf8.DecodeRune(src)

			if sz == 1 {
				// Invalid rune.
				if !atEOF && !utf8.FullRune(src) {
					// Possibly just a truncated rune: wait for more input
					// instead of replacing it.
					err = transform.ErrShortSrc
					break
				}
			}
		}

		// Encode the mapped rune into a scratch buffer first, so a short
		// dst leaves nDst/nSrc consistent with what was actually written.
		dsz := utf8.EncodeRune(runeBytes[:], t(r))
		if nDst+dsz > len(dst) {
			err = transform.ErrShortDst
			break
		}

		nDst += copy(dst[nDst:], runeBytes[:dsz])
		nSrc += sz
	}
	return
}
Example #30
0
// Map returns a copy of the byte array s with every rune replaced by
// mapping(r); runes for which mapping returns a negative value are
// dropped with no replacement. The characters in s and the output are
// interpreted as UTF-8-encoded Unicode code points.
func Map(mapping func(r rune) rune, s []byte) []byte {
	// Start with a buffer the size of the input; it can grow if mapped
	// runes encode wider than the originals (rare, so we barge in
	// assuming it's fine — shrinking falls out naturally).
	out := make([]byte, len(s))
	n := 0 // bytes of out in use
	for i := 0; i < len(s); {
		// ASCII fast path; otherwise decode the full rune.
		width := 1
		r := rune(s[i])
		if r >= utf8.RuneSelf {
			r, width = utf8.DecodeRune(s[i:])
		}
		i += width
		r = mapping(r)
		if r < 0 {
			continue // dropped
		}
		if n+utf8.RuneLen(r) > len(out) {
			// Grow geometrically, with headroom for one max-width rune.
			bigger := make([]byte, len(out)*2+utf8.UTFMax)
			copy(bigger, out[:n])
			out = bigger
		}
		n += utf8.EncodeRune(out[n:], r)
	}
	return out[:n]
}