// ExampleRuneLen prints the number of bytes needed to UTF-8 encode an ASCII
// letter and a CJK ideograph.
func ExampleRuneLen() {
	for _, r := range []rune{'a', '界'} {
		fmt.Println(utf8.RuneLen(r))
	}
	// Output:
	// 1
	// 3
}
// RankMatch is similar to Match except it will measure the Levenshtein
// distance between the source and the target and return its result. If there
// was no match, it will return -1.
//
// Given the requirements of match, RankMatch only needs to perform a subset of
// the Levenshtein calculation: only deletions need be considered, because
// required additions and substitutions would fail the match test.
//
// The distance is counted in runes, not bytes.
func RankMatch(source, target string) int {
	lenDiff := len(target) - len(source)
	if lenDiff < 0 {
		return -1
	}
	if lenDiff == 0 && source == target {
		return 0
	}

	runeDiff := 0

Outer:
	for _, r1 := range source {
		for i, r2 := range target {
			if r1 == r2 {
				// Advance by the width actually occupied in target; for an
				// invalid byte (decoded as U+FFFD) that is 1, not RuneLen.
				_, size := utf8.DecodeRuneInString(target[i:])
				target = target[i+size:]
				continue Outer
			}
			// Every skipped rune of target is one deletion.
			runeDiff++
		}
		return -1
	}

	// Whatever is left of target after the last match must be deleted too.
	return runeDiff + utf8.RuneCountInString(target)
}
// normalizeBidi attempts to prevent names from using bidi control codes to // screw up our layout func normalizeBidi(name string) string { bidiExplicitDepth := 0 bidiIsolateDepth := 0 for _, c := range name { switch c { case ltrEmbed, rtlEmbed, ltrOverride, rtlOverride: bidiExplicitDepth++ case bidiExplicitPop: bidiExplicitDepth-- case ltrIsolate, rtlIsolate, fsIsolate: bidiIsolateDepth++ case bidiIsolatePop: bidiIsolateDepth-- } } if bidiExplicitDepth+bidiIsolateDepth > 0 { pops := make([]byte, bidiExplicitDepth*utf8.RuneLen(bidiExplicitPop)+bidiIsolateDepth+utf8.RuneLen(bidiIsolatePop)) i := 0 for ; bidiExplicitDepth > 0; bidiExplicitDepth-- { i += utf8.EncodeRune(pops[i:], bidiExplicitPop) } for ; bidiIsolateDepth > 0; bidiIsolateDepth-- { i += utf8.EncodeRune(pops[i:], bidiIsolatePop) } return name + string(pops[:i]) } return name }
// reverse appears to reverse the order of the UTF-8 encoded runes stored in s
// in place, using only a small queue as scratch space, and returns s.
//
// NOTE(review): subslice, queue, frontInsertSpace and backInsertSpace are
// defined elsewhere; this description relies on their names and on the
// comments below — confirm against their definitions.
func reverse(s []byte) []byte {
	// Both views start out covering all of s.
	rest := subslice{s, 0, len(s)}
	result := subslice{s, 0, len(s)}
	// note: que is a []rune of size at most 4
	que := queue{}
	for !rest.empty() {
		// Take an element from the back...
		r := rest.popBack()
		// ...and push elements from the front onto the queue until there is
		// enough free space at the front.
		for !rest.empty() && frontInsertSpace(rest, result) < utf8.RuneLen(r) {
			que.push(rest.popFront())
		}
		// Move the element taken from the back to the front.
		result.pushFront(r)
		// Pack as many of the elements taken from the front as will fit into
		// the back, in reverse order.
		for len(que) > 0 {
			if backInsertSpace(rest, result) < utf8.RuneLen(que.front()) {
				break
			}
			result.pushBack(que.front())
			que.pop()
		}
	}
	// Once there is nothing left to take, append the queue's remaining
	// elements into the leftover gap at the back, in reverse order.
	for len(que) > 0 {
		result.pushBack(que.front())
		que.pop()
	}
	return s
}
func TestCharcount(t *testing.T) { var tests = []struct { input string counts map[rune]int utflen [utf8.UTFMax + 1]int invalid int }{ {"Hello", map[rune]int{'H': 1, 'e': 1, 'l': 2, 'o': 1}, utflenFromMap(map[int]int{1: 5}), 0}, {"あ", map[rune]int{'あ': 1}, utflenFromMap(map[int]int{utf8.RuneLen('あ'): 1}), 0}, { "あiueお", map[rune]int{'あ': 1, 'i': 1, 'u': 1, 'e': 1, 'お': 1}, utflenFromMap(map[int]int{utf8.RuneLen('あ'): 2, 1: 3}), 0, }, } for _, test := range tests { in := bufio.NewReader(strings.NewReader(test.input)) counts, utflen, invalid, err := charcount(in) if !reflect.DeepEqual(counts, test.counts) || utflen != test.utflen || invalid != test.invalid || err != nil { t.Errorf("charcount(%q) = %v, %v, %v, %v ; want %v, %v, %v, nil", test.input, counts, utflen, invalid, err, test.counts, test.utflen, test.invalid) } } }
// htmlReplacer returns s with runes replaced according to replacementTable
// and when badRunes is true, certain bad runes are allowed through unescaped.
//
// A rune r below len(replacementTable) is replaced by replacementTable[r]
// when that entry is non-empty. When badRunes is false, runes in
// U+FDD0..U+FDEF and U+FFF0..U+FFFF are written as hexadecimal character
// references. If nothing is replaced, s itself is returned.
func htmlReplacer(s string, replacementTable []string, badRunes bool) string {
	written, b := 0, new(bytes.Buffer)
	r, w := rune(0), 0
	for i := 0; i < len(s); i += w {
		// Cannot use 'for range s' because the width of the rune in the input
		// is needed. On a decoding error the input width is 1 while
		// utf8.RuneLen(utf8.RuneError) is 3, which would skip or overrun s.
		r, w = utf8.DecodeRuneInString(s[i:])
		if int(r) < len(replacementTable) {
			if repl := replacementTable[r]; len(repl) != 0 {
				b.WriteString(s[written:i])
				b.WriteString(repl)
				written = i + w
			}
		} else if badRunes {
			// No-op.
			// IE does not allow these ranges in unquoted attrs.
		} else if 0xfdd0 <= r && r <= 0xfdef || 0xfff0 <= r && r <= 0xffff {
			fmt.Fprintf(b, "%s&#x%x;", s[written:i], r)
			written = i + w
		}
	}
	if written == 0 {
		return s
	}
	b.WriteString(s[written:])
	return b.String()
}
// compileGlobPattern takes a given pattern string consisting of typical
// wildcard characters *, ?, or any literal string and returns a compiled slice
// of scanner functions.
//
// Any character in the pattern string can be escaped using a backslash to
// produce the literal character following it rather than a special character.
//
// The returned slice always ends with a globEnd scanner positioned at
// len(pattern). ErrInvalidGlobSequence is returned when a '*' or '?' directly
// follows a '*'.
func compileGlobPattern(pattern string) ([]*globScanner, error) {
	// compile scanner function array
	wildcards := make([]*globScanner, 0, 4)
	for index, code := range pattern {
		var fn scanFunc = nil
		// start == -1 means "not decided yet"; it is resolved below to the
		// byte offset just past the current rune.
		var start int = -1
		var kind globKind
		// Only a backslash, '*', '?' or an ordinary first rune opens a new
		// scanner. Every other rune is skipped here and is picked up later as
		// part of the preceding scanner's substr.
		switch {
		case code == '\\':
			fn = consumeSubstring
			kind = globString
		case code == '*':
			fn = consumeAllPreceding
			kind = globMany
		case code == '?':
			fn = consumeOnePreceding
			kind = globOne
		case index == 0:
			// A literal at the very start: its substring begins at this rune
			// rather than after it.
			fn = consumeSubstring
			start = index
			kind = globString
		default:
			continue
		}
		numWildcards := len(wildcards)
		if numWildcards > 0 {
			last := wildcards[numWildcards-1]
			if (kind == globOne || kind == globMany) && last.kind == globMany && last.start == index {
				// A wildcard immediately after a '*', with nothing between.
				return nil, ErrInvalidGlobSequence
			} else if code == '\\' && len(last.substr) == 0 {
				// No new scanner for this backslash: the previous scanner's
				// start is moved past it instead.
				// NOTE(review): presumably this is what makes the escaped
				// rune a literal of the previous scanner — confirm.
				last.start += utf8.RuneLen(code)
				continue
			} else {
				// Close the previous scanner: it owns the text up to here.
				last.substr = pattern[last.start:index]
			}
		}
		if start == -1 {
			start = index + utf8.RuneLen(code)
		}
		wildcards = append(wildcards, &globScanner{fn, kind, "", start})
	}
	// The last scanner owns whatever remains of the pattern.
	numWildcards := len(wildcards)
	if numWildcards > 0 {
		last := wildcards[numWildcards-1]
		last.substr = pattern[last.start:]
	}
	wildcards = append(wildcards, &globScanner{consumeEnd, globEnd, "", len(pattern)})
	return wildcards, nil
}
// Write a rune to the underlying slice. If the rune is invalid, then the // RuneError symbol is written. The rune is only written if there is available // buffer space, otherwise ErrShortWrite is returned. func (w *Writer) WriteRune(r rune) (cnt int, err error) { cnt = utf8.RuneLen(r) if cnt == -1 { r = utf8.RuneError cnt = utf8.RuneLen(r) } if availCnt := int64(len(w.buf)) - w.idx; availCnt < int64(cnt) { return 0, io.ErrShortWrite } cnt = utf8.EncodeRune(w.buf[w.idx:], r) w.idx += int64(cnt) return cnt, nil }
/** * Lexer::BackupRunes */ func (l *lexer) BackupRunes(n int) { for ; n > 0; n-- { if l.pos > 0 { l.pos-- i := l.runes.Peek(l.pos) // 0-based r := i.(rune) l.tokenLen -= utf8.RuneLen(r) l.column -= utf8.RuneLen(r) } else { panic("Underflow Exception") } } }
func (p *parser) handlePreEscape(char rune) { switch char { case '[': p.instructionStartedAt = p.cursor + utf8.RuneLen('[') p.instructions = make([]string, 0, 1) p.mode = MODE_ESCAPE case ']': p.instructionStartedAt = p.cursor + utf8.RuneLen('[') p.mode = MODE_ITERM_ESCAPE default: // Not an escape code, false alarm p.cursor = p.escapeStartedAt p.mode = MODE_NORMAL } }
func (self *_lexer) skip(count int) { read := self.readIn[self.tail : self.tail+count] for _, chr := range read { self.tail += 1 self.tailOffset += utf8.RuneLen(chr) } }
func (p *Parser) parseShort(s *parseState, optname string, argument *string) (option *Option, err error) { if argument == nil { optname, argument = p.splitShortConcatArg(s, optname) } for i, c := range optname { shortname := string(c) if option = s.lookup.shortNames[shortname]; option != nil { // Only the last short argument can consume an argument from // the arguments list, and only if it's non optional canarg := (i+utf8.RuneLen(c) == len(optname)) && !option.OptionalArgument if _, err := p.parseOption(s, shortname, option, canarg, argument); err != nil { return option, err } } else { return nil, newError(ErrUnknownFlag, fmt.Sprintf("unknown flag `%s'", shortname)) } // Only the first option can have a concatted argument, so just // clear argument here argument = nil } return option, nil }
// main decodes the first rune of a sample string and prints the rune, its
// decoded width, the width reported by utf8.RuneLen and whether the whole
// string is valid UTF-8.
func main() {
	const s = "¶ Greetings!"

	first, width := utf8.DecodeRuneInString(s)
	fmt.Printf("rune %c length %d = %d ok %t\n",
		first, width, utf8.RuneLen(first), utf8.ValidString(s))
}
func splitPathOnSeparator(path string, separator rune) []string { // if the separator is '\\', then we can just split... if separator == '\\' { return strings.Split(path, string(separator)) } // otherwise, we need to be careful of situations where the separator was escaped cnt := strings.Count(path, string(separator)) if cnt == 0 { return []string{path} } ret := make([]string, cnt+1) pathlen := len(path) separatorLen := utf8.RuneLen(separator) idx := 0 for start := 0; start < pathlen; { end := indexRuneWithEscaping(path[start:], separator) if end == -1 { end = pathlen } else { end += start } ret[idx] = path[start:end] start = end + separatorLen idx++ } return ret[:idx] }
// processEscape processes a single escape sequence and returns number of bytes processed. func (r *Lexer) processEscape(data []byte) (int, error) { if len(data) < 2 { return 0, fmt.Errorf("syntax error at %v", string(data)) } c := data[1] switch c { case '"', '/', '\\': r.token.byteValue = append(r.token.byteValue, c) return 2, nil case 'b': r.token.byteValue = append(r.token.byteValue, '\b') return 2, nil case 'f': r.token.byteValue = append(r.token.byteValue, '\f') return 2, nil case 'n': r.token.byteValue = append(r.token.byteValue, '\n') return 2, nil case 'r': r.token.byteValue = append(r.token.byteValue, '\r') return 2, nil case 't': r.token.byteValue = append(r.token.byteValue, '\t') return 2, nil case 'u': default: return 0, fmt.Errorf("syntax error") } var val rune for i := 2; i < len(data) && i < 6; i++ { var v byte c = data[i] switch c { case '0', '1', '2', '3', '4', '5', '6', '7', '8', '9': v = c - '0' case 'a', 'b', 'c', 'd', 'e', 'f': v = c - 'a' + 10 case 'A', 'B', 'C', 'D', 'E', 'F': v = c - 'A' + 10 default: return 0, fmt.Errorf("syntax error") } val <<= 4 val |= rune(v) } l := utf8.RuneLen(val) if l == -1 { return 0, fmt.Errorf("invalid unicode escape") } var d [4]byte utf8.EncodeRune(d[:], val) r.token.byteValue = append(r.token.byteValue, d[:l]...) return 6, nil }
// Squash squashes each run of adjacent Unicode spaces in a UTF-8
// encoded []byte slice into a single ASCII space. It is an "in-place"
// function (see 4.22, p. 91) in that it modifies elements of the
// slice "in-place".
//
// The returned slice shares input's backing array and is never longer than
// input. Bytes that are not part of a space run are kept exactly as they
// were, including 4-byte runes and bytes that are not valid UTF-8.
func Squash(input []byte) []byte {
	write := 0
	prevSpace := false // was the previous rune a space?
	for read := 0; read < len(input); {
		r, size := utf8.DecodeRune(input[read:])
		if unicode.IsSpace(r) {
			if !prevSpace {
				input[write] = ' ' // the whole run becomes one ASCII space
				write++
			}
			prevSpace = true
		} else {
			// write never overtakes read, so moving the original bytes down
			// cannot clobber input that has not been decoded yet.
			write += copy(input[write:], input[read:read+size])
			prevSpace = false
		}
		read += size
	}
	return input[:write]
}
// Validate tests whether the stream name is valid. func (s StreamName) Validate() error { if len(s) == 0 { return errors.New("Must contain at least one character.") } var lastRune rune var segmentIdx int for idx, r := range s { // Alphanumeric. if !isAlnum(r) { // The stream name must begin with an alphanumeric character. if idx == segmentIdx { return fmt.Errorf("Segment (at %d) must begin with alphanumeric character.", segmentIdx) } // Test forward slash, and ensure no adjacent forward slashes. if r == StreamNameSep { segmentIdx = idx + utf8.RuneLen(r) } else if !(r == '.' || r == '_' || r == '-' || r == ':') { // Test remaining allowed characters. return fmt.Errorf("Illegal charater (%c) at index %d.", r, idx) } } lastRune = r } // The last rune may not be a separator. if lastRune == StreamNameSep { return errors.New("Name may not end with a separator.") } return nil }
// urlEncodePathUnreserved matches path names made up solely of characters
// that need no escaping. It is compiled once rather than on every call.
var urlEncodePathUnreserved = regexp.MustCompile("^[a-zA-Z0-9-_.~/]+$")

// urlEncodePath encodes pathName by percent-escaping the UTF-8 bytes of
// every rune that is not an unreserved character.
//
// ASCII letters, digits and '-', '_', '.', '~', '/' are copied through
// unchanged; every other rune is written as one upper-case %XX escape per
// UTF-8 byte. This is used instead of the net/url helpers so that '/' is
// preserved and non-ASCII characters are escaped byte by byte.
//
// If a rune cannot be encoded as UTF-8 the original pathName is returned
// as is.
func urlEncodePath(pathName string) string {
	// if object matches reserved string, no need to encode them
	if urlEncodePathUnreserved.MatchString(pathName) {
		return pathName
	}

	encoded := make([]byte, 0, len(pathName))
	var scratch [utf8.UTFMax]byte
	for _, s := range pathName {
		switch {
		case 'A' <= s && s <= 'Z', 'a' <= s && s <= 'z', '0' <= s && s <= '9',
			s == '-', s == '_', s == '.', s == '~', s == '/':
			// §2.3 Unreserved characters (mark)
			encoded = append(encoded, byte(s))
		default:
			if utf8.RuneLen(s) < 0 {
				// if utf8 cannot convert return the same string as is
				return pathName
			}
			n := utf8.EncodeRune(scratch[:], s)
			// Two upper-case hex digits per encoded byte.
			digits := strings.ToUpper(hex.EncodeToString(scratch[:n]))
			for k := 0; k < len(digits); k += 2 {
				encoded = append(encoded, '%', digits[k], digits[k+1])
			}
		}
	}
	return string(encoded)
}
// toLowerDeferredCopy lower-cases s the way bytes.ToLower() does, but
// overwrites the original byte array for as long as that is possible.
// NOTE: the lower-case form of a rune may have a longer UTF-8 encoding than
// the rune itself; as soon as that happens a new byte array is allocated for
// the result and the remainder is handed to bytes.ToLower().
func toLowerDeferredCopy(s []byte) []byte {
	out := 0
	for in := 0; in < len(s); {
		r, size := rune(s[in]), 1
		if r >= utf8.RuneSelf {
			r, size = utf8.DecodeRune(s[in:])
		}

		lower := unicode.ToLower(r)
		lowerSize := utf8.RuneLen(lower)

		if lowerSize > size {
			// utf-8 encoded replacement is wider
			// for now, punt and defer
			// to bytes.ToLower() for the remainder
			// only known to happen with chars
			//   Rune Ⱥ(570) width 2 - Lower ⱥ(11365) width 3
			//   Rune Ⱦ(574) width 2 - Lower ⱦ(11366) width 3
			tail := bytes.ToLower(s[in:])
			grown := make([]byte, 0, out+len(tail))
			grown = append(grown, s[:out]...)
			return append(grown, tail...)
		}

		utf8.EncodeRune(s[out:], lower)
		in += size
		out += lowerSize
	}
	return s[:out]
}
// setRune writes a rune at the given point, relative to the view. It // checks if the position is valid and applies the view's colors, taking // into account if the cell must be highlighted. func (v *View) getRuneLen(ch rune) int { if utf8.RuneLen(ch) > 1 { return 2 } else { return 1 } }
// takeRune counts a rune towards line/column/offset measurements for tokens. Unless an error occurs, all runes read should be passed, // in order, to this function. func (l *Lexer) takeRune(r rune, size int) error { if size <= 0 { size = utf8.RuneLen(r) } if size == -1 { panic(fmt.Errorf("Lexer: takeRune: rune %q has encoded length %d", r, size)) } l.loc.Off += size if isNewline(l.last) { l.loc.Line++ l.loc.Col = 1 } else { l.loc.Col++ } if l.shouldCaptureReads() { n, err := writeRune(&l.buf, r) if size != n || err != nil { log.Printf("Error buffering rune %q: (size=%d written=%d) %v", r, size, n, err) } return err } return nil }
// readConsole reads utf16 characters from console File, // encodes them into utf8 and stores them in buffer b. // It returns the number of utf8 bytes read and an error, if any. func (f *File) readConsole(b []byte) (n int, err error) { if len(b) == 0 { return 0, nil } if len(f.readbuf) == 0 { // get more input data from os wchars := make([]uint16, len(b)) var p *uint16 if len(b) > 0 { p = &wchars[0] } var nw uint32 err := syscall.ReadConsole(f.fd, p, uint32(len(wchars)), &nw, nil) if err != nil { return 0, err } f.readbuf = utf16.Decode(wchars[:nw]) } for i, r := range f.readbuf { if utf8.RuneLen(r) > len(b) { f.readbuf = f.readbuf[i:] return n, nil } nr := utf8.EncodeRune(b, r) b = b[nr:] n += nr } f.readbuf = nil return n, nil }
func (f *Freq) getDAGbyTree(src string) (dag []*FIS, idxs []int) { dag = make([]*FIS, len(src)) idxs = make([]int, 0, len(src)+1) for i := range src { idxs = append(idxs, i) p := f.data l := FIS{make([]*FI, 0)} for j, c := range src[i:] { q, ok := p.next[c] //if ! ok { break } r := 0.0 if ok { p = q r = p.freq } if r == 0 && j == 0 { r = f.minf // always add first rune, even not found } if r > 0 { e := i + j + utf8.RuneLen(c) fi := FI{i, e, r, j == 0} l.data = append(l.data, &fi) } if !ok { break } } dag[i] = &l } idxs = append(idxs, len(src)) return }
func (replacementEncoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) { r, size := rune(0), 0 for ; nSrc < len(src); nSrc += size { r = rune(src[nSrc]) // Decode a 1-byte rune. if r < utf8.RuneSelf { size = 1 } else { // Decode a multi-byte rune. r, size = utf8.DecodeRune(src[nSrc:]) if size == 1 { // All valid runes of size 1 (those below utf8.RuneSelf) were // handled above. We have invalid UTF-8 or we haven't seen the // full character yet. if !atEOF && !utf8.FullRune(src[nSrc:]) { err = transform.ErrShortSrc break } r = '\ufffd' } } if nDst+utf8.RuneLen(r) > len(dst) { err = transform.ErrShortDst break } nDst += utf8.EncodeRune(dst[nDst:], r) } return nDst, nSrc, err }
// Map returns a copy of the byte array s with all its characters modified
// according to the mapping function. If mapping returns a negative value, the character is
// dropped from the string with no replacement. The characters in s and the
// output are interpreted as UTF-8-encoded Unicode code points.
//
// If mapping returns a non-negative value that is not a valid rune, U+FFFD
// is written in its place.
func Map(mapping func(r rune) rune, s []byte) []byte {
	// In the worst case, the array can grow when mapped, making
	// things unpleasant. But it's so rare we barge in assuming it's
	// fine. It could also shrink but that falls out naturally.
	maxbytes := len(s) // length of b
	nbytes := 0        // number of bytes encoded in b
	b := make([]byte, maxbytes)
	for i := 0; i < len(s); {
		wid := 1
		r := rune(s[i])
		if r >= utf8.RuneSelf {
			r, wid = utf8.DecodeRune(s[i:])
		}
		r = mapping(r)
		if r >= 0 {
			rl := utf8.RuneLen(r)
			if rl < 0 {
				// EncodeRune writes U+FFFD for an invalid rune, so reserve
				// room for that instead of trusting the -1.
				rl = utf8.RuneLen(utf8.RuneError)
			}
			if nbytes+rl > maxbytes {
				// Grow the buffer.
				maxbytes = maxbytes*2 + utf8.UTFMax
				nb := make([]byte, maxbytes)
				copy(nb, b[0:nbytes])
				b = nb
			}
			nbytes += utf8.EncodeRune(b[nbytes:maxbytes], r)
		}
		i += wid
	}
	return b[0:nbytes]
}
// urlEncodedName encode the strings from UTF-8 byte representations to HTML hex escape sequences // // This is necessary since regular url.Parse() and url.Encode() functions do not support UTF-8 // non english characters cannot be parsed due to the nature in which url.Encode() is written // // This function on the other hand is a direct replacement for url.Encode() technique to support // pretty much every UTF-8 character. func urlEncodeName(name string) (string, *probe.Error) { // if object matches reserved string, no need to encode them reservedNames := regexp.MustCompile("^[a-zA-Z0-9-_.~/]+$") if reservedNames.MatchString(name) { return name, nil } var encodedName string for _, s := range name { if 'A' <= s && s <= 'Z' || 'a' <= s && s <= 'z' || '0' <= s && s <= '9' { // §2.3 Unreserved characters (mark) encodedName = encodedName + string(s) continue } switch s { case '-', '_', '.', '~', '/': // §2.3 Unreserved characters (mark) encodedName = encodedName + string(s) continue default: len := utf8.RuneLen(s) if len < 0 { return "", probe.NewError(InvalidArgument{}) } u := make([]byte, len) utf8.EncodeRune(u, s) for _, r := range u { hex := hex.EncodeToString([]byte{r}) encodedName = encodedName + "%" + strings.ToUpper(hex) } } } return encodedName, nil }
// getURLEncodedName encode the strings from UTF-8 byte representations to HTML hex escape sequences // // This is necessary since regular url.Parse() and url.Encode() functions do not support UTF-8 // non english characters cannot be parsed due to the nature in which url.Encode() is written // // This function on the other hand is a direct replacement for url.Encode() technique to support // pretty much every UTF-8 character. func getURLEncodedName(name string) string { // if object matches reserved string, no need to encode them if reservedNames.MatchString(name) { return name } var encodedName string for _, s := range name { if 'A' <= s && s <= 'Z' || 'a' <= s && s <= 'z' || '0' <= s && s <= '9' { // §2.3 Unreserved characters (mark) encodedName = encodedName + string(s) continue } switch s { case '-', '_', '.', '~', '/': // §2.3 Unreserved characters (mark) encodedName = encodedName + string(s) continue default: len := utf8.RuneLen(s) if len < 0 { return name } u := make([]byte, len) utf8.EncodeRune(u, s) for _, r := range u { hex := hex.EncodeToString([]byte{r}) encodedName = encodedName + "%" + strings.ToUpper(hex) } } } return encodedName }
func (g *Group) lookupByName(name string, ini bool) (*Option, string) { name = strings.ToLower(name) if ini { if ret := g.IniNames[name]; ret != nil { return ret, ret.Field.Tag.Get("ini-name") } if ret := g.Names[name]; ret != nil { return ret, ret.Field.Name } } if ret := g.LongNames[name]; ret != nil { return ret, ret.LongName } if utf8.RuneCountInString(name) == 1 { r, _ := utf8.DecodeRuneInString(name) if ret := g.ShortNames[r]; ret != nil { data := make([]byte, utf8.RuneLen(ret.ShortName)) utf8.EncodeRune(data, ret.ShortName) return ret, string(data) } } return nil, "" }
// replace replaces each rune r of s with replacementTable[r], provided that
// r < len(replacementTable). If replacementTable[r] is the empty string then
// no replacement is made.
// It also replaces runes U+2028 and U+2029 with the raw strings `\u2028` and
// `\u2029`.
//
// If nothing is replaced, s itself is returned.
func replace(s string, replacementTable []string) string {
	var b bytes.Buffer
	r, w, written := rune(0), 0, 0
	for i := 0; i < len(s); i += w {
		// Cannot use 'for range s' because the width of the rune in the input
		// is needed. On a decoding error the input width is 1 while
		// utf8.RuneLen(utf8.RuneError) is 3, which would skip or overrun s.
		r, w = utf8.DecodeRuneInString(s[i:])
		var repl string
		switch {
		case int(r) < len(replacementTable) && replacementTable[r] != "":
			repl = replacementTable[r]
		case r == '\u2028':
			repl = `\u2028`
		case r == '\u2029':
			repl = `\u2029`
		default:
			continue
		}
		b.WriteString(s[written:i])
		b.WriteString(repl)
		written = i + w
	}
	if written == 0 {
		return s
	}
	b.WriteString(s[written:])
	return b.String()
}
func (m *Machine) MultiPatternSearch(content string, returnImmediately bool) []Term { terms := make([]Term, 0, 16) state := ROOT_STATE for pos, c := range content { start: newState := m.g(state, c) if newState == FAIL_STATE { state = m.f(state) goto start } else { state = newState if state >= len(m.output) { continue } for _, word := range m.output[state] { term := Term{ Pos: pos + utf8.RuneLen(c) - len(word), Word: word, } terms = append(terms, term) if returnImmediately { return terms } } } } return terms }