func TestCaseProperties(t *testing.T) { assigned := rangetable.Assigned(UnicodeVersion) coreVersion := rangetable.Assigned(unicode.Version) for r := rune(0); r <= lastRuneForTesting; r++ { if !unicode.In(r, assigned) || !unicode.In(r, coreVersion) { continue } c := contextFromRune(r) if got, want := c.info.isCaseIgnorable(), propIgnore(r); got != want { t.Errorf("caseIgnorable(%U): got %v; want %v (%x)", r, got, want, c.info) } // New letters may change case types, but existing case pairings should // not change. See Case Pair Stability in // http://unicode.org/policies/stability_policy.html. if rf := unicode.SimpleFold(r); rf != r && unicode.In(rf, assigned) { if got, want := c.info.isCased(), propCased(r); got != want { t.Errorf("cased(%U): got %v; want %v (%x)", r, got, want, c.info) } if got, want := c.caseType() == cUpper, propUpper(r); got != want { t.Errorf("upper(%U): got %v; want %v (%x)", r, got, want, c.info) } if got, want := c.caseType() == cLower, propLower(r); got != want { t.Errorf("lower(%U): got %v; want %v (%x)", r, got, want, c.info) } } if got, want := c.info.isBreak(), hasBreakProp(r); got != want { t.Errorf("isBreak(%U): got %v; want %v (%x)", r, got, want, c.info) } } // TODO: get title case from unicode file. }
func (n NeologdNormalizer) EliminateSpace(s string) string { var ( b bytes.Buffer prev rune ) for p := 0; p < len(s); { c, w := utf8.DecodeRuneInString(s[p:]) p += w if !unicode.IsSpace(c) { b.WriteRune(c) prev = c continue } for p < len(s) { c0, w0 := utf8.DecodeRuneInString(s[p:]) p += w0 if !unicode.IsSpace(c0) { if unicode.In(prev, unicode.Latin, latinSymbols) && unicode.In(c0, unicode.Latin, latinSymbols) { b.WriteRune(' ') } b.WriteRune(c0) prev = c0 break } } } return b.String() }
func TestCaseProperties(t *testing.T) { if unicode.Version != UnicodeVersion { t.Skipf("UnicodeVersion=%s, but unicode.Version=%s", UnicodeVersion, unicode.Version) } assigned := rangetable.Assigned(UnicodeVersion) for r := rune(0); r <= lastRuneForTesting; r++ { if !unicode.In(r, assigned) || !unicode.In(unicode.SimpleFold(r), assigned) { continue } c := contextFromRune(r) if got, want := c.info.isCaseIgnorable(), propIgnore(r); got != want { t.Errorf("caseIgnorable(%U): got %v; want %v (%x)", r, got, want, c.info) } if got, want := c.info.isCased(), propCased(r); got != want { t.Errorf("cased(%U): got %v; want %v (%x)", r, got, want, c.info) } if got, want := c.caseType() == cUpper, propUpper(r); got != want { t.Errorf("upper(%U): got %v; want %v (%x)", r, got, want, c.info) } if got, want := c.caseType() == cLower, propLower(r); got != want { t.Errorf("lower(%U): got %v; want %v (%x)", r, got, want, c.info) } if got, want := c.info.isBreak(), hasBreakProp(r); got != want { t.Errorf("isBreak(%U): got %v; want %v (%x)", r, got, want, c.info) } } // TODO: get title case from unicode file. }
func (nc *numberConverter) isDigit() bool { if nc.b != nil { r, _ := utf8.DecodeRune(nc.b) return unicode.In(r, unicode.Nd) } r, _ := utf8.DecodeRuneInString(nc.s) return unicode.In(r, unicode.Nd) }
// emphasisFringeRank classifies r into one of three ranks:
//
//	0 - utf8.RuneError, separators (Zs, Zl, Zp) and control/format runes (Cc, Cf)
//	1 - punctuation (Pc, Pd, Ps, Pe, Pi, Pf, Po) and symbols (Sc, Sk, Sm, So)
//	2 - everything else
func emphasisFringeRank(r rune) int {
	// NOTE(akavel): not sure if treating utf8.RuneError as rank 0 is really correct.
	if r == utf8.RuneError || unicode.In(r, unicode.Zs, unicode.Zl, unicode.Zp, unicode.Cc, unicode.Cf) {
		return 0
	}
	if unicode.In(r,
		unicode.Pc, unicode.Pd, unicode.Ps, unicode.Pe, unicode.Pi, unicode.Pf, unicode.Po,
		unicode.Sc, unicode.Sk, unicode.Sm, unicode.So,
	) {
		return 1
	}
	return 2
}
func TestNonDigits(t *testing.T) { c := collate.New(language.English, collate.Loose, collate.Numeric) // Verify that all non-digit numbers sort outside of the number range. for r, hi := rune(unicode.N.R16[0].Lo), rune(unicode.N.R32[0].Hi); r <= hi; r++ { if unicode.In(r, unicode.Nd) || !unicode.In(r, assigned) { continue } if a := string(r); c.CompareString(a, "0") != -1 && c.CompareString(a, "999999") != 1 { t.Errorf("%+q non-digit number is collated as digit", a) } } }
func TestMapping(t *testing.T) { assigned := rangetable.Assigned(UnicodeVersion) coreVersion := rangetable.Assigned(unicode.Version) if coreVersion == nil { coreVersion = assigned } apply := func(r rune, f func(c *context) bool) string { c := contextFromRune(r) f(c) return string(c.dst[:c.pDst]) } for r, tt := range special { if got, want := apply(r, lower), tt.toLower; got != want { t.Errorf("lowerSpecial:(%U): got %+q; want %+q", r, got, want) } if got, want := apply(r, title), tt.toTitle; got != want { t.Errorf("titleSpecial:(%U): got %+q; want %+q", r, got, want) } if got, want := apply(r, upper), tt.toUpper; got != want { t.Errorf("upperSpecial:(%U): got %+q; want %+q", r, got, want) } } for r := rune(0); r <= lastRuneForTesting; r++ { if !unicode.In(r, assigned) || !unicode.In(r, coreVersion) { continue } if rf := unicode.SimpleFold(r); rf == r || !unicode.In(rf, assigned) { continue } if _, ok := special[r]; ok { continue } want := string(unicode.ToLower(r)) if got := apply(r, lower); got != want { t.Errorf("lower:%q (%U): got %q %U; want %q %U", r, r, got, []rune(got), want, []rune(want)) } want = string(unicode.ToUpper(r)) if got := apply(r, upper); got != want { t.Errorf("upper:%q (%U): got %q %U; want %q %U", r, r, got, []rune(got), want, []rune(want)) } want = string(unicode.ToTitle(r)) if got := apply(r, title); got != want { t.Errorf("title:%q (%U): got %q %U; want %q %U", r, r, got, []rune(got), want, []rune(want)) } } }
func lexFixed(l *lexer) lexFn { parseFixed := func(i int) terminal { s := l.line[l.start:i] var t token for k := range l.key { if s == k { t = token{terminal: l.key[k]} } } if t.terminal == 0 { l.lexErr("Invalid symbol") } l.emit(t) return t.terminal } if unicode.In(l.cur, unicode.Nd, unicode.L, unicode.Z) || l.cur == '_' { t := parseFixed(l.pos) if !unicode.IsSpace(l.cur) { switch t { default: l.lexErr("No whitespace after symbol '" + l.line[l.start:l.pos] + "'") case tAlias: case tAutoVar: case tAssign: case tComma: case tLeftParen: } } return lexNext(l) } else if l.isLast() { parseFixed(l.pos + l.width) return nil } return lexFixed }
// isLetterDigits reports whether r belongs to the LetterDigits class of
// RFC 5892, section 2.1 (https://tools.ietf.org/html/rfc5892#section-2.1),
// i.e. whether its general category is one of Ll, Lu, Lo, Nd, Lm, Mn or Mc.
func isLetterDigits(r rune) bool {
	switch {
	case unicode.Is(unicode.Nd, r): // Digits
		return true
	case unicode.In(r, unicode.Mn, unicode.Mc): // Modifiers
		return true
	}
	// Letters
	return unicode.In(r, unicode.Ll, unicode.Lu, unicode.Lm, unicode.Lo)
}
func parseFields(input string) []string { fields := make([]string, 0, 10) skipSpaces := true fieldStart := -1 for i, r := range input { if unicode.In(r, quoteCharacters) { if fieldStart == -1 { // start field fieldStart = i + 1 skipSpaces = false } else { // end field fields = append(fields, input[fieldStart:i]) fieldStart = -1 skipSpaces = true } } else if unicode.IsSpace(r) && skipSpaces { // end field if not in a quoted field if fieldStart != -1 { fields = append(fields, input[fieldStart:i]) fieldStart = -1 } } else if fieldStart == -1 { // start field fieldStart = i } } if fieldStart != -1 { // end last field if it hasn't yet fields = append(fields, input[fieldStart:]) } return fields }
// getQuoteSplitter returns a stateful predicate that reports whether a rune
// is a field separator. Runes between a rune with the Quotation_Mark
// property and the next occurrence of that same rune are never separators,
// and neither are the two quote runes. Outside quotes the separator is sep,
// or any whitespace rune when sep is 0. The predicate keeps the currently
// open quote between calls, so it must be fed the runes in order.
func getQuoteSplitter(sep rune) func(rune) bool {
	const none = rune(0)
	openQuote := none
	return func(c rune) bool {
		if c == openQuote {
			// quote end
			openQuote = none
			return false
		}
		if openQuote != none {
			// quoted text
			return false
		}
		if unicode.In(c, unicode.Quotation_Mark) {
			// quote start
			openQuote = c
			return false
		}
		// unquoted text, need split
		if sep == none {
			// use space as default separator
			return unicode.IsSpace(c)
		}
		return c == sep
	}
}
func generateGodebugIdentifiers(f *ast.File) { // Variables that won't have suffixes. idents.ctx = createConflictFreeName("ctx", f, false) idents.ok = createConflictFreeName("ok", f, false) idents.scope = createConflictFreeName("scope", f, false) idents.receiver = createConflictFreeName("receiver", f, false) idents.recoverChan = createConflictFreeName("rr", f, false) idents.recoverChanChan = createConflictFreeName("r", f, false) idents.recovers = createConflictFreeName("recovers", f, false) idents.panicVal = createConflictFreeName("v", f, false) idents.panicChan = createConflictFreeName("panicChan", f, false) idents.godebug = generateGodebugPkgName(f) // Variables that will have suffixes. idents.result = createConflictFreeName("result", f, true) idents.input = createConflictFreeName("input", f, true) // Variables with names derived from the filename. base := strings.Map(func(r rune) rune { if !unicode.In(r, unicode.Digit, unicode.Letter) { return '_' } return r }, filepath.Base(fs.Position(f.Pos()).Filename)) if !unicode.IsLetter(rune(base[0])) { // identifiers must start with letters base = "a" + base } idents.fileScope = createConflictFreeName(base+"_scope", f, false) idents.fileContents = createConflictFreeName(base+"_contents", f, false) }
// IsWordChar reports whether r is a word character: a rune in general
// category L, Mn, Nd or Pc, or one of U+200C ZERO WIDTH NON-JOINER and
// U+200D ZERO WIDTH JOINER.
//
// UTS #18 Unicode Regular Expressions (http://www.unicode.org/reports/tr18/),
// RL1.4 Simple Word Boundaries, describes the class of <word_character> as
// including all Alphabetic values from the Unicode character database plus
// those two joiners.
func IsWordChar(r rune) bool {
	switch r {
	case '\u200C', '\u200D':
		return true
	}
	return unicode.In(r, unicode.L, unicode.Mn, unicode.Nd, unicode.Pc)
}
// Ensure that ceratain properties were generated correctly. func TestTable(t *testing.T) { tests := []tableTest{ tableTest{ rangetable.Merge( unicode.Lt, unicode.Nl, unicode.No, // Other letter digits unicode.Me, // Modifiers unicode.Zs, // Spaces unicode.So, // Symbols unicode.Pi, unicode.Pf, // Punctuation ), idDisOrFreePVal, }, tableTest{ rangetable.New(0x30000, 0x30101, 0xDFFFF), unassigned, }, } assigned := rangetable.Assigned(UnicodeVersion) for _, test := range tests { rangetable.Visit(test.rangeTable, func(r rune) { if !unicode.In(r, assigned) { return } b := make([]byte, 4) n := utf8.EncodeRune(b, r) trieval, _ := dpTrie.lookup(b[:n]) p := entry(trieval).property() if p != test.prop && !exceptions.Contains(r) { t.Errorf("%U: got %+x; want %+x", r, test.prop, p) } }) } }
// BenchmarkNotMerged measures unicode.In over every element of runes when
// the tables of unicode.GraphicRanges are passed individually.
func BenchmarkNotMerged(t *testing.B) {
	for remaining := t.N; remaining > 0; remaining-- {
		for _, r := range runes {
			// Only the lookup cost matters; the result is discarded.
			_ = unicode.In(r, unicode.GraphicRanges...)
		}
	}
}
// BenchmarkNotMergedCased measures unicode.In over every element of runes
// when the tables of cased are passed individually.
func BenchmarkNotMergedCased(t *testing.B) {
	for remaining := t.N; remaining > 0; remaining-- {
		for _, r := range runes {
			// Only the lookup cost matters; the result is discarded.
			_ = unicode.In(r, cased...)
		}
	}
}
// IsECMAWordChar reports whether r is in one of the general categories
// L (letter), Mn (non-spacing mark), Nd (decimal digit) or Pc (connector
// punctuation).
func IsECMAWordChar(r rune) bool {
	for _, table := range [...]*unicode.RangeTable{unicode.L, unicode.Mn, unicode.Nd, unicode.Pc} {
		if unicode.Is(table, r) {
			return true
		}
	}
	return false
}
func writeTables() { propTrie := triegen.NewTrie("derivedProperties") w := gen.NewCodeWriter() defer w.WriteGoFile(*outputFile, "precis") gen.WriteUnicodeVersion(w) // Iterate over all the runes... for i := rune(0); i < unicode.MaxRune; i++ { r := rune(i) if !utf8.ValidRune(r) { continue } e, ok := exceptions[i] p := e.prop switch { case ok: case !unicode.In(r, assigned): p = unassigned case r >= 0x0021 && r <= 0x007e: // Is ASCII 7 p = pValid case unicode.In(r, disallowedRunes, unicode.Cc): p = disallowed case hasCompat(r): p = idDisOrFreePVal case isLetterDigits(r): p = pValid case isIdDisAndFreePVal(r): p = idDisOrFreePVal default: p = disallowed } cat := runeCategory[r] // Don't set category for runes that are disallowed. if p == disallowed { cat = exceptions[r].cat } propTrie.Insert(r, uint64(p)|uint64(cat)) } sz, err := propTrie.Gen(w) if err != nil { log.Fatal(err) } w.Size += sz }
// kanjiOnly reports whether s is non-empty and consists solely of runes with
// the Unicode Ideographic property.
func kanjiOnly(s string) bool {
	if s == "" {
		return false
	}
	for _, r := range s {
		if !unicode.Is(unicode.Ideographic, r) {
			return false
		}
	}
	return true
}
// isLatin reports whether every rune of text belongs to the Latin script.
// It returns true for the empty string.
func isLatin(text string) bool {
	for _, r := range text {
		if unicode.Is(unicode.Latin, r) {
			continue
		}
		return false
	}
	return true
}
// isAlnum reports whether every rune of s is a Unicode letter or a decimal
// digit. It returns true for the empty string.
func isAlnum(s string) bool {
	for _, r := range s {
		if unicode.IsLetter(r) || unicode.IsDigit(r) {
			continue
		}
		return false
	}
	return true
}
// ContainsKatakana reports whether text contains at least one rune of the
// Katakana script.
func ContainsKatakana(text string) bool {
	found := false
	for _, r := range text {
		if unicode.Is(unicode.Katakana, r) {
			found = true
			break
		}
	}
	return found
}
// isHexDigit reports whether r has the Unicode ASCII_Hex_Digit property,
// i.e. whether it is one of 0-9, a-f or A-F.
func isHexDigit(r rune) bool {
	return unicode.Is(unicode.ASCII_Hex_Digit, r)
}
// validFirstRune reports whether r may be the first rune of an identifier:
// '$', '_', or a rune in category Lu, Ll, Lm, Lo or Nl.
// E.g:
//
//	'r' -> true
//	'7' -> false
//
// NOTE(review): titlecase letters (Lt) are not accepted - confirm that this
// matches the identifier grammar being implemented.
func validFirstRune(r rune) bool {
	switch r {
	case '$', '_':
		return true
	}
	return unicode.In(r, unicode.Lu, unicode.Ll, unicode.Lm, unicode.Lo, unicode.Nl)
}
// isIdDisAndFreePVal reports whether r is in one of the general categories
// Lt, Nl, No, Me, Zs, any symbol category (Sm, Sc, Sk, So) or any
// punctuation category (Pc, Pd, Ps, Pe, Pi, Pf, Po).
func isIdDisAndFreePVal(r rune) bool {
	switch {
	case unicode.IsPunct(r): // Punctuation: Pc, Pd, Ps, Pe, Pi, Pf, Po
		return true
	case unicode.IsSymbol(r): // Symbols: Sm, Sc, Sk, So
		return true
	}
	return unicode.In(r,
		unicode.Lt, unicode.Nl, unicode.No, // Other letters / numbers
		unicode.Me, // Modifiers
		unicode.Zs, // Spaces
	)
}
func TestFoldData(t *testing.T) { assigned := rangetable.Assigned(UnicodeVersion) coreVersion := rangetable.Assigned(unicode.Version) apply := func(r rune, f func(c *context) bool) (string, info) { c := contextFromRune(r) f(c) return string(c.dst[:c.pDst]), c.info.cccType() } for r := rune(0); r <= lastRuneForTesting; r++ { if !unicode.In(r, assigned) || !unicode.In(r, coreVersion) { continue } x := runeFoldData(r) if got, info := apply(r, foldFull); got != x.full { t.Errorf("full:%q (%U): got %q %U; want %q %U (ccc=%x)", r, r, got, []rune(got), x.full, []rune(x.full), info) } // TODO: special and simple. } }
func writeTables() { propTrie := triegen.NewTrie("derivedProperties") w := gen.NewCodeWriter() defer w.WriteGoFile(*outputFile, "precis") gen.WriteUnicodeVersion(w) // Iterate over all the runes... for i := uint32(0); i < unicode.MaxRune; i++ { r := rune(i) if !utf8.ValidRune(r) { continue } p, ok := exceptions[i] switch { case ok: case !unicode.In(r, assigned): p = unassigned case r >= 33 && r <= 126: // Is ASCII 7 p = pValid case r == 0x200C || r == 0x200D: // Is join control p = contextJ case unicode.In(r, disallowedRunes, unicode.Cc): p = disallowed case isHasCompat(r): p = idDis | freePVal case isLetterDigits(r): p = pValid case isIdDisAndFreePVal(r): p = idDis | freePVal default: p = disallowed } propTrie.Insert(r, uint64(p)) } sz, err := propTrie.Gen(w) if err != nil { log.Fatal(err) } w.Size += sz }
func gettype(c rune) rune { for _, x := range kannumTable { if x == c { return 'M' } } switch { case unicode.In(c, kanjiTable): return 'H' case unicode.In(c, hiraganaTable): return 'I' case unicode.In(c, katakanaTable): return 'K' case unicode.In(c, alphabetTable): return 'A' case unicode.In(c, numberTable): return 'N' } return 'O' }
// calcStringWidth returns the display width of s, counting two columns for
// each rune of the Hangul, Katakana, Hiragana or Han scripts and one column
// for every other rune.
func calcStringWidth(s string) int {
	total := 0
	for _, r := range s {
		total++
		if unicode.In(r, unicode.Hangul, unicode.Katakana, unicode.Hiragana, unicode.Han) {
			// Wide rune: one extra column.
			total++
		}
	}
	return total
}
// ToHiragana converts all katakana text to hiragana.
// You should normalize text before converting.
//
// Only katakana that have a hiragana counterprt at a fixed code point
// offset are converted: U+30A1..U+30F6 and the iteration marks U+30FD and
// U+30FE. Every other rune, including halfwidth katakana and the Katakana
// Phonetic Extensions, is copied unchanged.
func ToHiragana(text string) string {
	const (
		firstKana      = 0x30A1 // KATAKANA LETTER SMALL A
		lastKana       = 0x30F6 // KATAKANA LETTER SMALL KE
		iterationMark  = 0x30FD // KATAKANA ITERATION MARK
		voicedIterMark = 0x30FE // KATAKANA VOICED ITERATION MARK
		kanaOffset     = 0x60   // distance from a katakana to its hiragana
	)
	var buf bytes.Buffer
	buf.Grow(len(text))
	for _, r := range text {
		if unicode.In(r, unicode.Katakana) {
			// Shifting a Katakana-script rune outside these ranges would
			// yield an unrelated or unassigned code point.
			if (firstKana <= r && r <= lastKana) || r == iterationMark || r == voicedIterMark {
				r -= kanaOffset
			}
		}
		buf.WriteRune(r)
	}
	return buf.String()
}