// TestCaseProperties verifies that the case-related flags stored for each rune
// (case-ignorable, cased, upper, lower, break) agree with the reference prop*
// helpers, for every rune from 0 through lastRuneForTesting.
//
// Only runes assigned in both this package's Unicode version and the core
// library's Unicode version are checked.
func TestCaseProperties(t *testing.T) {
	assigned := rangetable.Assigned(UnicodeVersion)
	coreVersion := rangetable.Assigned(unicode.Version)
	if coreVersion == nil {
		// rangetable.Assigned returns nil when it has no data for a version;
		// passing a nil table to unicode.In would panic. Fall back to the
		// package's own table, as TestMapping does.
		coreVersion = assigned
	}
	for r := rune(0); r <= lastRuneForTesting; r++ {
		if !unicode.In(r, assigned) || !unicode.In(r, coreVersion) {
			continue
		}
		c := contextFromRune(r)
		if got, want := c.info.isCaseIgnorable(), propIgnore(r); got != want {
			t.Errorf("caseIgnorable(%U): got %v; want %v (%x)", r, got, want, c.info)
		}
		// New letters may change case types, but existing case pairings should
		// not change. See Case Pair Stability in
		// http://unicode.org/policies/stability_policy.html.
		if rf := unicode.SimpleFold(r); rf != r && unicode.In(rf, assigned) {
			if got, want := c.info.isCased(), propCased(r); got != want {
				t.Errorf("cased(%U): got %v; want %v (%x)", r, got, want, c.info)
			}
			if got, want := c.caseType() == cUpper, propUpper(r); got != want {
				t.Errorf("upper(%U): got %v; want %v (%x)", r, got, want, c.info)
			}
			if got, want := c.caseType() == cLower, propLower(r); got != want {
				t.Errorf("lower(%U): got %v; want %v (%x)", r, got, want, c.info)
			}
		}
		if got, want := c.info.isBreak(), hasBreakProp(r); got != want {
			t.Errorf("isBreak(%U): got %v; want %v (%x)", r, got, want, c.info)
		}
	}
	// TODO: get title case from unicode file.
}
Example #2
0
File: neologd.go Project: ikawaha/x
// EliminateSpace removes every run of whitespace from s. A run is replaced by
// a single ASCII space only when the non-space runes on both sides of it are
// in unicode.Latin or latinSymbols; leading and trailing whitespace is dropped.
func (n NeologdNormalizer) EliminateSpace(s string) string {
	var (
		out      bytes.Buffer
		last     rune
		afterGap bool // whitespace was seen since the last rune written
	)
	for _, r := range s {
		if unicode.IsSpace(r) {
			afterGap = true
			continue
		}
		if afterGap {
			afterGap = false
			if unicode.In(last, unicode.Latin, latinSymbols) &&
				unicode.In(r, unicode.Latin, latinSymbols) {
				out.WriteRune(' ')
			}
		}
		out.WriteRune(r)
		last = r
	}
	return out.String()
}
Example #3
0
// TestCaseProperties compares the case-ignorable, cased, upper, lower and
// break flags derived for each rune with the reference prop* helpers.
//
// The test is skipped when the package's Unicode version differs from the
// core library's. Runes are checked only when both the rune and its simple
// fold are assigned in that version.
func TestCaseProperties(t *testing.T) {
	if UnicodeVersion != unicode.Version {
		t.Skipf("UnicodeVersion=%s, but unicode.Version=%s", UnicodeVersion, unicode.Version)
	}
	known := rangetable.Assigned(UnicodeVersion)
	for r := rune(0); r <= lastRuneForTesting; r++ {
		if !(unicode.In(r, known) && unicode.In(unicode.SimpleFold(r), known)) {
			continue
		}
		ctx := contextFromRune(r)

		if got, want := ctx.info.isCaseIgnorable(), propIgnore(r); got != want {
			t.Errorf("caseIgnorable(%U): got %v; want %v (%x)", r, got, want, ctx.info)
		}

		if got, want := ctx.info.isCased(), propCased(r); got != want {
			t.Errorf("cased(%U): got %v; want %v (%x)", r, got, want, ctx.info)
		}

		if got, want := ctx.caseType() == cUpper, propUpper(r); got != want {
			t.Errorf("upper(%U): got %v; want %v (%x)", r, got, want, ctx.info)
		}

		if got, want := ctx.caseType() == cLower, propLower(r); got != want {
			t.Errorf("lower(%U): got %v; want %v (%x)", r, got, want, ctx.info)
		}

		if got, want := ctx.info.isBreak(), hasBreakProp(r); got != want {
			t.Errorf("isBreak(%U): got %v; want %v (%x)", r, got, want, ctx.info)
		}
	}
	// TODO: get title case from unicode file.
}
Example #4
0
// isDigit reports whether the first rune of the converter's input is in the
// Unicode decimal-digit category (Nd). The rune is decoded from nc.b when
// that is non-nil, and from nc.s otherwise.
func (nc *numberConverter) isDigit() bool {
	var first rune
	if nc.b == nil {
		first, _ = utf8.DecodeRuneInString(nc.s)
	} else {
		first, _ = utf8.DecodeRune(nc.b)
	}
	return unicode.In(first, unicode.Nd)
}
Example #5
0
// emphasisFringeRank classifies r into one of three ranks:
//
//	0 - utf8.RuneError, separators (Zs, Zl, Zp), control (Cc) and format (Cf) runes
//	1 - punctuation (all P* categories) and symbols (all S* categories)
//	2 - everything else
func emphasisFringeRank(r rune) int {
	// NOTE(akavel): not sure if treating RuneError as rank 0 is really correct.
	if r == utf8.RuneError {
		return 0
	}
	// unicode.Z is exactly Zs, Zl and Zp combined.
	if unicode.In(r, unicode.Z, unicode.Cc, unicode.Cf) {
		return 0
	}
	// unicode.P and unicode.S are the unions of every P* and S* category.
	if unicode.In(r, unicode.P, unicode.S) {
		return 1
	}
	return 2
}
Example #6
0
// TestNonDigits verifies that, under a loose numeric English collator, every
// assigned number rune that is not a decimal digit (Nd) sorts outside the
// range spanned by "0" and "999999".
func TestNonDigits(t *testing.T) {
	c := collate.New(language.English, collate.Loose, collate.Numeric)

	// Scan from the start of the first 16-bit range of unicode.N to the end
	// of its first 32-bit range.
	first := rune(unicode.N.R16[0].Lo)
	last := rune(unicode.N.R32[0].Hi)
	for r := first; r <= last; r++ {
		if unicode.In(r, unicode.Nd) || !unicode.In(r, assigned) {
			continue
		}
		a := string(r)
		if c.CompareString(a, "0") == -1 || c.CompareString(a, "999999") == 1 {
			continue
		}
		t.Errorf("%+q non-digit number is collated as digit", a)
	}
}
Example #7
0
// TestMapping exercises the lower, title and upper mappers.
//
// Runes listed in the special table are compared against that table's
// expected strings. All other runes that are assigned in both Unicode
// versions, and whose simple fold is a different assigned rune, are compared
// against unicode.ToLower, unicode.ToUpper and unicode.ToTitle.
func TestMapping(t *testing.T) {
	assigned := rangetable.Assigned(UnicodeVersion)
	coreVersion := rangetable.Assigned(unicode.Version)
	if coreVersion == nil {
		coreVersion = assigned
	}
	// apply runs mapper f on a fresh context for r and returns what it wrote.
	apply := func(r rune, f func(c *context) bool) string {
		ctx := contextFromRune(r)
		f(ctx)
		return string(ctx.dst[:ctx.pDst])
	}

	for r, sp := range special {
		if got, want := apply(r, lower), sp.toLower; got != want {
			t.Errorf("lowerSpecial:(%U): got %+q; want %+q", r, got, want)
		}
		if got, want := apply(r, title), sp.toTitle; got != want {
			t.Errorf("titleSpecial:(%U): got %+q; want %+q", r, got, want)
		}
		if got, want := apply(r, upper), sp.toUpper; got != want {
			t.Errorf("upperSpecial:(%U): got %+q; want %+q", r, got, want)
		}
	}

	for r := rune(0); r <= lastRuneForTesting; r++ {
		if !(unicode.In(r, assigned) && unicode.In(r, coreVersion)) {
			continue
		}
		folded := unicode.SimpleFold(r)
		if folded == r || !unicode.In(folded, assigned) {
			continue
		}
		if _, isSpecial := special[r]; isSpecial {
			continue
		}

		if got, want := apply(r, lower), string(unicode.ToLower(r)); got != want {
			t.Errorf("lower:%q (%U): got %q %U; want %q %U", r, r, got, []rune(got), want, []rune(want))
		}
		if got, want := apply(r, upper), string(unicode.ToUpper(r)); got != want {
			t.Errorf("upper:%q (%U): got %q %U; want %q %U", r, r, got, []rune(got), want, []rune(want))
		}
		if got, want := apply(r, title), string(unicode.ToTitle(r)); got != want {
			t.Errorf("title:%q (%U): got %q %U; want %q %U", r, r, got, []rune(got), want, []rune(want))
		}
	}
}
Example #8
0
File: lex.go Project: vvanpo/system
// lexFixed is a lexer state function for fixed symbols, i.e. text that must
// match one of the keys of l.key.
//
// It returns lexNext(l) once a symbol has been emitted, nil when the symbol
// ends at the last position (per l.isLast), and lexFixed itself when the
// current rune neither terminates the symbol nor is the last one.
func lexFixed(l *lexer) lexFn {
	// parseFixed looks up l.line[l.start:i] in l.key, emits the resulting
	// token and returns its terminal. If no key matches, the terminal stays
	// at its zero value and "Invalid symbol" is reported through l.lexErr.
	// NOTE(review): whether l.lexErr returns or aborts is not visible here;
	// if it returns, the zero-valued token is still emitted.
	parseFixed := func(i int) terminal {
		s := l.line[l.start:i]
		var t token
		for k := range l.key {
			if s == k {
				t = token{terminal: l.key[k]}
			}
		}
		if t.terminal == 0 {
			l.lexErr("Invalid symbol")
		}
		l.emit(t)
		return t.terminal
	}
	// A decimal digit, letter, separator (category Z) or underscore ends the
	// symbol, which then spans up to but not including l.pos.
	if unicode.In(l.cur, unicode.Nd, unicode.L, unicode.Z) || l.cur == '_' {
		t := parseFixed(l.pos)
		if !unicode.IsSpace(l.cur) {
			// Only the terminals listed below may be followed directly by a
			// non-whitespace rune; any other terminal is reported as an error.
			switch t {
			default:
				l.lexErr("No whitespace after symbol '" + l.line[l.start:l.pos] + "'")
			case tAlias:
			case tAutoVar:
			case tAssign:
			case tComma:
			case tLeftParen:
			}
		}
		return lexNext(l)
	} else if l.isLast() {
		// At the last position the current rune belongs to the symbol, so
		// the slice end is extended by l.width.
		parseFixed(l.pos + l.width)
		return nil
	}
	return lexFixed
}
Example #9
0
// isLetterDigits reports whether r belongs to the LetterDigits class of
// RFC 5892, section 2.1 (https://tools.ietf.org/html/rfc5892#section-2.1):
// general category Ll, Lu, Lo, Nd, Lm, Mn or Mc.
func isLetterDigits(r rune) bool {
	switch {
	case unicode.In(r, unicode.Ll, unicode.Lu, unicode.Lm, unicode.Lo): // letters
		return true
	case unicode.In(r, unicode.Mn, unicode.Mc): // marks
		return true
	default: // decimal digits
		return unicode.Is(unicode.Nd, r)
	}
}
Example #10
0
// parseFields splits input into fields. Outside quotes, fields are separated
// by whitespace; a rune in quoteCharacters opens a quoted field in which
// whitespace is kept, and the next quote rune closes it. The quote runes
// themselves are not part of the returned fields.
//
// Opening and closing quotes are not matched against each other: any rune in
// quoteCharacters closes a field opened by any other. A quote rune met while
// an unquoted field is open ends that field rather than opening a quoted one.
func parseFields(input string) []string {
	fields := make([]string, 0, 10)
	// skipSpaces is true while whitespace acts as a separator, i.e. while
	// not inside a quoted field.
	skipSpaces := true
	// fieldStart is the byte offset where the current field begins, or -1
	// when no field is open.
	fieldStart := -1

	for i, r := range input {
		if unicode.In(r, quoteCharacters) {
			if fieldStart == -1 {
				// start field
				// NOTE(review): i + 1 assumes the quote rune is one byte
				// wide; confirm quoteCharacters holds only ASCII quotes.
				fieldStart = i + 1
				skipSpaces = false
			} else {
				// end field
				fields = append(fields, input[fieldStart:i])
				fieldStart = -1
				skipSpaces = true
			}
		} else if unicode.IsSpace(r) && skipSpaces {
			// end field if not in a quoted field
			if fieldStart != -1 {
				fields = append(fields, input[fieldStart:i])
				fieldStart = -1
			}
		} else if fieldStart == -1 {
			// start field
			fieldStart = i
		}
	}
	if fieldStart != -1 {
		// end last field if it hasn't yet; this also covers an unterminated
		// quoted field, which runs to the end of input.
		fields = append(fields, input[fieldStart:])
	}
	return fields
}
Example #11
0
File: lib.go Project: Fiery/go-run
// getQuoteSplitter returns a stateful predicate that reports whether a rune
// is a field separator, treating quoted text as opaque.
//
// A rune with the Quotation_Mark property opens a quoted section, which lasts
// until the same rune appears again; no rune inside it, nor the quote runes
// themselves, is reported as a separator. Outside quotes the separator is
// sep, or any whitespace when sep is 0.
//
// The predicate remembers the open quote between calls, so it must be fed
// the runes of the input in order.
func getQuoteSplitter(sep rune) func(rune) bool {
	const none = rune(0)
	open := none // the quote rune currently open, or none
	return func(c rune) bool {
		if c == open {
			// Closing quote.
			open = none
			return false
		}
		if open != none {
			// Inside quoted text.
			return false
		}
		if unicode.In(c, unicode.Quotation_Mark) {
			// Opening quote.
			open = c
			return false
		}
		// Unquoted text: split on the separator, whitespace by default.
		if sep == none {
			return unicode.IsSpace(c)
		}
		return c == sep
	}
}
Example #12
0
// generateGodebugIdentifiers fills in the package-level idents with names,
// obtained from createConflictFreeName and generateGodebugPkgName, for use
// in file f.
//
// The file-scope identifiers are derived from the base name of f's file:
// every rune that is neither a letter nor a digit is replaced by '_', and an
// "a" is prepended when the result does not start with a letter.
func generateGodebugIdentifiers(f *ast.File) {
	// Variables that won't have suffixes.
	idents.ctx = createConflictFreeName("ctx", f, false)
	idents.ok = createConflictFreeName("ok", f, false)
	idents.scope = createConflictFreeName("scope", f, false)
	idents.receiver = createConflictFreeName("receiver", f, false)
	idents.recoverChan = createConflictFreeName("rr", f, false)
	idents.recoverChanChan = createConflictFreeName("r", f, false)
	idents.recovers = createConflictFreeName("recovers", f, false)
	idents.panicVal = createConflictFreeName("v", f, false)
	idents.panicChan = createConflictFreeName("panicChan", f, false)

	idents.godebug = generateGodebugPkgName(f)

	// Variables that will have suffixes.
	idents.result = createConflictFreeName("result", f, true)
	idents.input = createConflictFreeName("input", f, true)

	// Variables with names derived from the filename.
	base := strings.Map(func(r rune) rune {
		if !unicode.In(r, unicode.Digit, unicode.Letter) {
			return '_'
		}
		return r
	}, filepath.Base(fs.Position(f.Pos()).Filename))
	// Check the first rune rather than the first byte: the lead byte of a
	// multi-byte digit (for example U+0660, encoded D9 A0) reads as a Latin-1
	// letter when converted to a rune on its own, which would leave the
	// identifier starting with a digit. An empty base gets the prefix too.
	if first := []rune(base); len(first) == 0 || !unicode.IsLetter(first[0]) {
		// identifiers must start with letters
		base = "a" + base
	}
	idents.fileScope = createConflictFreeName(base+"_scope", f, false)
	idents.fileContents = createConflictFreeName(base+"_contents", f, false)
}
Example #13
0
// IsWordChar reports whether r is a word character: a rune in general
// category L, Mn, Nd or Pc, or one of U+200C ZERO WIDTH NON-JOINER and
// U+200D ZERO WIDTH JOINER.
//
// The two joiners come from UTS#18 Unicode Regular Expressions
// (http://www.unicode.org/reports/tr18/), RL 1.4 Simple Word Boundaries,
// which defines <word_character> as all Alphabetic values from the Unicode
// character database plus those two runes.
func IsWordChar(r rune) bool {
	if r == '\u200C' || r == '\u200D' {
		return true
	}
	// unicode.L, Mn, Nd and Pc are the tables stored in unicode.Categories
	// under "L", "Mn", "Nd" and "Pc".
	return unicode.In(r, unicode.L, unicode.Mn, unicode.Nd, unicode.Pc)
}
Example #14
0
// TestTable ensures that certain properties were generated correctly: every
// assigned rune in each test's range table must map, via dpTrie, to the
// property expected for that table, unless the rune is listed in exceptions.
func TestTable(t *testing.T) {
	tests := []tableTest{
		{
			rangetable.Merge(
				unicode.Lt, unicode.Nl, unicode.No, // Other letter digits
				unicode.Me,             // Modifiers
				unicode.Zs,             // Spaces
				unicode.So,             // Symbols
				unicode.Pi, unicode.Pf, // Punctuation
			),
			idDisOrFreePVal,
		},
		{
			rangetable.New(0x30000, 0x30101, 0xDFFFF),
			unassigned,
		},
	}

	assigned := rangetable.Assigned(UnicodeVersion)

	for _, test := range tests {
		rangetable.Visit(test.rangeTable, func(r rune) {
			if !unicode.In(r, assigned) {
				return
			}
			b := make([]byte, 4)
			n := utf8.EncodeRune(b, r)
			trieval, _ := dpTrie.lookup(b[:n])
			p := entry(trieval).property()
			if p != test.prop && !exceptions.Contains(r) {
				// p is the value read from the generated trie (got);
				// test.prop is the expectation (want).
				t.Errorf("%U: got %+x; want %+x", r, p, test.prop)
			}
		})
	}
}
// BenchmarkNotMerged measures unicode.In over the separate (unmerged)
// unicode.GraphicRanges tables for every rune in runes.
func BenchmarkNotMerged(t *testing.B) {
	for n := 0; n < t.N; n++ {
		for _, r := range runes {
			_ = unicode.In(r, unicode.GraphicRanges...)
		}
	}
}
// BenchmarkNotMergedCased measures unicode.In over the separate (unmerged)
// tables in cased for every rune in runes.
func BenchmarkNotMergedCased(t *testing.B) {
	for n := 0; n < t.N; n++ {
		for _, r := range runes {
			_ = unicode.In(r, cased...)
		}
	}
}
Example #17
0
// IsECMAWordChar reports whether r is in general category L (letters),
// Mn (non-spacing marks), Nd (decimal digits) or Pc (connector punctuation).
func IsECMAWordChar(r rune) bool {
	// unicode.L, Mn, Nd and Pc are the tables stored in unicode.Categories
	// under "L", "Mn", "Nd" and "Pc".
	for _, table := range []*unicode.RangeTable{unicode.L, unicode.Mn, unicode.Nd, unicode.Pc} {
		if unicode.Is(table, r) {
			return true
		}
	}
	return false
}
Example #18
0
// writeTables builds the "derivedProperties" trie, which maps each rune to
// its derived property combined with a category value, and generates it
// through a gen.CodeWriter whose WriteGoFile(*outputFile, "precis") call is
// deferred.
//
// The property of a rune is decided by the first matching case of the switch
// below, so the order of the cases is significant.
func writeTables() {
	propTrie := triegen.NewTrie("derivedProperties")
	w := gen.NewCodeWriter()
	// NOTE: log.Fatal below exits the process without running deferred
	// calls, so nothing is written when trie generation fails.
	defer w.WriteGoFile(*outputFile, "precis")
	gen.WriteUnicodeVersion(w)

	// Iterate over all the runes...
	// NOTE(review): the bound is exclusive, so unicode.MaxRune (U+10FFFF)
	// itself is never inserted — confirm this is intended.
	for i := rune(0); i < unicode.MaxRune; i++ {
		r := rune(i)

		// Skip values utf8.ValidRune rejects (e.g. surrogate halves).
		if !utf8.ValidRune(r) {
			continue
		}

		e, ok := exceptions[i]
		p := e.prop
		switch {
		case ok:
			// An explicit exception wins over every derived rule.
		case !unicode.In(r, assigned):
			p = unassigned
		case r >= 0x0021 && r <= 0x007e: // Is ASCII 7
			p = pValid
		case unicode.In(r, disallowedRunes, unicode.Cc):
			p = disallowed
		case hasCompat(r):
			p = idDisOrFreePVal
		case isLetterDigits(r):
			p = pValid
		case isIdDisAndFreePVal(r):
			p = idDisOrFreePVal
		default:
			p = disallowed
		}
		cat := runeCategory[r]
		// Don't set category for runes that are disallowed.
		// The category from the exceptions table is used instead.
		if p == disallowed {
			cat = exceptions[r].cat
		}
		propTrie.Insert(r, uint64(p)|uint64(cat))
	}
	sz, err := propTrie.Gen(w)
	if err != nil {
		log.Fatal(err)
	}
	w.Size += sz
}
Example #19
0
// kanjiOnly reports whether s is non-empty and consists solely of runes with
// the Unicode Ideographic property.
func kanjiOnly(s string) bool {
	if s == "" {
		return false
	}
	for _, r := range s {
		if unicode.Is(unicode.Ideographic, r) {
			continue
		}
		return false
	}
	return true
}
Example #20
0
// isLatin reports whether every rune of text belongs to the Latin script.
// It returns true for the empty string.
func isLatin(text string) bool {
	for _, r := range text {
		if unicode.Is(unicode.Latin, r) {
			continue
		}
		return false
	}
	return true
}
Example #21
0
// isAlnum reports whether every rune of s is a Unicode letter (category L)
// or decimal digit (category Nd). It returns true for the empty string.
func isAlnum(s string) bool {
	for _, r := range s {
		if unicode.IsLetter(r) || unicode.IsDigit(r) {
			continue
		}
		return false
	}
	return true
}
Example #22
0
// ContainsKatakana reports whether text contains at least one rune of the
// Katakana script.
func ContainsKatakana(text string) bool {
	found := false
	for _, r := range text {
		if unicode.Is(unicode.Katakana, r) {
			found = true
			break
		}
	}
	return found
}
Example #23
0
// isHexDigit reports whether r has the Unicode ASCII_Hex_Digit property,
// i.e. is one of 0-9, a-f or A-F.
func isHexDigit(r rune) bool {
	// unicode.ASCII_Hex_Digit is the table stored under
	// unicode.Properties["ASCII_Hex_Digit"].
	return unicode.Is(unicode.ASCII_Hex_Digit, r)
}
Example #24
0
// validFirstRune reports whether r may start an identifier: '$', '_', or a
// rune in category Lu, Ll, Lm, Lo or Nl.
// E.g:
//     'r' -> true
//     '7' -> false
func validFirstRune(r rune) bool {
	if r == '$' || r == '_' {
		return true
	}
	return unicode.In(r, unicode.Lu, unicode.Ll, unicode.Lm, unicode.Lo, unicode.Nl)
}
Example #25
0
// isIdDisAndFreePVal reports whether r falls in one of the general categories
// tested here: titlecase letters (Lt), letter and other numbers (Nl, No),
// enclosing marks (Me), space separators (Zs), any symbol (Sm, Sc, Sk, So)
// or any punctuation (Pc, Pd, Ps, Pe, Pi, Pf, Po).
func isIdDisAndFreePVal(r rune) bool {
	switch {
	case unicode.In(r, unicode.Lt, unicode.Nl, unicode.No): // other letters / numbers
		return true
	case unicode.In(r, unicode.Me, unicode.Zs): // modifiers and spaces
		return true
	default:
		// unicode.S and unicode.P are the unions of all S* and P* categories.
		return unicode.In(r, unicode.S, unicode.P)
	}
}
// TestFoldData verifies that foldFull produces, for every rune assigned in
// both this package's Unicode version and the core library's, the full case
// folding recorded by runeFoldData.
func TestFoldData(t *testing.T) {
	assigned := rangetable.Assigned(UnicodeVersion)
	coreVersion := rangetable.Assigned(unicode.Version)
	if coreVersion == nil {
		// rangetable.Assigned returns nil when it has no data for a version;
		// passing a nil table to unicode.In would panic. Fall back to the
		// package's own table, as TestMapping does.
		coreVersion = assigned
	}
	// apply runs f on a fresh context for r and returns the text it produced
	// together with the context's cccType, which is used for diagnostics only.
	apply := func(r rune, f func(c *context) bool) (string, info) {
		c := contextFromRune(r)
		f(c)
		return string(c.dst[:c.pDst]), c.info.cccType()
	}
	for r := rune(0); r <= lastRuneForTesting; r++ {
		if !unicode.In(r, assigned) || !unicode.In(r, coreVersion) {
			continue
		}
		x := runeFoldData(r)
		if got, info := apply(r, foldFull); got != x.full {
			t.Errorf("full:%q (%U): got %q %U; want %q %U (ccc=%x)", r, r, got, []rune(got), x.full, []rune(x.full), info)
		}
		// TODO: special and simple.
	}
}
Example #27
0
// writeTables builds the "derivedProperties" trie, which maps each rune to
// its derived property, and generates it through a gen.CodeWriter whose
// WriteGoFile(*outputFile, "precis") call is deferred.
//
// The property of a rune is decided by the first matching case of the switch
// below, so the order of the cases is significant.
func writeTables() {
	propTrie := triegen.NewTrie("derivedProperties")
	w := gen.NewCodeWriter()
	// NOTE: log.Fatal below exits the process without running deferred
	// calls, so nothing is written when trie generation fails.
	defer w.WriteGoFile(*outputFile, "precis")
	gen.WriteUnicodeVersion(w)

	// Iterate over all the runes...
	// NOTE(review): the bound is exclusive, so unicode.MaxRune (U+10FFFF)
	// itself is never inserted — confirm this is intended.
	for i := uint32(0); i < unicode.MaxRune; i++ {
		r := rune(i)

		// Skip values utf8.ValidRune rejects (e.g. surrogate halves).
		if !utf8.ValidRune(r) {
			continue
		}

		p, ok := exceptions[i]
		switch {
		case ok:
			// An explicit exception wins over every derived rule.
		case !unicode.In(r, assigned):
			p = unassigned
		case r >= 33 && r <= 126: // Is ASCII 7
			p = pValid
		case r == 0x200C || r == 0x200D: // Is join control
			p = contextJ
		case unicode.In(r, disallowedRunes, unicode.Cc):
			p = disallowed
		case isHasCompat(r):
			p = idDis | freePVal
		case isLetterDigits(r):
			p = pValid
		case isIdDisAndFreePVal(r):
			p = idDis | freePVal
		default:
			p = disallowed
		}
		propTrie.Insert(r, uint64(p))
	}
	sz, err := propTrie.Gen(w)
	if err != nil {
		log.Fatal(err)
	}
	w.Size += sz
}
Example #28
0
// gettype returns a one-letter class code for c:
//
//	'M' - c is listed in kannumTable
//	'H' - c is in kanjiTable
//	'I' - c is in hiraganaTable
//	'K' - c is in katakanaTable
//	'A' - c is in alphabetTable
//	'N' - c is in numberTable
//	'O' - none of the above
//
// The checks run in that order and the first match wins.
func gettype(c rune) rune {
	for _, kn := range kannumTable {
		if kn == c {
			return 'M'
		}
	}
	if unicode.In(c, kanjiTable) {
		return 'H'
	}
	if unicode.In(c, hiraganaTable) {
		return 'I'
	}
	if unicode.In(c, katakanaTable) {
		return 'K'
	}
	if unicode.In(c, alphabetTable) {
		return 'A'
	}
	if unicode.In(c, numberTable) {
		return 'N'
	}
	return 'O'
}
Example #29
0
// calcStringWidth returns the display width of s in columns, counting runes
// of the Hangul, Katakana, Hiragana and Han scripts as two columns and every
// other rune as one.
func calcStringWidth(s string) int {
	total := 0
	for _, r := range s {
		total++
		if unicode.In(r, unicode.Hangul, unicode.Katakana, unicode.Hiragana, unicode.Han) {
			// Wide rune: one extra column.
			total++
		}
	}
	return total
}
Example #30
0
// ToHiragana converts katakana in text to hiragana and returns the result.
// You should normalize text before converting.
//
// Only katakana with a hiragana counterpart exactly 0x60 code points lower
// are converted: U+30A1..U+30F6 (small a through small ke) and the iteration
// marks U+30FD and U+30FE. Every other rune is copied unchanged, including
// Katakana-script runes with no such counterpart (halfwidth forms, phonetic
// extensions, U+30F7..U+30FA, U+30FF).
func ToHiragana(text string) string {
	var buf bytes.Buffer
	for _, r := range text {
		if unicode.In(r, unicode.Katakana) && hasHiraganaAtOffset(r) {
			// Convert to hiragana
			r -= 0x60
		}
		buf.WriteRune(r)
	}
	return buf.String()
}

// hasHiraganaAtOffset reports whether r-0x60 is the hiragana equivalent of
// the katakana rune r.
func hasHiraganaAtOffset(r rune) bool {
	return (r >= 0x30A1 && r <= 0x30F6) || r == 0x30FD || r == 0x30FE
}