// getWidthData calls f for every entry for which it is defined. // // f may be called multiple times for the same rune. The last call to f is the // correct value. f is not called for all runes. The default tag type is // Neutral. func getWidthData(f func(r rune, tag elem, alt rune)) { // Set the default values for Unified Ideographs. In line with Annex 11, // we encode full ranges instead of the defined runes in Unified_Ideograph. for _, b := range []struct{ lo, hi rune }{ {0x4E00, 0x9FFF}, // the CJK Unified Ideographs block, {0x3400, 0x4DBF}, // the CJK Unified Ideographs Externsion A block, {0xF900, 0xFAFF}, // the CJK Compatibility Ideographs block, {0x20000, 0x2FFFF}, // the Supplementary Ideographic Plane, {0x30000, 0x3FFFF}, // the Tertiary Ideographic Plane, } { for r := b.lo; r <= b.hi; r++ { f(r, tagWide, 0) } } inverse := map[rune]rune{} maps := map[string]bool{ "<wide>": true, "<narrow>": true, } // We cannot reuse package norm's decomposition, as we need an unexpanded // decomposition. We make use of the opportunity to verify that the // decomposition type is as expected. ucd.Parse(gen.OpenUCDFile("UnicodeData.txt"), func(p *ucd.Parser) { r := p.Rune(0) s := strings.SplitN(p.String(ucd.DecompMapping), " ", 2) if !maps[s[0]] { return } x, err := strconv.ParseUint(s[1], 16, 32) if err != nil { log.Fatalf("Error parsing rune %q", s[1]) } if inverse[r] != 0 || inverse[rune(x)] != 0 { log.Fatalf("Circular dependency in mapping between %U and %U", r, x) } inverse[r] = rune(x) inverse[rune(x)] = r }) // <rune range>;<type> ucd.Parse(gen.OpenUCDFile("EastAsianWidth.txt"), func(p *ucd.Parser) { tag, ok := typeMap[p.String(1)] if !ok { log.Fatalf("Unknown width type %q", p.String(1)) } r := p.Rune(0) alt, ok := inverse[r] if tag == tagFullwidth || tag == tagHalfwidth && r != wonSign { tag |= tagNeedsFold if !ok { log.Fatalf("Narrow or wide rune %U has no decomposition", r) } } f(r, tag, alt) }) }
func TestName(t *testing.T) { testtext.SkipIfNotLong(t) wants := make([]string, 1+unicode.MaxRune) ucd.Parse(gen.OpenUCDFile("UnicodeData.txt"), func(p *ucd.Parser) { r, s := p.Rune(0), p.String(ucd.Name) if s == "" { return } if s[0] == '<' { const first = ", First>" if i := strings.Index(s, first); i >= 0 { s = s[:i] + ">" } } wants[r] = s }) nErrors := 0 for r, want := range wants { got := Name(rune(r)) if got != want { t.Errorf("r=%#08x: got %q, want %q", r, got, want) nErrors++ if nErrors == 100 { t.Fatal("too many errors") } } } }
// These tables are hand-extracted from: // http://www.unicode.org/Public/8.0.0/ucd/extracted/DerivedBidiClass.txt func visitDefaults(fn func(r rune, c Class)) { // first write default values for ranges listed above. visitRunes(fn, AL, []rune{ 0x0600, 0x07BF, // Arabic 0x08A0, 0x08FF, // Arabic Extended-A 0xFB50, 0xFDCF, // Arabic Presentation Forms 0xFDF0, 0xFDFF, 0xFE70, 0xFEFF, 0x0001EE00, 0x0001EEFF, // Arabic Mathematical Alpha Symbols }) visitRunes(fn, R, []rune{ 0x0590, 0x05FF, // Hebrew 0x07C0, 0x089F, // Nko et al. 0xFB1D, 0xFB4F, 0x00010800, 0x00010FFF, // Cypriot Syllabary et. al. 0x0001E800, 0x0001EDFF, 0x0001EF00, 0x0001EFFF, }) visitRunes(fn, ET, []rune{ // European Terminator 0x20A0, 0x20Cf, // Currency symbols }) rangetable.Visit(unicode.Noncharacter_Code_Point, func(r rune) { fn(r, BN) // Boundary Neutral }) ucd.Parse(gen.OpenUCDFile("DerivedCoreProperties.txt"), func(p *ucd.Parser) { if p.String(1) == "Default_Ignorable_Code_Point" { fn(p.Rune(0), BN) // Boundary Neutral } }) }
// TestBidiCore performs the tests in BidiTest.txt. // See http://www.unicode.org/Public/UCD/latest/ucd/BidiTest.txt. func TestBidiCore(t *testing.T) { testtext.SkipIfNotLong(t) r := gen.OpenUCDFile("BidiTest.txt") defer r.Close() var wantLevels, wantOrder []string p := ucd.New(r, ucd.Part(func(p *ucd.Parser) { s := strings.Split(p.String(0), ":") switch s[0] { case "Levels": wantLevels = strings.Fields(s[1]) case "Reorder": wantOrder = strings.Fields(s[1]) default: log.Fatalf("Unknown part %q.", s[0]) } })) for p.Next() { types := []class{} for _, s := range p.Strings(0) { types = append(types, bidiClass[s]) } // We ignore the bracketing part of the algorithm. pairTypes := make([]bracketType, len(types)) pairValues := make([]rune, len(types)) for i := uint(0); i < 3; i++ { if p.Uint(1)&(1<<i) == 0 { continue } lev := level(int(i) - 1) par := newParagraph(types, pairTypes, pairValues, lev) if *testLevels { levels := par.resultLevels for i, s := range wantLevels { if s == "x" { continue } l, _ := strconv.ParseUint(s, 10, 8) if level(l)&1 != levels[i]&1 { t.Errorf("%s:%d:levels: got %v; want %v", p.String(0), lev, levels, wantLevels) break } } } order := par.getReordering([]int{len(types)}) gotOrder := filterOrder(types, order) if got, want := fmt.Sprint(gotOrder), fmt.Sprint(wantOrder); got != want { t.Errorf("%s:%d:order: got %v; want %v\noriginal %v", p.String(0), lev, got, want, order) } } } if err := p.Err(); err != nil { log.Fatal(err) } }
func parse(path string, f func(p *ucd.Parser)) { r := gen.OpenUCDFile(path) defer r.Close() p := ucd.New(r) for p.Next() { f(p) } if err := p.Err(); err != nil { log.Fatal(err) } }
// parse calls f for each entry in the given UCD file. func (opts ucdParser) parse(filename string, f func(p *ucd.Parser)) { r := gen.OpenUCDFile(filename) defer r.Close() p := ucd.New(r, opts...) for p.Next() { f(p) } if err := p.Err(); err != nil { log.Fatal(err) } }
func TestTables(t *testing.T) { testtext.SkipIfNotLong(t) ucd.Parse(gen.OpenUCDFile("BidiBrackets.txt"), func(p *ucd.Parser) { r1 := p.Rune(0) want := p.Rune(1) e, _ := LookupRune(r1) if got := e.reverseBracket(r1); got != want { t.Errorf("Reverse(%U) = %U; want %U", r1, got, want) } }) done := map[rune]bool{} test := func(name string, r rune, want string) { str := string(r) e, _ := LookupString(str) if got := labels[e.Class()]; got != want { t.Errorf("%s:%U: got %s; want %s", name, r, got, want) } if e2, sz := LookupRune(r); e != e2 || sz != len(str) { t.Errorf("LookupRune(%U) = %v, %d; want %v, %d", r, e2, e, sz, len(str)) } if e2, sz := Lookup([]byte(str)); e != e2 || sz != len(str) { t.Errorf("Lookup(%U) = %v, %d; want %v, %d", r, e2, e, sz, len(str)) } done[r] = true } // Insert the derived BiDi properties. ucd.Parse(gen.OpenUCDFile("extracted/DerivedBidiClass.txt"), func(p *ucd.Parser) { r := p.Rune(0) test("derived", r, p.String(1)) }) visitDefaults(func(r rune, c Class) { if !done[r] { test("default", r, labels[c]) } }) }
// Use values in DerivedNormalizationProps.txt to compare against the // values we computed. // DerivedNormalizationProps.txt has form: // 00C0..00C5 ; NFD_QC; N # ... // 0374 ; NFD_QC; N # ... // See http://unicode.org/reports/tr44/ for full explanation func testDerived() { f := gen.OpenUCDFile("DerivedNormalizationProps.txt") defer f.Close() p := ucd.New(f) for p.Next() { r := p.Rune(0) c := &chars[r] var ftype, mode int qt := p.String(1) switch qt { case "NFC_QC": ftype, mode = FCanonical, MComposed case "NFD_QC": ftype, mode = FCanonical, MDecomposed case "NFKC_QC": ftype, mode = FCompatibility, MComposed case "NFKD_QC": ftype, mode = FCompatibility, MDecomposed default: continue } var qr QCResult switch p.String(2) { case "Y": qr = QCYes case "N": qr = QCNo case "M": qr = QCMaybe default: log.Fatalf(`Unexpected quick check value "%s"`, p.String(2)) } if got := c.forms[ftype].quickCheck[mode]; got != qr { log.Printf("%U: FAILED %s (was %v need %v)\n", r, qt, got, qr) } c.forms[ftype].verified[mode] = true } if err := p.Err(); err != nil { log.Fatal(err) } // Any unspecified value must be QCYes. Verify this. for i, c := range chars { for j, fd := range c.forms { for k, qr := range fd.quickCheck { if !fd.verified[k] && qr != QCYes { m := "%U: FAIL F:%d M:%d (was %v need Yes) %s\n" log.Printf(m, i, j, k, qr, c.name) } } } } }
func TestTables(t *testing.T) { if !*long { return } gen.Init() trie := newBidiTrie(0) ucd.Parse(gen.OpenUCDFile("BidiBrackets.txt"), func(p *ucd.Parser) { r1 := p.Rune(0) want := p.Rune(1) e, _ := trie.lookupString(string(r1)) if got := entry(e).reverseBracket(r1); got != want { t.Errorf("Reverse(%U) = %U; want %U", r1, got, want) } }) done := map[rune]bool{} test := func(name string, r rune, want string) { e, _ := trie.lookupString(string(r)) if got := labels[entry(e).class(r)]; got != want { t.Errorf("%s:%U: got %s; want %s", name, r, got, want) } done[r] = true } // Insert the derived BiDi properties. ucd.Parse(gen.OpenUCDFile("extracted/DerivedBidiClass.txt"), func(p *ucd.Parser) { r := p.Rune(0) test("derived", r, p.String(1)) }) visitDefaults(func(r rune, c class) { if !done[r] { test("default", r, labels[c]) } }) }
func main() { gen.Init() // Load data runes := []rune{} ucd.Parse(gen.OpenUCDFile("DerivedCoreProperties.txt"), func(p *ucd.Parser) { if p.String(1) == "Default_Ignorable_Code_Point" { runes = append(runes, p.Rune(0)) } }) ucd.Parse(gen.OpenUCDFile("HangulSyllableType.txt"), func(p *ucd.Parser) { if p.String(1) == "LVT" { runes = append(runes, p.Rune(0)) } }) disallowedRunes = rangetable.New(runes...) assigned = rangetable.Assigned(unicode.Version) writeTables() gen.Repackage("gen_trieval.go", "trieval.go", "precis") }
// Load the data form NormalizationTest.txt func loadTestData(t *testing.T) { f := gen.OpenUCDFile("NormalizationTest.txt") defer f.Close() scanner := bufio.NewScanner(f) for scanner.Scan() { line := scanner.Text() if len(line) == 0 || line[0] == '#' { continue } m := partRe.FindStringSubmatch(line) if m != nil { if len(m) < 3 { t.Fatal("Failed to parse Part: ", line) } i, err := strconv.Atoi(m[1]) if err != nil { t.Fatal(err) } name := m[2] part = append(part, Part{name: name[:len(name)-1], number: i}) continue } m = testRe.FindStringSubmatch(line) if m == nil || len(m) < 7 { t.Fatalf(`Failed to parse: "%s" result: %#v`, line, m) } test := Test{name: m[6], partnr: len(part) - 1, number: counter} counter++ for j := 1; j < len(m)-1; j++ { for _, split := range strings.Split(m[j], " ") { r, err := strconv.ParseUint(split, 16, 64) if err != nil { t.Fatal(err) } if test.r == 0 { // save for CharacterByCharacterTests test.r = rune(r) } var buf [utf8.UTFMax]byte sz := utf8.EncodeRune(buf[:], rune(r)) test.cols[j-1] += string(buf[:sz]) } } part := &part[len(part)-1] part.tests = append(part.tests, test) } if scanner.Err() != nil { t.Fatal(scanner.Err()) } }
// CompositionExclusions.txt has form: // 0958 # ... // See http://unicode.org/reports/tr44/ for full explanation func loadCompositionExclusions() { f := gen.OpenUCDFile("CompositionExclusions.txt") defer f.Close() p := ucd.New(f) for p.Next() { c := &chars[p.Rune(0)] if c.excludeInComp { log.Fatalf("%U: Duplicate entry in exclusions.", c.codePoint) } c.excludeInComp = true } if e := p.Err(); e != nil { log.Fatal(e) } }
func parse() (names []string, counts map[string]int) { names = make([]string, 1+unicode.MaxRune) counts = map[string]int{} ucd.Parse(gen.OpenUCDFile("UnicodeData.txt"), func(p *ucd.Parser) { r, s := p.Rune(0), p.String(ucd.Name) if s == "" { return } if s[0] == '<' { const first = ", First>" if i := strings.Index(s, first); i >= 0 { s = s[:i] + ">" } } names[r] = s counts[s]++ }) return names, counts }
func loadUnicodeData() { f := gen.OpenUCDFile("UnicodeData.txt") defer f.Close() p := ucd.New(f) for p.Next() { r := p.Rune(ucd.CodePoint) char := &chars[r] char.ccc = uint8(p.Uint(ucd.CanonicalCombiningClass)) decmap := p.String(ucd.DecompMapping) exp, err := parseDecomposition(decmap, false) isCompat := false if err != nil { if len(decmap) > 0 { exp, err = parseDecomposition(decmap, true) if err != nil { log.Fatalf(`%U: bad decomp |%v|: "%s"`, r, decmap, err) } isCompat = true } } char.name = p.String(ucd.Name) char.codePoint = r char.forms[FCompatibility].decomp = exp if !isCompat { char.forms[FCanonical].decomp = exp } else { char.compatDecomp = true } if len(decmap) > 0 { char.forms[FCompatibility].decomp = exp } } if err := p.Err(); err != nil { log.Fatal(err) } }
// TestBidiCharacters performs the tests in BidiCharacterTest.txt. // See http://www.unicode.org/Public/UCD/latest/ucd/BidiCharacterTest.txt func TestBidiCharacters(t *testing.T) { testtext.SkipIfNotLong(t) ucd.Parse(gen.OpenUCDFile("BidiCharacterTest.txt"), func(p *ucd.Parser) { var ( types []class pairTypes []bracketType pairValues []rune parLevel level wantLevel = level(p.Int(2)) wantLevels = p.Strings(3) wantVisualOrder = p.Strings(4) ) switch l := p.Int(1); l { case 0, 1: parLevel = level(l) case 2: parLevel = implicitLevel default: // Spec says to ignore unknown parts. } trie := newBidiTrie(0) runes := p.Runes(0) for _, r := range runes { // Assign the bracket type. if d := norm.NFKD.PropertiesString(string(r)).Decomposition(); d != nil { r = []rune(string(d))[0] } e, _ := trie.lookupString(string(r)) entry := entry(e) // Assign the class for this rune. types = append(types, entry.class(r)) switch { case !entry.isBracket(): pairTypes = append(pairTypes, bpNone) pairValues = append(pairValues, 0) case entry.isOpen(): pairTypes = append(pairTypes, bpOpen) pairValues = append(pairValues, r) default: pairTypes = append(pairTypes, bpClose) pairValues = append(pairValues, entry.reverseBracket(r)) } } par := newParagraph(types, pairTypes, pairValues, parLevel) // Test results: if got := par.embeddingLevel; got != wantLevel { t.Errorf("%v:level: got %d; want %d", string(runes), got, wantLevel) } if *testLevels { gotLevels := getLevelStrings(types, par.resultLevels) if got, want := fmt.Sprint(gotLevels), fmt.Sprint(wantLevels); got != want { t.Errorf("%04X %q:%d: got %v; want %v\nval: %x\npair: %v", runes, string(runes), parLevel, got, want, pairValues, pairTypes) } } order := par.getReordering([]int{len(types)}) order = filterOrder(types, order) if got, want := fmt.Sprint(order), fmt.Sprint(wantVisualOrder); got != want { t.Errorf("%04X %q:%d: got %v; want %v\ngot order: %s", runes, string(runes), parLevel, got, want, reorder(runes, order)) } }) }
func genTables() { if numClass > 0x0F { log.Fatalf("Too many Class constants (%#x > 0x0F).", numClass) } w := gen.NewCodeWriter() defer w.WriteGoFile(*outputFile, "bidi") gen.WriteUnicodeVersion(w) t := triegen.NewTrie("bidi") // Build data about bracket mapping. These bits need to be or-ed with // any other bits. orMask := map[rune]uint64{} xorMap := map[rune]int{} xorMasks := []rune{0} // First value is no-op. ucd.Parse(gen.OpenUCDFile("BidiBrackets.txt"), func(p *ucd.Parser) { r1 := p.Rune(0) r2 := p.Rune(1) xor := r1 ^ r2 if _, ok := xorMap[xor]; !ok { xorMap[xor] = len(xorMasks) xorMasks = append(xorMasks, xor) } entry := uint64(xorMap[xor]) << xorMaskShift switch p.String(2) { case "o": entry |= openMask case "c", "n": default: log.Fatalf("Unknown bracket class %q.", p.String(2)) } orMask[r1] = entry }) w.WriteComment(` xorMasks contains masks to be xor-ed with brackets to get the reverse version.`) w.WriteVar("xorMasks", xorMasks) done := map[rune]bool{} insert := func(r rune, c class) { if !done[r] { t.Insert(r, orMask[r]|uint64(c)) done[r] = true } } // Insert the derived BiDi properties. ucd.Parse(gen.OpenUCDFile("extracted/DerivedBidiClass.txt"), func(p *ucd.Parser) { r := p.Rune(0) class, ok := bidiClass[p.String(1)] if !ok { log.Fatalf("%U: Unknown BiDi class %q", r, p.String(1)) } insert(r, class) }) visitDefaults(insert) // TODO: use sparse blocks. This would reduce table size considerably // from the looks of it. sz, err := t.Gen(w) if err != nil { log.Fatal(err) } w.Size += sz }
func TestTables(t *testing.T) { testtext.SkipIfNotLong(t) lookup := func(r rune) info { v, _ := trie.lookupString(string(r)) return info(v) } ucd.Parse(gen.OpenUnicodeFile("idna", "", "IdnaMappingTable.txt"), func(p *ucd.Parser) { r := p.Rune(0) x := lookup(r) if got, want := x.category(), catFromEntry(p); got != want { t.Errorf("%U:category: got %x; want %x", r, got, want) } mapped := false switch p.String(1) { case "mapped", "disallowed_STD3_mapped", "deviation": mapped = true } if x.isMapped() != mapped { t.Errorf("%U:isMapped: got %v; want %v", r, x.isMapped(), mapped) } if !mapped { return } want := string(p.Runes(2)) got := string(x.appendMapping(nil, string(r))) if got != want { t.Errorf("%U:mapping: got %+q; want %+q", r, got, want) } if x.isMapped() { return } wantMark := unicode.In(r, unicode.Mark) gotMark := x.isModifier() if gotMark != wantMark { t.Errorf("IsMark(%U) = %v; want %v", r, gotMark, wantMark) } }) ucd.Parse(gen.OpenUCDFile("UnicodeData.txt"), func(p *ucd.Parser) { r := p.Rune(0) x := lookup(r) got := x.isViramaModifier() const cccVirama = 9 want := p.Int(ucd.CanonicalCombiningClass) == cccVirama if got != want { t.Errorf("IsVirama(%U) = %v; want %v", r, got, want) } }) ucd.Parse(gen.OpenUCDFile("extracted/DerivedJoiningType.txt"), func(p *ucd.Parser) { r := p.Rune(0) x := lookup(r) if x.isMapped() { return } got := x.joinType() want := joinType[p.String(1)] if got != want { t.Errorf("JoinType(%U) = %x; want %x", r, got, want) } }) }
// parse opens the named UCD file and hands it, together with f, to ucd.Parse.
func parse(file string, f func(p *ucd.Parser)) {
	r := gen.OpenUCDFile(file)
	ucd.Parse(r, f)
}
func main() { gen.Init() // Load data runes := []rune{} // PrecisIgnorableProperties: https://tools.ietf.org/html/rfc7564#section-9.13 ucd.Parse(gen.OpenUCDFile("DerivedCoreProperties.txt"), func(p *ucd.Parser) { if p.String(1) == "Default_Ignorable_Code_Point" { runes = append(runes, p.Rune(0)) } }) ucd.Parse(gen.OpenUCDFile("PropList.txt"), func(p *ucd.Parser) { switch p.String(1) { case "Noncharacter_Code_Point": runes = append(runes, p.Rune(0)) } }) // OldHangulJamo: https://tools.ietf.org/html/rfc5892#section-2.9 ucd.Parse(gen.OpenUCDFile("HangulSyllableType.txt"), func(p *ucd.Parser) { switch p.String(1) { case "L", "V", "T": runes = append(runes, p.Rune(0)) } }) disallowedRunes = rangetable.New(runes...) assigned = rangetable.Assigned(unicode.Version) // Load category data. runeCategory['l'] = latinSmallL ucd.Parse(gen.OpenUCDFile("UnicodeData.txt"), func(p *ucd.Parser) { const cccVirama = 9 if p.Int(ucd.CanonicalCombiningClass) == cccVirama { setCategory(p.Rune(0), viramaModifier) } }) ucd.Parse(gen.OpenUCDFile("Scripts.txt"), func(p *ucd.Parser) { switch p.String(1) { case "Greek": setCategory(p.Rune(0), greek) case "Hebrew": setCategory(p.Rune(0), hebrew) case "Hiragana", "Katakana", "Han": setCategory(p.Rune(0), japanese) } }) // Set the rule categories associated with exceptions. This overrides any // previously set categories. The original categories are manually // reintroduced in the categoryTransitions table. for r, e := range exceptions { if e.cat != 0 { runeCategory[r] = e.cat } } cat := map[string]category{ "L": joiningL, "D": joiningD, "T": joiningT, "R": joiningR, } ucd.Parse(gen.OpenUCDFile("extracted/DerivedJoiningType.txt"), func(p *ucd.Parser) { switch v := p.String(1); v { case "L", "D", "T", "R": setCategory(p.Rune(0), cat[v]) } }) writeTables() gen.Repackage("gen_trieval.go", "trieval.go", "precis") }
func genTables() { t := triegen.NewTrie("idna") ucd.Parse(gen.OpenUCDFile("UnicodeData.txt"), func(p *ucd.Parser) { r := p.Rune(0) const cccVirama = 9 if p.Int(ucd.CanonicalCombiningClass) == cccVirama { runes[p.Rune(0)] = viramaModifier } switch { case unicode.In(r, unicode.Mark): runes[r] |= modifier } }) ucd.Parse(gen.OpenUCDFile("extracted/DerivedJoiningType.txt"), func(p *ucd.Parser) { switch v := p.String(1); v { case "L", "D", "T", "R": runes[p.Rune(0)] |= joinType[v] << joinShift } }) ucd.Parse(gen.OpenUnicodeFile("idna", "", "IdnaMappingTable.txt"), func(p *ucd.Parser) { r := p.Rune(0) // The mappings table explicitly defines surrogates as invalid. if !utf8.ValidRune(r) { return } cat := catFromEntry(p) isMapped := cat == mapped || cat == disallowedSTD3Mapped || cat == deviation if !isMapped { // Only include additional category information for non-mapped // runes. The additional information is only used after mapping and // the bits would clash with mapping information. // TODO: it would be possible to inline this data and avoid // additional lookups. This is quite tedious, though, so let's first // see if we need this. cat |= category(runes[r]) } s := string(p.Runes(2)) if s != "" && !isMapped { log.Fatalf("%U: Mapping with non-mapping category %d", r, cat) } t.Insert(r, uint64(makeEntry(r, s))+uint64(cat)) }) w := gen.NewCodeWriter() defer w.WriteGoFile("tables.go", "idna") gen.WriteUnicodeVersion(w) w.WriteVar("mappings", string(mappings)) w.WriteVar("xorData", string(xorData)) sz, err := t.Gen(w, triegen.Compact(&normCompacter{})) if err != nil { log.Fatal(err) } w.Size += sz }