// getWidthData calls f for every entry for which it is defined. // // f may be called multiple times for the same rune. The last call to f is the // correct value. f is not called for all runes. The default tag type is // Neutral. func getWidthData(f func(r rune, tag elem, alt rune)) { // Set the default values for Unified Ideographs. In line with Annex 11, // we encode full ranges instead of the defined runes in Unified_Ideograph. for _, b := range []struct{ lo, hi rune }{ {0x4E00, 0x9FFF}, // the CJK Unified Ideographs block, {0x3400, 0x4DBF}, // the CJK Unified Ideographs Externsion A block, {0xF900, 0xFAFF}, // the CJK Compatibility Ideographs block, {0x20000, 0x2FFFF}, // the Supplementary Ideographic Plane, {0x30000, 0x3FFFF}, // the Tertiary Ideographic Plane, } { for r := b.lo; r <= b.hi; r++ { f(r, tagWide, 0) } } inverse := map[rune]rune{} maps := map[string]bool{ "<wide>": true, "<narrow>": true, } // We cannot reuse package norm's decomposition, as we need an unexpanded // decomposition. We make use of the opportunity to verify that the // decomposition type is as expected. ucd.Parse(gen.OpenUCDFile("UnicodeData.txt"), func(p *ucd.Parser) { r := p.Rune(0) s := strings.SplitN(p.String(ucd.DecompMapping), " ", 2) if !maps[s[0]] { return } x, err := strconv.ParseUint(s[1], 16, 32) if err != nil { log.Fatalf("Error parsing rune %q", s[1]) } if inverse[r] != 0 || inverse[rune(x)] != 0 { log.Fatalf("Circular dependency in mapping between %U and %U", r, x) } inverse[r] = rune(x) inverse[rune(x)] = r }) // <rune range>;<type> ucd.Parse(gen.OpenUCDFile("EastAsianWidth.txt"), func(p *ucd.Parser) { tag, ok := typeMap[p.String(1)] if !ok { log.Fatalf("Unknown width type %q", p.String(1)) } r := p.Rune(0) alt, ok := inverse[r] if tag == tagFullwidth || tag == tagHalfwidth && r != wonSign { tag |= tagNeedsFold if !ok { log.Fatalf("Narrow or wide rune %U has no decomposition", r) } } f(r, tag, alt) }) }
// These tables are hand-extracted from: // http://www.unicode.org/Public/8.0.0/ucd/extracted/DerivedBidiClass.txt func visitDefaults(fn func(r rune, c Class)) { // first write default values for ranges listed above. visitRunes(fn, AL, []rune{ 0x0600, 0x07BF, // Arabic 0x08A0, 0x08FF, // Arabic Extended-A 0xFB50, 0xFDCF, // Arabic Presentation Forms 0xFDF0, 0xFDFF, 0xFE70, 0xFEFF, 0x0001EE00, 0x0001EEFF, // Arabic Mathematical Alpha Symbols }) visitRunes(fn, R, []rune{ 0x0590, 0x05FF, // Hebrew 0x07C0, 0x089F, // Nko et al. 0xFB1D, 0xFB4F, 0x00010800, 0x00010FFF, // Cypriot Syllabary et. al. 0x0001E800, 0x0001EDFF, 0x0001EF00, 0x0001EFFF, }) visitRunes(fn, ET, []rune{ // European Terminator 0x20A0, 0x20Cf, // Currency symbols }) rangetable.Visit(unicode.Noncharacter_Code_Point, func(r rune) { fn(r, BN) // Boundary Neutral }) ucd.Parse(gen.OpenUCDFile("DerivedCoreProperties.txt"), func(p *ucd.Parser) { if p.String(1) == "Default_Ignorable_Code_Point" { fn(p.Rune(0), BN) // Boundary Neutral } }) }
func TestName(t *testing.T) { testtext.SkipIfNotLong(t) wants := make([]string, 1+unicode.MaxRune) ucd.Parse(gen.OpenUCDFile("UnicodeData.txt"), func(p *ucd.Parser) { r, s := p.Rune(0), p.String(ucd.Name) if s == "" { return } if s[0] == '<' { const first = ", First>" if i := strings.Index(s, first); i >= 0 { s = s[:i] + ">" } } wants[r] = s }) nErrors := 0 for r, want := range wants { got := Name(rune(r)) if got != want { t.Errorf("r=%#08x: got %q, want %q", r, got, want) nErrors++ if nErrors == 100 { t.Fatal("too many errors") } } } }
func TestTables(t *testing.T) { testtext.SkipIfNotLong(t) ucd.Parse(gen.OpenUCDFile("BidiBrackets.txt"), func(p *ucd.Parser) { r1 := p.Rune(0) want := p.Rune(1) e, _ := LookupRune(r1) if got := e.reverseBracket(r1); got != want { t.Errorf("Reverse(%U) = %U; want %U", r1, got, want) } }) done := map[rune]bool{} test := func(name string, r rune, want string) { str := string(r) e, _ := LookupString(str) if got := labels[e.Class()]; got != want { t.Errorf("%s:%U: got %s; want %s", name, r, got, want) } if e2, sz := LookupRune(r); e != e2 || sz != len(str) { t.Errorf("LookupRune(%U) = %v, %d; want %v, %d", r, e2, e, sz, len(str)) } if e2, sz := Lookup([]byte(str)); e != e2 || sz != len(str) { t.Errorf("Lookup(%U) = %v, %d; want %v, %d", r, e2, e, sz, len(str)) } done[r] = true } // Insert the derived BiDi properties. ucd.Parse(gen.OpenUCDFile("extracted/DerivedBidiClass.txt"), func(p *ucd.Parser) { r := p.Rune(0) test("derived", r, p.String(1)) }) visitDefaults(func(r rune, c Class) { if !done[r] { test("default", r, labels[c]) } }) }
func TestTables(t *testing.T) { if !*long { return } gen.Init() trie := newBidiTrie(0) ucd.Parse(gen.OpenUCDFile("BidiBrackets.txt"), func(p *ucd.Parser) { r1 := p.Rune(0) want := p.Rune(1) e, _ := trie.lookupString(string(r1)) if got := entry(e).reverseBracket(r1); got != want { t.Errorf("Reverse(%U) = %U; want %U", r1, got, want) } }) done := map[rune]bool{} test := func(name string, r rune, want string) { e, _ := trie.lookupString(string(r)) if got := labels[entry(e).class(r)]; got != want { t.Errorf("%s:%U: got %s; want %s", name, r, got, want) } done[r] = true } // Insert the derived BiDi properties. ucd.Parse(gen.OpenUCDFile("extracted/DerivedBidiClass.txt"), func(p *ucd.Parser) { r := p.Rune(0) test("derived", r, p.String(1)) }) visitDefaults(func(r rune, c class) { if !done[r] { test("default", r, labels[c]) } }) }
func main() { gen.Init() // Load data runes := []rune{} ucd.Parse(gen.OpenUCDFile("DerivedCoreProperties.txt"), func(p *ucd.Parser) { if p.String(1) == "Default_Ignorable_Code_Point" { runes = append(runes, p.Rune(0)) } }) ucd.Parse(gen.OpenUCDFile("HangulSyllableType.txt"), func(p *ucd.Parser) { if p.String(1) == "LVT" { runes = append(runes, p.Rune(0)) } }) disallowedRunes = rangetable.New(runes...) assigned = rangetable.Assigned(unicode.Version) writeTables() gen.Repackage("gen_trieval.go", "trieval.go", "precis") }
func main() { gen.Init() versions := getVersions() w := &bytes.Buffer{} fmt.Fprintf(w, "//go:generate go run gen.go --versions=%s\n\n", strings.Join(versions, ",")) fmt.Fprintf(w, "import \"unicode\"\n\n") vstr := func(s string) string { return strings.Replace(s, ".", "_", -1) } fmt.Fprintf(w, "var assigned = map[string]*unicode.RangeTable{\n") for _, v := range versions { fmt.Fprintf(w, "\t%q: assigned%s,\n", v, vstr(v)) } fmt.Fprintf(w, "}\n\n") var size int for _, v := range versions { assigned := []rune{} r := gen.Open("http://www.unicode.org/Public/", "", v+"/ucd/UnicodeData.txt") ucd.Parse(r, func(p *ucd.Parser) { assigned = append(assigned, p.Rune(0)) }) rt := rangetable.New(assigned...) sz := int(reflect.TypeOf(unicode.RangeTable{}).Size()) sz += int(reflect.TypeOf(unicode.Range16{}).Size()) * len(rt.R16) sz += int(reflect.TypeOf(unicode.Range32{}).Size()) * len(rt.R32) fmt.Fprintf(w, "// size %d bytes (%d KiB)\n", sz, sz/1024) fmt.Fprintf(w, "var assigned%s = ", vstr(v)) print(w, rt) size += sz } fmt.Fprintf(w, "// Total size %d bytes (%d KiB)\n", size, size/1024) gen.WriteGoFile("tables.go", "rangetable", w.Bytes()) }
func parse() (names []string, counts map[string]int) { names = make([]string, 1+unicode.MaxRune) counts = map[string]int{} ucd.Parse(gen.OpenUCDFile("UnicodeData.txt"), func(p *ucd.Parser) { r, s := p.Rune(0), p.String(ucd.Name) if s == "" { return } if s[0] == '<' { const first = ", First>" if i := strings.Index(s, first); i >= 0 { s = s[:i] + ">" } } names[r] = s counts[s]++ }) return names, counts }
// TestBidiCharacters performs the tests in BidiCharacterTest.txt. // See http://www.unicode.org/Public/UCD/latest/ucd/BidiCharacterTest.txt func TestBidiCharacters(t *testing.T) { testtext.SkipIfNotLong(t) ucd.Parse(gen.OpenUCDFile("BidiCharacterTest.txt"), func(p *ucd.Parser) { var ( types []class pairTypes []bracketType pairValues []rune parLevel level wantLevel = level(p.Int(2)) wantLevels = p.Strings(3) wantVisualOrder = p.Strings(4) ) switch l := p.Int(1); l { case 0, 1: parLevel = level(l) case 2: parLevel = implicitLevel default: // Spec says to ignore unknown parts. } trie := newBidiTrie(0) runes := p.Runes(0) for _, r := range runes { // Assign the bracket type. if d := norm.NFKD.PropertiesString(string(r)).Decomposition(); d != nil { r = []rune(string(d))[0] } e, _ := trie.lookupString(string(r)) entry := entry(e) // Assign the class for this rune. types = append(types, entry.class(r)) switch { case !entry.isBracket(): pairTypes = append(pairTypes, bpNone) pairValues = append(pairValues, 0) case entry.isOpen(): pairTypes = append(pairTypes, bpOpen) pairValues = append(pairValues, r) default: pairTypes = append(pairTypes, bpClose) pairValues = append(pairValues, entry.reverseBracket(r)) } } par := newParagraph(types, pairTypes, pairValues, parLevel) // Test results: if got := par.embeddingLevel; got != wantLevel { t.Errorf("%v:level: got %d; want %d", string(runes), got, wantLevel) } if *testLevels { gotLevels := getLevelStrings(types, par.resultLevels) if got, want := fmt.Sprint(gotLevels), fmt.Sprint(wantLevels); got != want { t.Errorf("%04X %q:%d: got %v; want %v\nval: %x\npair: %v", runes, string(runes), parLevel, got, want, pairValues, pairTypes) } } order := par.getReordering([]int{len(types)}) order = filterOrder(types, order) if got, want := fmt.Sprint(order), fmt.Sprint(wantVisualOrder); got != want { t.Errorf("%04X %q:%d: got %v; want %v\ngot order: %s", runes, string(runes), parLevel, got, want, reorder(runes, order)) } }) }
func TestTables(t *testing.T) { testtext.SkipIfNotLong(t) lookup := func(r rune) info { v, _ := trie.lookupString(string(r)) return info(v) } ucd.Parse(gen.OpenUnicodeFile("idna", "", "IdnaMappingTable.txt"), func(p *ucd.Parser) { r := p.Rune(0) x := lookup(r) if got, want := x.category(), catFromEntry(p); got != want { t.Errorf("%U:category: got %x; want %x", r, got, want) } mapped := false switch p.String(1) { case "mapped", "disallowed_STD3_mapped", "deviation": mapped = true } if x.isMapped() != mapped { t.Errorf("%U:isMapped: got %v; want %v", r, x.isMapped(), mapped) } if !mapped { return } want := string(p.Runes(2)) got := string(x.appendMapping(nil, string(r))) if got != want { t.Errorf("%U:mapping: got %+q; want %+q", r, got, want) } if x.isMapped() { return } wantMark := unicode.In(r, unicode.Mark) gotMark := x.isModifier() if gotMark != wantMark { t.Errorf("IsMark(%U) = %v; want %v", r, gotMark, wantMark) } }) ucd.Parse(gen.OpenUCDFile("UnicodeData.txt"), func(p *ucd.Parser) { r := p.Rune(0) x := lookup(r) got := x.isViramaModifier() const cccVirama = 9 want := p.Int(ucd.CanonicalCombiningClass) == cccVirama if got != want { t.Errorf("IsVirama(%U) = %v; want %v", r, got, want) } }) ucd.Parse(gen.OpenUCDFile("extracted/DerivedJoiningType.txt"), func(p *ucd.Parser) { r := p.Rune(0) x := lookup(r) if x.isMapped() { return } got := x.joinType() want := joinType[p.String(1)] if got != want { t.Errorf("JoinType(%U) = %x; want %x", r, got, want) } }) }
// parse opens the named UCD file with gen.OpenUCDFile and passes it to
// ucd.Parse together with the callback f.
func parse(file string, f func(p *ucd.Parser)) {
	data := gen.OpenUCDFile(file)
	ucd.Parse(data, f)
}
func genTables() { t := triegen.NewTrie("idna") ucd.Parse(gen.OpenUCDFile("UnicodeData.txt"), func(p *ucd.Parser) { r := p.Rune(0) const cccVirama = 9 if p.Int(ucd.CanonicalCombiningClass) == cccVirama { runes[p.Rune(0)] = viramaModifier } switch { case unicode.In(r, unicode.Mark): runes[r] |= modifier } }) ucd.Parse(gen.OpenUCDFile("extracted/DerivedJoiningType.txt"), func(p *ucd.Parser) { switch v := p.String(1); v { case "L", "D", "T", "R": runes[p.Rune(0)] |= joinType[v] << joinShift } }) ucd.Parse(gen.OpenUnicodeFile("idna", "", "IdnaMappingTable.txt"), func(p *ucd.Parser) { r := p.Rune(0) // The mappings table explicitly defines surrogates as invalid. if !utf8.ValidRune(r) { return } cat := catFromEntry(p) isMapped := cat == mapped || cat == disallowedSTD3Mapped || cat == deviation if !isMapped { // Only include additional category information for non-mapped // runes. The additional information is only used after mapping and // the bits would clash with mapping information. // TODO: it would be possible to inline this data and avoid // additional lookups. This is quite tedious, though, so let's first // see if we need this. cat |= category(runes[r]) } s := string(p.Runes(2)) if s != "" && !isMapped { log.Fatalf("%U: Mapping with non-mapping category %d", r, cat) } t.Insert(r, uint64(makeEntry(r, s))+uint64(cat)) }) w := gen.NewCodeWriter() defer w.WriteGoFile("tables.go", "idna") gen.WriteUnicodeVersion(w) w.WriteVar("mappings", string(mappings)) w.WriteVar("xorData", string(xorData)) sz, err := t.Gen(w, triegen.Compact(&normCompacter{})) if err != nil { log.Fatal(err) } w.Size += sz }
func main() { gen.Init() // Load data runes := []rune{} // PrecisIgnorableProperties: https://tools.ietf.org/html/rfc7564#section-9.13 ucd.Parse(gen.OpenUCDFile("DerivedCoreProperties.txt"), func(p *ucd.Parser) { if p.String(1) == "Default_Ignorable_Code_Point" { runes = append(runes, p.Rune(0)) } }) ucd.Parse(gen.OpenUCDFile("PropList.txt"), func(p *ucd.Parser) { switch p.String(1) { case "Noncharacter_Code_Point": runes = append(runes, p.Rune(0)) } }) // OldHangulJamo: https://tools.ietf.org/html/rfc5892#section-2.9 ucd.Parse(gen.OpenUCDFile("HangulSyllableType.txt"), func(p *ucd.Parser) { switch p.String(1) { case "L", "V", "T": runes = append(runes, p.Rune(0)) } }) disallowedRunes = rangetable.New(runes...) assigned = rangetable.Assigned(unicode.Version) // Load category data. runeCategory['l'] = latinSmallL ucd.Parse(gen.OpenUCDFile("UnicodeData.txt"), func(p *ucd.Parser) { const cccVirama = 9 if p.Int(ucd.CanonicalCombiningClass) == cccVirama { setCategory(p.Rune(0), viramaModifier) } }) ucd.Parse(gen.OpenUCDFile("Scripts.txt"), func(p *ucd.Parser) { switch p.String(1) { case "Greek": setCategory(p.Rune(0), greek) case "Hebrew": setCategory(p.Rune(0), hebrew) case "Hiragana", "Katakana", "Han": setCategory(p.Rune(0), japanese) } }) // Set the rule categories associated with exceptions. This overrides any // previously set categories. The original categories are manually // reintroduced in the categoryTransitions table. for r, e := range exceptions { if e.cat != 0 { runeCategory[r] = e.cat } } cat := map[string]category{ "L": joiningL, "D": joiningD, "T": joiningT, "R": joiningR, } ucd.Parse(gen.OpenUCDFile("extracted/DerivedJoiningType.txt"), func(p *ucd.Parser) { switch v := p.String(1); v { case "L", "D", "T", "R": setCategory(p.Rune(0), cat[v]) } }) writeTables() gen.Repackage("gen_trieval.go", "trieval.go", "precis") }
func genTables() { if numClass > 0x0F { log.Fatalf("Too many Class constants (%#x > 0x0F).", numClass) } w := gen.NewCodeWriter() defer w.WriteGoFile(*outputFile, "bidi") gen.WriteUnicodeVersion(w) t := triegen.NewTrie("bidi") // Build data about bracket mapping. These bits need to be or-ed with // any other bits. orMask := map[rune]uint64{} xorMap := map[rune]int{} xorMasks := []rune{0} // First value is no-op. ucd.Parse(gen.OpenUCDFile("BidiBrackets.txt"), func(p *ucd.Parser) { r1 := p.Rune(0) r2 := p.Rune(1) xor := r1 ^ r2 if _, ok := xorMap[xor]; !ok { xorMap[xor] = len(xorMasks) xorMasks = append(xorMasks, xor) } entry := uint64(xorMap[xor]) << xorMaskShift switch p.String(2) { case "o": entry |= openMask case "c", "n": default: log.Fatalf("Unknown bracket class %q.", p.String(2)) } orMask[r1] = entry }) w.WriteComment(` xorMasks contains masks to be xor-ed with brackets to get the reverse version.`) w.WriteVar("xorMasks", xorMasks) done := map[rune]bool{} insert := func(r rune, c class) { if !done[r] { t.Insert(r, orMask[r]|uint64(c)) done[r] = true } } // Insert the derived BiDi properties. ucd.Parse(gen.OpenUCDFile("extracted/DerivedBidiClass.txt"), func(p *ucd.Parser) { r := p.Rune(0) class, ok := bidiClass[p.String(1)] if !ok { log.Fatalf("%U: Unknown BiDi class %q", r, p.String(1)) } insert(r, class) }) visitDefaults(insert) // TODO: use sparse blocks. This would reduce table size considerably // from the looks of it. sz, err := t.Gen(w) if err != nil { log.Fatal(err) } w.Size += sz }