func genTables() { chars := parseUCD() verifyProperties(chars) t := triegen.NewTrie("case") for i := range chars { c := &chars[i] makeEntry(c) t.Insert(rune(i), uint64(c.entry)) } w := gen.NewCodeWriter() defer w.WriteGoFile("tables.go", "cases") gen.WriteUnicodeVersion(w) // TODO: write CLDR version after adding a mechanism to detect that the // tables on which the manually created locale-sensitive casing code is // based hasn't changed. w.WriteVar("xorData", string(xorData)) w.WriteVar("exceptions", string(exceptionData)) sz, err := t.Gen(w, triegen.Compact(&sparseCompacter{})) if err != nil { log.Fatal(err) } w.Size += sz }
func main() { t := triegen.NewTrie("width") // wide is the base parse("EastAsianWidth.txt", func(p *ucd.Parser) { if contains(p.String(1), "W", "F") { t.Insert(p.Rune(0), widthTwo) } }) // zero overrides wide parse("extracted/DerivedGeneralCategory.txt", func(p *ucd.Parser) { cat := p.String(1) if cat == "Me" || cat == "Mn" { t.Insert(p.Rune(0), widthZero) } }) // misc overrides for _, v := range overrides { for r := v.from; r <= v.to; r++ { t.Insert(r, encodeWidth(v.width)) } } w := &bytes.Buffer{} gen.WriteUnicodeVersion(w) t.Gen(w) gen.WriteGoFile("tables.go", "runewidth", w.Bytes()) }
func genTables() { chars := parseUCD() verifyProperties(chars) t := triegen.NewTrie("case") for i := range chars { c := &chars[i] makeEntry(c) t.Insert(rune(i), uint64(c.entry)) } w := &bytes.Buffer{} sz, err := t.Gen(w, triegen.Compact(&sparseCompacter{})) if err != nil { log.Fatal(err) } gen.WriteUnicodeVersion(w) // TODO: write CLDR version after adding a mechanism to detect that the // tables on which the manually created locale-sensitive casing code is // based hasn't changed. fmt.Fprintf(w, "// xorData: %d bytes\n", len(xorData)) fmt.Fprintf(w, "var xorData = %+q\n\n", string(xorData)) fmt.Fprintf(w, "// exceptions: %d bytes\n", len(exceptionData)) fmt.Fprintf(w, "var exceptions = %q\n\n", string(exceptionData)) sz += len(exceptionData) fmt.Fprintf(w, "// Total table size %d bytes (%dKiB)\n", sz, sz/1024) gen.WriteGoFile("tables.go", "cases", w.Bytes()) }
// ExampleGen_build demonstrates the creation of multiple tries sharing common // blocks. ExampleGen_lookup demonstrates how to use the generated tries. func ExampleGen_build() { var tries []*triegen.Trie rv := runeValues() for _, c := range []struct { include func(rune) bool name string }{ {func(r rune) bool { return true }, "all"}, {func(r rune) bool { return r < 0x80 }, "ASCII only"}, {func(r rune) bool { return r < 0x80 }, "ASCII only 2"}, {func(r rune) bool { return r <= 0xFFFF }, "BMP only"}, {func(r rune) bool { return r > 0xFFFF }, "No BMP"}, } { t := triegen.NewTrie(c.name) tries = append(tries, t) for r, v := range rv { if c.include(r) { t.Insert(r, v) } } } sz, err := triegen.Gen(genWriter, "multi", tries) fmt.Printf("Trie size: %d bytes\n", sz) fmt.Printf("Error: %v\n", err) // Output: // Trie size: 18250 bytes // Error: <nil> }
func genTables() { chars := parseUCD() verifyProperties(chars) t := triegen.NewTrie("case") for i := range chars { c := &chars[i] makeEntry(c) t.Insert(rune(i), uint64(c.entry)) } const file = "tables.go" w, err := os.Create(file + ".tmp") if err != nil { logger.Fatal(err) } fmt.Fprintf(w, header, *url) sz, err := t.Gen(w, triegen.Compact(&sparseCompacter{})) if err != nil { logger.Fatal(err) } fmt.Fprintf(w, "// exceptions: %d bytes\n", len(exceptionData)) fmt.Fprintf(w, "var exceptions = %q\n\n", string(exceptionData)) sz += len(exceptionData) fmt.Fprintf(w, "// Total table size %d bytes (%dKiB)\n", sz, sz/1024) if err := os.Rename(file+".tmp", file); err != nil { logger.Fatalf("Rename to file %v failed.", file) } exec.Command("gofmt", "-w", file).Run() }
// Example_build shows how to build a simple trie. It assigns the value 1 to // 100 random runes generated by randomRunes. func Example_build() { t := triegen.NewTrie("rand") for r := range randomRunes() { t.Insert(r, 1) } sz, err := t.Gen(genWriter) fmt.Printf("Trie size: %d bytes\n", sz) fmt.Printf("Error: %v\n", err) // Output: // Trie size: 9280 bytes // Error: <nil> }
func writeTables() { propTrie := triegen.NewTrie("derivedProperties") w := gen.NewCodeWriter() defer w.WriteGoFile(*outputFile, "precis") gen.WriteUnicodeVersion(w) // Iterate over all the runes... for i := rune(0); i < unicode.MaxRune; i++ { r := rune(i) if !utf8.ValidRune(r) { continue } e, ok := exceptions[i] p := e.prop switch { case ok: case !unicode.In(r, assigned): p = unassigned case r >= 0x0021 && r <= 0x007e: // Is ASCII 7 p = pValid case unicode.In(r, disallowedRunes, unicode.Cc): p = disallowed case hasCompat(r): p = idDisOrFreePVal case isLetterDigits(r): p = pValid case isIdDisAndFreePVal(r): p = idDisOrFreePVal default: p = disallowed } cat := runeCategory[r] // Don't set category for runes that are disallowed. if p == disallowed { cat = exceptions[r].cat } propTrie.Insert(r, uint64(p)|uint64(cat)) } sz, err := propTrie.Gen(w) if err != nil { log.Fatal(err) } w.Size += sz }
func ExampleCompacter() { t := triegen.NewTrie("root") for r := rune(0); r < 10000; r += 64 { t.Insert(r, 0x9015BADA55^uint64(r)) } sz, _ := t.Gen(ioutil.Discard) fmt.Printf("Size normal: %5d\n", sz) var c myCompacter sz, _ = t.Gen(ioutil.Discard, triegen.Compact(&c)) fmt.Printf("Size compacted: %5d\n", sz) // Output: // Size normal: 81344 // Size compacted: 3224 }
func writeTables() { propTrie := triegen.NewTrie("derivedProperties") w := gen.NewCodeWriter() defer w.WriteGoFile(*outputFile, "precis") gen.WriteUnicodeVersion(w) // Iterate over all the runes... for i := uint32(0); i < unicode.MaxRune; i++ { r := rune(i) if !utf8.ValidRune(r) { continue } p, ok := exceptions[i] switch { case ok: case !unicode.In(r, assigned): p = unassigned case r >= 33 && r <= 126: // Is ASCII 7 p = pValid case r == 0x200C || r == 0x200D: // Is join control p = contextJ case unicode.In(r, disallowedRunes, unicode.Cc): p = disallowed case isHasCompat(r): p = idDis | freePVal case isLetterDigits(r): p = pValid case isIdDisAndFreePVal(r): p = idDis | freePVal default: p = disallowed } propTrie.Insert(r, uint64(p)) } sz, err := propTrie.Gen(w) if err != nil { log.Fatal(err) } w.Size += sz }
func genTables() { t := triegen.NewTrie("width") // fold and inverse mappings. See mapComment for a description of the format // of each entry. Add dummy value to make an index of 0 mean no mapping. inverse := [][4]byte{{}} mapping := map[[4]byte]int{[4]byte{}: 0} getWidthData(func(r rune, tag elem, alt rune) { idx := 0 if alt != 0 { var buf [4]byte buf[0] = byte(utf8.EncodeRune(buf[1:], alt)) s := string(r) buf[buf[0]] ^= s[len(s)-1] var ok bool if idx, ok = mapping[buf]; !ok { idx = len(mapping) if idx > math.MaxUint8 { log.Fatalf("Index %d does not fit in a byte.", idx) } mapping[buf] = idx inverse = append(inverse, buf) } } t.Insert(r, uint64(tag|elem(idx))) }) w := &bytes.Buffer{} gen.WriteUnicodeVersion(w) sz, err := t.Gen(w) if err != nil { log.Fatal(err) } sz += writeMappings(w, inverse) fmt.Fprintf(w, "// Total table size %d bytes (%dKiB)\n", sz, sz/1024) gen.WriteGoFile(*outputFile, "width", w.Bytes()) }
func genTables() { if numClass > 0x0F { log.Fatalf("Too many Class constants (%#x > 0x0F).", numClass) } w := gen.NewCodeWriter() defer w.WriteGoFile(*outputFile, "bidi") gen.WriteUnicodeVersion(w) t := triegen.NewTrie("bidi") // Build data about bracket mapping. These bits need to be or-ed with // any other bits. orMask := map[rune]uint64{} xorMap := map[rune]int{} xorMasks := []rune{0} // First value is no-op. parse("BidiBrackets.txt", func(p *ucd.Parser) { r1 := p.Rune(0) r2 := p.Rune(1) xor := r1 ^ r2 if _, ok := xorMap[xor]; !ok { xorMap[xor] = len(xorMasks) xorMasks = append(xorMasks, xor) } entry := uint64(xorMap[xor]) << xorMaskShift switch p.String(2) { case "o": entry |= openMask case "c", "n": default: log.Fatalf("Unknown bracket class %q.", p.String(2)) } orMask[r1] = entry }) w.WriteComment(` xorMasks contains masks to be xor-ed with brackets to get the reverse version.`) w.WriteVar("xorMasks", xorMasks) done := map[rune]bool{} insert := func(r rune, c class) { if !done[r] { t.Insert(r, orMask[r]|uint64(c)) done[r] = true } } // Insert the derived BiDi properties. parse("extracted/DerivedBidiClass.txt", func(p *ucd.Parser) { r := p.Rune(0) class, ok := bidiClass[p.String(1)] if !ok { log.Fatalf("%U: Unknown BiDi class %q", r, p.String(1)) } insert(r, class) }) visitDefaults(insert) // TODO: use sparse blocks. This would reduce table size considerably // from the looks of it. sz, err := t.Gen(w) if err != nil { log.Fatal(err) } w.Size += sz }
// printCharInfoTables writes to w a const block of decomposition offsets, the
// "decomps" byte table and one trie per form type ("nfc" and "nfkc"). It
// returns the sum of the length of the decomps table and the sizes reported by
// the trie generator.
func printCharInfoTables(w io.Writer) int {
	// mkstr encodes the expanded decomposition of r for form f and returns the
	// index of the decomposition group it belongs to together with the encoded
	// string. The encoding is:
	//   - a header byte: the byte length of the decomposition in the low 6
	//     bits, 0x40 set if quickCheck[MComposed] != QCYes, and 0x80 set if f
	//     combines forward;
	//   - the UTF-8 bytes of the decomposition;
	//   - if the leading or trailing ccc or nTrailingNonStarters is non-zero,
	//     a byte holding tccc<<2 | nTrail;
	//   - if the leading ccc is non-zero, a further byte holding lccc.
	// It terminates the program on inputs it cannot encode.
	mkstr := func(r rune, f *FormInfo) (int, string) {
		d := f.expandedDecomp
		s := string([]rune(d))
		// The length must fit in the 6 low bits of the header byte.
		if max := 1 << 6; len(s) >= max {
			const msg = "%U: too many bytes in decomposition: %d >= %d"
			log.Fatalf(msg, r, len(s), max)
		}
		head := uint8(len(s))
		if f.quickCheck[MComposed] != QCYes {
			head |= 0x40
		}
		if f.combinesForward {
			head |= 0x80
		}
		s = string([]byte{head}) + s

		// lccc and tccc are the ccc of the first and last rune of the
		// decomposition; cc is the ccc of r itself.
		lccc := ccc(d[0])
		tccc := ccc(d[len(d)-1])
		cc := ccc(r)
		if cc != 0 && lccc == 0 && tccc == 0 {
			log.Fatalf("%U: trailing and leading ccc are 0 for non-zero ccc %d", r, cc)
		}
		if tccc < lccc && lccc != 0 {
			const msg = "%U: lccc (%d) must be <= tcc (%d)"
			log.Fatalf(msg, r, lccc, tccc)
		}
		index := normalDecomp
		nTrail := chars[r].nTrailingNonStarters
		if tccc > 0 || lccc > 0 || nTrail > 0 {
			// Pack the trailing ccc together with nTrail into a single byte.
			tccc <<= 2
			tccc |= nTrail
			s += string([]byte{tccc})
			index = endMulti
			// A rune with ccc 0 after the first position selects firstCCC.
			for _, r := range d[1:] {
				if ccc(r) == 0 {
					index = firstCCC
				}
			}
			if lccc > 0 {
				s += string([]byte{lccc})
				if index == firstCCC {
					log.Fatalf("%U: multi-segment decomposition not supported for decompositions with leading CCC != 0", r)
				}
				index = firstLeadingCCC
			}
			if cc != lccc {
				if cc != 0 {
					log.Fatalf("%U: for lccc != ccc, expected ccc to be 0; was %d", r, cc)
				}
				index = firstCCCZeroExcept
			}
		} else if len(d) > 1 {
			index = firstMulti
		}
		return index, s
	}

	decompSet := makeDecompSet()
	const nLeadStr = "\x00\x01" // 0-byte length and tccc with nTrail.
	decompSet.insert(firstStarterWithNLead, nLeadStr)

	// Store the uniqued decompositions in a byte buffer,
	// preceded by their byte length.
	for _, c := range chars {
		for _, f := range c.forms {
			if len(f.expandedDecomp) == 0 {
				continue
			}
			if f.combinesBackward {
				log.Fatalf("%U: combinesBackward and decompose", c.codePoint)
			}
			index, s := mkstr(c.codePoint, &f)
			decompSet.insert(index, s)
		}
	}

	decompositions := bytes.NewBuffer(make([]byte, 0, 10000))
	size := 0
	// positionMap records the offset in decompositions at which each encoded
	// string starts.
	positionMap := make(map[string]uint16)
	// A single zero byte precedes all entries, so the first entry starts at
	// offset 1.
	decompositions.WriteString("\000")
	fmt.Fprintln(w, "const (")
	for i, m := range decompSet {
		// Write the strings of each group in sorted order.
		sa := []string{}
		for s := range m {
			sa = append(sa, s)
		}
		sort.Strings(sa)
		for _, s := range sa {
			p := decompositions.Len()
			decompositions.WriteString(s)
			positionMap[s] = uint16(p)
		}
		// A named group gets a constant set to the buffer length after the
		// group has been written, i.e. the offset just past its last entry.
		if cname[i] != "" {
			fmt.Fprintf(w, "%s = 0x%X\n", cname[i], decompositions.Len())
		}
	}
	fmt.Fprintln(w, "maxDecomp = 0x8000")
	fmt.Fprintln(w, ")")
	b := decompositions.Bytes()
	printBytes(w, b, "decomps")
	size += len(b)

	varnames := []string{"nfc", "nfkc"}
	for i := 0; i < FNumberOfFormTypes; i++ {
		trie := triegen.NewTrie(varnames[i])

		// A trie value is either the offset of the character's encoded
		// decomposition in decomps, the offset of nLeadStr, or 0x8000 or-ed
		// with makeEntry(&f, &c)<<8 | ccc for characters without a
		// decomposition. Characters for which that last value is 0 are not
		// inserted.
		for r, c := range chars {
			f := c.forms[i]
			d := f.expandedDecomp
			if len(d) != 0 {
				_, key := mkstr(c.codePoint, &f)
				trie.Insert(rune(r), uint64(positionMap[key]))
				if c.ccc != ccc(d[0]) {
					// We assume the lead ccc of a decomposition !=0 in this case.
					if ccc(d[0]) == 0 {
						log.Fatalf("Expected leading CCC to be non-zero; ccc is %d", c.ccc)
					}
				}
			} else if c.nLeadingNonStarters > 0 && len(f.expandedDecomp) == 0 && c.ccc == 0 && !f.combinesBackward {
				// Handle cases where it can't be detected that the nLead should be equal
				// to nTrail.
				trie.Insert(c.codePoint, uint64(positionMap[nLeadStr]))
			} else if v := makeEntry(&f, &c)<<8 | uint16(c.ccc); v != 0 {
				trie.Insert(c.codePoint, uint64(0x8000|v))
			}
		}
		sz, err := trie.Gen(w, triegen.Compact(&normCompacter{name: varnames[i]}))
		if err != nil {
			log.Fatal(err)
		}
		size += sz
	}
	return size
}
func genTables() { t := triegen.NewTrie("idna") ucd.Parse(gen.OpenUCDFile("UnicodeData.txt"), func(p *ucd.Parser) { r := p.Rune(0) const cccVirama = 9 if p.Int(ucd.CanonicalCombiningClass) == cccVirama { runes[p.Rune(0)] = viramaModifier } switch { case unicode.In(r, unicode.Mark): runes[r] |= modifier } }) ucd.Parse(gen.OpenUCDFile("extracted/DerivedJoiningType.txt"), func(p *ucd.Parser) { switch v := p.String(1); v { case "L", "D", "T", "R": runes[p.Rune(0)] |= joinType[v] << joinShift } }) ucd.Parse(gen.OpenUnicodeFile("idna", "", "IdnaMappingTable.txt"), func(p *ucd.Parser) { r := p.Rune(0) // The mappings table explicitly defines surrogates as invalid. if !utf8.ValidRune(r) { return } cat := catFromEntry(p) isMapped := cat == mapped || cat == disallowedSTD3Mapped || cat == deviation if !isMapped { // Only include additional category information for non-mapped // runes. The additional information is only used after mapping and // the bits would clash with mapping information. // TODO: it would be possible to inline this data and avoid // additional lookups. This is quite tedious, though, so let's first // see if we need this. cat |= category(runes[r]) } s := string(p.Runes(2)) if s != "" && !isMapped { log.Fatalf("%U: Mapping with non-mapping category %d", r, cat) } t.Insert(r, uint64(makeEntry(r, s))+uint64(cat)) }) w := gen.NewCodeWriter() defer w.WriteGoFile("tables.go", "idna") gen.WriteUnicodeVersion(w) w.WriteVar("mappings", string(mappings)) w.WriteVar("xorData", string(xorData)) sz, err := t.Gen(w, triegen.Compact(&normCompacter{})) if err != nil { log.Fatal(err) } w.Size += sz }