// Ensure that ceratain properties were generated correctly. func TestTable(t *testing.T) { tests := []tableTest{ tableTest{ rangetable.Merge( unicode.Lt, unicode.Nl, unicode.No, // Other letter digits unicode.Me, // Modifiers unicode.Zs, // Spaces unicode.So, // Symbols unicode.Pi, unicode.Pf, // Punctuation ), idDisOrFreePVal, }, tableTest{ rangetable.New(0x30000, 0x30101, 0xDFFFF), unassigned, }, } assigned := rangetable.Assigned(UnicodeVersion) for _, test := range tests { rangetable.Visit(test.rangeTable, func(r rune) { if !unicode.In(r, assigned) { return } b := make([]byte, 4) n := utf8.EncodeRune(b, r) trieval, _ := dpTrie.lookup(b[:n]) p := entry(trieval).property() if p != test.prop && !exceptions.Contains(r) { t.Errorf("%U: got %+x; want %+x", r, test.prop, p) } }) } }
func main() { gen.Init() versions := getVersions() w := &bytes.Buffer{} fmt.Fprintf(w, "//go:generate go run gen.go --versions=%s\n\n", strings.Join(versions, ",")) fmt.Fprintf(w, "import \"unicode\"\n\n") vstr := func(s string) string { return strings.Replace(s, ".", "_", -1) } fmt.Fprintf(w, "var assigned = map[string]*unicode.RangeTable{\n") for _, v := range versions { fmt.Fprintf(w, "\t%q: assigned%s,\n", v, vstr(v)) } fmt.Fprintf(w, "}\n\n") var size int for _, v := range versions { assigned := []rune{} r := gen.Open("http://www.unicode.org/Public/", "", v+"/ucd/UnicodeData.txt") ucd.Parse(r, func(p *ucd.Parser) { assigned = append(assigned, p.Rune(0)) }) rt := rangetable.New(assigned...) sz := int(reflect.TypeOf(unicode.RangeTable{}).Size()) sz += int(reflect.TypeOf(unicode.Range16{}).Size()) * len(rt.R16) sz += int(reflect.TypeOf(unicode.Range32{}).Size()) * len(rt.R32) fmt.Fprintf(w, "// size %d bytes (%d KiB)\n", sz, sz/1024) fmt.Fprintf(w, "var assigned%s = ", vstr(v)) print(w, rt) size += sz } fmt.Fprintf(w, "// Total size %d bytes (%d KiB)\n", size, size/1024) gen.WriteGoFile("tables.go", "rangetable", w.Bytes()) }
// Ensure that ceratain properties were generated correctly. func TestTable(t *testing.T) { tests := []tableTest{ tableTest{ rangetable.Merge( unicode.Lt, unicode.Nl, unicode.No, // Other letter digits unicode.Me, // Modifiers unicode.Zs, // Spaces unicode.So, // Symbols unicode.Pi, unicode.Pf, // Punctuation ), freePVal | idDis, }, tableTest{ rangetable.New(0x30000, 0x30101, 0xDFFFF), unassigned, }, } for _, test := range tests { test.run(t) } }
func main() { gen.Init() // Load data runes := []rune{} ucd.Parse(gen.OpenUCDFile("DerivedCoreProperties.txt"), func(p *ucd.Parser) { if p.String(1) == "Default_Ignorable_Code_Point" { runes = append(runes, p.Rune(0)) } }) ucd.Parse(gen.OpenUCDFile("HangulSyllableType.txt"), func(p *ucd.Parser) { if p.String(1) == "LVT" { runes = append(runes, p.Rune(0)) } }) disallowedRunes = rangetable.New(runes...) assigned = rangetable.Assigned(unicode.Version) writeTables() gen.Repackage("gen_trieval.go", "trieval.go", "precis") }
func main() { gen.Init() // Load data runes := []rune{} // PrecisIgnorableProperties: https://tools.ietf.org/html/rfc7564#section-9.13 ucd.Parse(gen.OpenUCDFile("DerivedCoreProperties.txt"), func(p *ucd.Parser) { if p.String(1) == "Default_Ignorable_Code_Point" { runes = append(runes, p.Rune(0)) } }) ucd.Parse(gen.OpenUCDFile("PropList.txt"), func(p *ucd.Parser) { switch p.String(1) { case "Noncharacter_Code_Point": runes = append(runes, p.Rune(0)) } }) // OldHangulJamo: https://tools.ietf.org/html/rfc5892#section-2.9 ucd.Parse(gen.OpenUCDFile("HangulSyllableType.txt"), func(p *ucd.Parser) { switch p.String(1) { case "L", "V", "T": runes = append(runes, p.Rune(0)) } }) disallowedRunes = rangetable.New(runes...) assigned = rangetable.Assigned(unicode.Version) // Load category data. runeCategory['l'] = latinSmallL ucd.Parse(gen.OpenUCDFile("UnicodeData.txt"), func(p *ucd.Parser) { const cccVirama = 9 if p.Int(ucd.CanonicalCombiningClass) == cccVirama { setCategory(p.Rune(0), viramaModifier) } }) ucd.Parse(gen.OpenUCDFile("Scripts.txt"), func(p *ucd.Parser) { switch p.String(1) { case "Greek": setCategory(p.Rune(0), greek) case "Hebrew": setCategory(p.Rune(0), hebrew) case "Hiragana", "Katakana", "Han": setCategory(p.Rune(0), japanese) } }) // Set the rule categories associated with exceptions. This overrides any // previously set categories. The original categories are manually // reintroduced in the categoryTransitions table. for r, e := range exceptions { if e.cat != 0 { runeCategory[r] = e.cat } } cat := map[string]category{ "L": joiningL, "D": joiningD, "T": joiningT, "R": joiningR, } ucd.Parse(gen.OpenUCDFile("extracted/DerivedJoiningType.txt"), func(p *ucd.Parser) { switch v := p.String(1); v { case "L", "D", "T", "R": setCategory(p.Rune(0), cat[v]) } }) writeTables() gen.Repackage("gen_trieval.go", "trieval.go", "precis") }
"bytes" "unicode" "unicode/utf8" "github.com/ReanGD/go-web-search/werrors" "golang.org/x/net/html" "golang.org/x/net/html/atom" "golang.org/x/text/unicode/rangetable" )

// notSeparatorRT is a range table containing the characters '&', '-', '@',
// '_', '+' and '\'', the ASCII digits, the lowercase ASCII letters and the
// lowercase Russian letters (including 'ё').
// NOTE(review): judging by the name, runes in this table are not treated as
// word separators — its use is not visible here; confirm against processText.
var notSeparatorRT = rangetable.New(
	'&', '-', '@', '_', '+', '\'',
	'0', '1', '2', '3', '4', '5', '6', '7', '8', '9',
	'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
	'а', 'б', 'в', 'г', 'д', 'е', 'ё', 'ж', 'з', 'и', 'й', 'к', 'л', 'м', 'н', 'о', 'п', 'р', 'с', 'т', 'у', 'ф', 'х', 'ц', 'ч', 'ш', 'щ', 'ъ', 'ы', 'ь', 'э', 'ю', 'я')

// minificationText is a stateless receiver type for the text-processing
// methods declared on it.
type minificationText struct {
}

// isDigit reports whether cur is one of '.', ',' or ':' and both of its
// neighbours, prev and next, are ASCII digits ('0'-'9'). Despite the name,
// cur itself is never a digit when the result is true: the check identifies a
// punctuation mark embedded in a number, as in "3.14" or "12:30".
func (m *minificationText) isDigit(prev, cur, next rune) bool {
	return (cur == '.' || cur == ',' || cur == ':') &&
		('0' <= prev && prev <= '9') &&
		('0' <= next && next <= '9')
}

func (m *minificationText) processText(in string) string {