func loadTestData() []Test { f := gen.OpenUnicodeFile("UCA", "", "CollationTest.zip") buffer, err := ioutil.ReadAll(f) f.Close() Error(err) archive, err := zip.NewReader(bytes.NewReader(buffer), int64(len(buffer))) Error(err) tests := []Test{} for _, f := range archive.File { // Skip the short versions, which are simply duplicates of the long versions. if strings.Contains(f.Name, "SHORT") || f.FileInfo().IsDir() { continue } ff, err := f.Open() Error(err) defer ff.Close() scanner := bufio.NewScanner(ff) test := Test{name: path.Base(f.Name)} for scanner.Scan() { line := scanner.Text() if len(line) <= 1 || line[0] == '#' { if m := versionRe.FindStringSubmatch(line); m != nil { if m[1] != gen.UnicodeVersion() { log.Printf("warning:%s: version is %s; want %s", f.Name, m[1], gen.UnicodeVersion()) } } continue } m := testRe.FindStringSubmatch(line) if m == nil || len(m) < 3 { log.Fatalf(`Failed to parse: "%s" result: %#v`, line, m) } str := []byte{} // In the regression test data (unpaired) surrogates are assigned a weight // corresponding to their code point value. However, utf8.DecodeRune, // which is used to compute the implicit weight, assigns FFFD to surrogates. // We therefore skip tests with surrogates. This skips about 35 entries // per test. valid := true for _, split := range strings.Split(m[1], " ") { r, err := strconv.ParseUint(split, 16, 64) Error(err) valid = valid && utf8.ValidRune(rune(r)) str = append(str, string(rune(r))...) } if valid { test.str = append(test.str, str) test.comment = append(test.comment, m[2]) } } if scanner.Err() != nil { log.Fatal(scanner.Err()) } tests = append(tests, test) } return tests }
// parseUCA parses a Default Unicode Collation Element Table of the format // specified in http://www.unicode.org/reports/tr10/#File_Format. // It returns the variable top. func parseUCA(builder *build.Builder) { r := gen.OpenUnicodeFile("UCA", "", "allkeys.txt") defer r.Close() input := bufio.NewReader(r) colelem := regexp.MustCompile(`\[([.*])([0-9A-F.]+)\]`) for i := 1; true; i++ { l, prefix, err := input.ReadLine() if err == io.EOF { break } Error(err) line := string(l) if prefix { log.Fatalf("%d: buffer overflow", i) } if len(line) == 0 || line[0] == '#' { continue } if line[0] == '@' { if strings.HasPrefix(line[1:], "version ") { if v := strings.Split(line[1:], " ")[1]; v != gen.UnicodeVersion() { log.Fatalf("incompatible version %s; want %s", v, gen.UnicodeVersion()) } } } else { // parse entries part := strings.Split(line, " ; ") if len(part) != 2 { log.Fatalf("%d: production rule without ';': %v", i, line) } lhs := []rune{} for _, v := range strings.Split(part[0], " ") { if v != "" { lhs = append(lhs, rune(convHex(i, v))) } } vars := []int{} rhs := [][]int{} for i, m := range colelem.FindAllStringSubmatch(part[1], -1) { if m[1] == "*" { vars = append(vars, i) } elem := []int{} for _, h := range strings.Split(m[2], ".") { elem = append(elem, convHex(i, h)) } rhs = append(rhs, elem) } builder.Add(lhs, rhs, vars) } } }
func TestTables(t *testing.T) { testtext.SkipIfNotLong(t) lookup := func(r rune) info { v, _ := trie.lookupString(string(r)) return info(v) } ucd.Parse(gen.OpenUnicodeFile("idna", "", "IdnaMappingTable.txt"), func(p *ucd.Parser) { r := p.Rune(0) x := lookup(r) if got, want := x.category(), catFromEntry(p); got != want { t.Errorf("%U:category: got %x; want %x", r, got, want) } mapped := false switch p.String(1) { case "mapped", "disallowed_STD3_mapped", "deviation": mapped = true } if x.isMapped() != mapped { t.Errorf("%U:isMapped: got %v; want %v", r, x.isMapped(), mapped) } if !mapped { return } want := string(p.Runes(2)) got := string(x.appendMapping(nil, string(r))) if got != want { t.Errorf("%U:mapping: got %+q; want %+q", r, got, want) } if x.isMapped() { return } wantMark := unicode.In(r, unicode.Mark) gotMark := x.isModifier() if gotMark != wantMark { t.Errorf("IsMark(%U) = %v; want %v", r, gotMark, wantMark) } }) ucd.Parse(gen.OpenUCDFile("UnicodeData.txt"), func(p *ucd.Parser) { r := p.Rune(0) x := lookup(r) got := x.isViramaModifier() const cccVirama = 9 want := p.Int(ucd.CanonicalCombiningClass) == cccVirama if got != want { t.Errorf("IsVirama(%U) = %v; want %v", r, got, want) } }) ucd.Parse(gen.OpenUCDFile("extracted/DerivedJoiningType.txt"), func(p *ucd.Parser) { r := p.Rune(0) x := lookup(r) if x.isMapped() { return } got := x.joinType() want := joinType[p.String(1)] if got != want { t.Errorf("JoinType(%U) = %x; want %x", r, got, want) } }) }
func TestConformance(t *testing.T) { testtext.SkipIfNotLong(t) r := gen.OpenUnicodeFile("idna", "", "IdnaTest.txt") defer r.Close() section := "main" started := false p := ucd.New(r, ucd.CommentHandler(func(s string) { if started { section = strings.ToLower(strings.Split(s, " ")[0]) } })) transitional := New(Transitional(true), VerifyDNSLength(true)) nonTransitional := New(VerifyDNSLength(true)) for p.Next() { started = true // What to test profiles := []*Profile{} switch p.String(0) { case "T": profiles = append(profiles, transitional) case "N": profiles = append(profiles, nonTransitional) case "B": profiles = append(profiles, transitional) profiles = append(profiles, nonTransitional) } src := unescape(p.String(1)) wantToUnicode := unescape(p.String(2)) if wantToUnicode == "" { wantToUnicode = src } wantToASCII := unescape(p.String(3)) if wantToASCII == "" { wantToASCII = wantToUnicode } wantErrToUnicode := "" if strings.HasPrefix(wantToUnicode, "[") { wantErrToUnicode = wantToUnicode wantToUnicode = "" } wantErrToASCII := "" if strings.HasPrefix(wantToASCII, "[") { wantErrToASCII = wantToASCII wantToASCII = "" } // TODO: also do IDNA tests. // invalidInIDNA2008 := p.String(4) == "NV8" for _, p := range profiles { name := fmt.Sprintf("%s:%s", section, p) doTest(t, p.ToUnicode, name+":ToUnicode", src, wantToUnicode, wantErrToUnicode) doTest(t, p.ToASCII, name+":ToASCII", src, wantToASCII, wantErrToASCII) } } }
func genTables() { t := triegen.NewTrie("idna") ucd.Parse(gen.OpenUCDFile("UnicodeData.txt"), func(p *ucd.Parser) { r := p.Rune(0) const cccVirama = 9 if p.Int(ucd.CanonicalCombiningClass) == cccVirama { runes[p.Rune(0)] = viramaModifier } switch { case unicode.In(r, unicode.Mark): runes[r] |= modifier } }) ucd.Parse(gen.OpenUCDFile("extracted/DerivedJoiningType.txt"), func(p *ucd.Parser) { switch v := p.String(1); v { case "L", "D", "T", "R": runes[p.Rune(0)] |= joinType[v] << joinShift } }) ucd.Parse(gen.OpenUnicodeFile("idna", "", "IdnaMappingTable.txt"), func(p *ucd.Parser) { r := p.Rune(0) // The mappings table explicitly defines surrogates as invalid. if !utf8.ValidRune(r) { return } cat := catFromEntry(p) isMapped := cat == mapped || cat == disallowedSTD3Mapped || cat == deviation if !isMapped { // Only include additional category information for non-mapped // runes. The additional information is only used after mapping and // the bits would clash with mapping information. // TODO: it would be possible to inline this data and avoid // additional lookups. This is quite tedious, though, so let's first // see if we need this. cat |= category(runes[r]) } s := string(p.Runes(2)) if s != "" && !isMapped { log.Fatalf("%U: Mapping with non-mapping category %d", r, cat) } t.Insert(r, uint64(makeEntry(r, s))+uint64(cat)) }) w := gen.NewCodeWriter() defer w.WriteGoFile("tables.go", "idna") gen.WriteUnicodeVersion(w) w.WriteVar("mappings", string(mappings)) w.WriteVar("xorData", string(xorData)) sz, err := t.Gen(w, triegen.Compact(&normCompacter{})) if err != nil { log.Fatal(err) } w.Size += sz }
func TestConformance(t *testing.T) { testtext.SkipIfNotLong(t) r := gen.OpenUnicodeFile("idna", "", "IdnaTest.txt") defer r.Close() section := "main" started := false p := ucd.New(r, ucd.CommentHandler(func(s string) { if started { section = strings.ToLower(strings.Split(s, " ")[0]) } })) for p.Next() { started = true // What to test profiles := []*Profile{} switch p.String(0) { case "T": profiles = append(profiles, Transitional) case "N": profiles = append(profiles, NonTransitional) case "B": profiles = append(profiles, Transitional) profiles = append(profiles, NonTransitional) } src := unescape(p.String(1)) if incorrectTests[src] { continue } wantToUnicode := unescape(p.String(2)) if wantToUnicode == "" { wantToUnicode = src } wantToASCII := unescape(p.String(3)) if wantToASCII == "" { wantToASCII = wantToUnicode } test := "err:" if strings.HasPrefix(wantToUnicode, "[") { test += strings.Replace(strings.Trim(wantToUnicode, "[]"), " ", "", -1) } if strings.HasPrefix(wantToASCII, "[") { test += strings.Replace(strings.Trim(wantToASCII, "[]"), " ", "", -1) } if test == "err:" { test = "ok" } // TODO: also do IDNA tests. // invalidInIDNA2008 := p.String(4) == "NV8" for _, p := range profiles { testtext.Run(t, fmt.Sprintf("%s:%s/%s/%+q", section, test, p, src), func(t *testing.T) { got, err := p.ToUnicode(src) wantErr := strings.HasPrefix(wantToUnicode, "[") gotErr := err != nil if wantErr { if gotErr != wantErr { t.Errorf(`ToUnicode:err got %v; want %v (%s)`, gotErr, wantErr, wantToUnicode) } } else if got != wantToUnicode || gotErr != wantErr { t.Errorf(`ToUnicode: got %+q, %v (%v); want %+q, %v`, got, gotErr, err, wantToUnicode, wantErr) } got, err = p.ToASCII(src) wantErr = strings.HasPrefix(wantToASCII, "[") gotErr = err != nil if wantErr { if gotErr != wantErr { t.Errorf(`ToASCII:err got %v; want %v (%s)`, gotErr, wantErr, wantToASCII) } } else if got != wantToASCII || gotErr != wantErr { t.Errorf(`ToASCII: got %+q, %v (%v); want %+q, %v`, got, gotErr, err, wantToASCII, wantErr) } }) } } }