Beispiel #1
0
// loadTestData reads the CollationTest.zip archive from the UCA data
// directory and returns one Test per archive member, skipping directories and
// the SHORT variants. Every error result is handed to Error.
func loadTestData() []Test {
	f := gen.OpenUnicodeFile("UCA", "", "CollationTest.zip")
	buffer, err := ioutil.ReadAll(f)
	f.Close()
	Error(err)
	// zip.NewReader needs an io.ReaderAt, hence the in-memory copy above.
	archive, err := zip.NewReader(bytes.NewReader(buffer), int64(len(buffer)))
	Error(err)
	tests := make([]Test, 0, len(archive.File))
	for _, zf := range archive.File {
		// Skip the short versions, which are simply duplicates of the long versions.
		if strings.Contains(zf.Name, "SHORT") || zf.FileInfo().IsDir() {
			continue
		}
		tests = append(tests, loadTestFile(zf))
	}
	return tests
}

// loadTestFile parses one member of the collation test archive into a Test
// named after the member's base name. Each data line contributes the UTF-8
// encoding of its code points to test.str and the text captured by the second
// group of testRe to test.comment. A line that testRe cannot parse, or a
// scanner error, terminates the program.
func loadTestFile(f *zip.File) Test {
	ff, err := f.Open()
	Error(err)
	// Deferring the Close in this helper, rather than in the caller's loop,
	// releases each member as soon as it has been parsed instead of keeping
	// every member open until the whole archive has been processed.
	defer ff.Close()

	test := Test{name: path.Base(f.Name)}
	scanner := bufio.NewScanner(ff)
	for scanner.Scan() {
		line := scanner.Text()
		if len(line) <= 1 || line[0] == '#' {
			// Blank and comment lines carry no test data, but a comment may
			// state the data version; warn when it differs from ours.
			if m := versionRe.FindStringSubmatch(line); m != nil && m[1] != gen.UnicodeVersion() {
				log.Printf("warning:%s: version is %s; want %s", f.Name, m[1], gen.UnicodeVersion())
			}
			continue
		}
		// len of a nil slice is 0, so this also covers "no match".
		m := testRe.FindStringSubmatch(line)
		if len(m) < 3 {
			log.Fatalf(`Failed to parse: "%s" result: %#v`, line, m)
		}
		// In the regression test data (unpaired) surrogates are assigned a weight
		// corresponding to their code point value.  However, utf8.DecodeRune,
		// which is used to compute the implicit weight, assigns FFFD to surrogates.
		// We therefore skip tests with surrogates.  This skips about 35 entries
		// per test.
		var str []byte
		valid := true
		for _, split := range strings.Split(m[1], " ") {
			r, err := strconv.ParseUint(split, 16, 64)
			Error(err)
			valid = valid && utf8.ValidRune(rune(r))
			str = append(str, string(rune(r))...)
		}
		if valid {
			test.str = append(test.str, str)
			test.comment = append(test.comment, m[2])
		}
	}
	if err := scanner.Err(); err != nil {
		log.Fatal(err)
	}
	return test
}
Beispiel #2
0
// parseUCA parses a Default Unicode Collation Element Table of the format
// specified in http://www.unicode.org/reports/tr10/#File_Format and adds each
// entry it contains to builder.
//
// Empty lines and lines starting with '#' are skipped. Of the '@' directives
// only "@version" is inspected: a version other than gen.UnicodeVersion()
// terminates the program. Every other line must have the form
// "<hex code points> ; <collation elements>"; malformed lines terminate the
// program with the offending line number.
func parseUCA(builder *build.Builder) {
	r := gen.OpenUnicodeFile("UCA", "", "allkeys.txt")
	defer r.Close()
	input := bufio.NewReader(r)
	colelem := regexp.MustCompile(`\[([.*])([0-9A-F.]+)\]`)
	// i is the 1-based number of the line being processed. It is used in
	// diagnostics and must not be shadowed by the inner loops.
	for i := 1; ; i++ {
		l, prefix, err := input.ReadLine()
		if err == io.EOF {
			break
		}
		Error(err)
		line := string(l)
		if prefix {
			// ReadLine could not fit the whole line into its buffer.
			log.Fatalf("%d: buffer overflow", i)
		}
		if len(line) == 0 || line[0] == '#' {
			continue
		}
		if line[0] == '@' {
			if strings.HasPrefix(line[1:], "version ") {
				if v := strings.Split(line[1:], " ")[1]; v != gen.UnicodeVersion() {
					log.Fatalf("incompatible version %s; want %s", v, gen.UnicodeVersion())
				}
			}
			continue
		}

		// parse entries
		part := strings.Split(line, " ; ")
		if len(part) != 2 {
			log.Fatalf("%d: production rule without ';': %v", i, line)
		}
		lhs := []rune{}
		for _, v := range strings.Split(part[0], " ") {
			if v != "" {
				lhs = append(lhs, rune(convHex(i, v)))
			}
		}
		// vars collects the indices (within rhs) of the elements marked with
		// '*' in the table.
		vars := []int{}
		rhs := [][]int{}
		for n, m := range colelem.FindAllStringSubmatch(part[1], -1) {
			if m[1] == "*" {
				vars = append(vars, n)
			}
			elem := []int{}
			for _, h := range strings.Split(m[2], ".") {
				// Pass the line number, consistent with the lhs conversion
				// above, rather than the element index.
				elem = append(elem, convHex(i, h))
			}
			rhs = append(rhs, elem)
		}
		// NOTE(review): any result returned by Add is discarded here, as in
		// the original code — confirm whether it can report an error.
		builder.Add(lhs, rhs, vars)
	}
}
Beispiel #3
0
// TestTables compares the entries of the generated trie with the Unicode data
// files: category and mapping information with IdnaMappingTable.txt, the
// virama flag with UnicodeData.txt and the joining type with
// extracted/DerivedJoiningType.txt. The test is gated by
// testtext.SkipIfNotLong.
func TestTables(t *testing.T) {
	testtext.SkipIfNotLong(t)

	// Canonical combining class value that identifies a virama.
	const cccVirama = 9

	// entryFor fetches the trie value stored for a single rune.
	entryFor := func(r rune) info {
		v, _ := trie.lookupString(string(r))
		return info(v)
	}

	// checkMapping validates one IdnaMappingTable.txt entry.
	checkMapping := func(p *ucd.Parser) {
		r := p.Rune(0)
		entry := entryFor(r)
		if got, want := entry.category(), catFromEntry(p); got != want {
			t.Errorf("%U:category: got %x; want %x", r, got, want)
		}

		status := p.String(1)
		wantMapped := status == "mapped" ||
			status == "disallowed_STD3_mapped" ||
			status == "deviation"
		if entry.isMapped() != wantMapped {
			t.Errorf("%U:isMapped: got %v; want %v", r, entry.isMapped(), wantMapped)
		}
		if !wantMapped {
			return
		}

		wantStr := string(p.Runes(2))
		gotStr := string(entry.appendMapping(nil, string(r)))
		if gotStr != wantStr {
			t.Errorf("%U:mapping: got %+q; want %+q", r, gotStr, wantStr)
		}

		// NOTE(review): at this point the table says the rune is mapped, so
		// the mark check below is only reached when the isMapped mismatch
		// above has already been reported. Confirm whether the check was
		// meant to run for non-mapped runes instead.
		if entry.isMapped() {
			return
		}
		wantMark := unicode.In(r, unicode.Mark)
		gotMark := entry.isModifier()
		if gotMark != wantMark {
			t.Errorf("IsMark(%U) = %v; want %v", r, gotMark, wantMark)
		}
	}

	// checkVirama validates the virama flag of one UnicodeData.txt entry.
	checkVirama := func(p *ucd.Parser) {
		r := p.Rune(0)
		got := entryFor(r).isViramaModifier()
		want := p.Int(ucd.CanonicalCombiningClass) == cccVirama
		if got != want {
			t.Errorf("IsVirama(%U) = %v; want %v", r, got, want)
		}
	}

	// checkJoinType validates the joining type of one DerivedJoiningType.txt
	// entry; mapped runes are not checked.
	checkJoinType := func(p *ucd.Parser) {
		r := p.Rune(0)
		entry := entryFor(r)
		if entry.isMapped() {
			return
		}
		got, want := entry.joinType(), joinType[p.String(1)]
		if got != want {
			t.Errorf("JoinType(%U) = %x; want %x", r, got, want)
		}
	}

	ucd.Parse(gen.OpenUnicodeFile("idna", "", "IdnaMappingTable.txt"), checkMapping)
	ucd.Parse(gen.OpenUCDFile("UnicodeData.txt"), checkVirama)
	ucd.Parse(gen.OpenUCDFile("extracted/DerivedJoiningType.txt"), checkJoinType)
}
Beispiel #4
0
// TestConformance feeds every entry of IdnaTest.txt through ToUnicode and
// ToASCII of the profile(s) selected by the entry's first field ("T", "N" or
// "B" for both). The test is gated by testtext.SkipIfNotLong.
func TestConformance(t *testing.T) {
	testtext.SkipIfNotLong(t)

	r := gen.OpenUnicodeFile("idna", "", "IdnaTest.txt")
	defer r.Close()

	// The section label used in test names is the lower-cased first word of
	// the most recent comment; comments seen before the first entry has been
	// read leave the initial "main" label in place.
	section, started := "main", false
	onComment := func(s string) {
		if !started {
			return
		}
		section = strings.ToLower(strings.Split(s, " ")[0])
	}
	p := ucd.New(r, ucd.CommentHandler(onComment))

	transitional := New(Transitional(true), VerifyDNSLength(true))
	nonTransitional := New(VerifyDNSLength(true))

	// splitExpectation separates an expected field into its value and its
	// error description: a field starting with "[" is treated as an error
	// description with no expected value.
	splitExpectation := func(field string) (want, wantErr string) {
		if strings.HasPrefix(field, "[") {
			return "", field
		}
		return field, ""
	}

	for p.Next() {
		started = true

		// What to test
		var profiles []*Profile
		switch p.String(0) {
		case "T":
			profiles = []*Profile{transitional}
		case "N":
			profiles = []*Profile{nonTransitional}
		case "B":
			profiles = []*Profile{transitional, nonTransitional}
		}

		src := unescape(p.String(1))

		// An empty ToUnicode field falls back to the source, and an empty
		// ToASCII field falls back to the (raw) ToUnicode field.
		toUnicodeField := unescape(p.String(2))
		if toUnicodeField == "" {
			toUnicodeField = src
		}
		toASCIIField := unescape(p.String(3))
		if toASCIIField == "" {
			toASCIIField = toUnicodeField
		}
		wantToUnicode, wantErrToUnicode := splitExpectation(toUnicodeField)
		wantToASCII, wantErrToASCII := splitExpectation(toASCIIField)

		// TODO: also do IDNA tests.
		// invalidInIDNA2008 := p.String(4) == "NV8"

		for _, profile := range profiles {
			name := fmt.Sprintf("%s:%s", section, profile)
			doTest(t, profile.ToUnicode, name+":ToUnicode", src, wantToUnicode, wantErrToUnicode)
			doTest(t, profile.ToASCII, name+":ToASCII", src, wantToASCII, wantErrToASCII)
		}
	}
}
Beispiel #5
0
// genTables builds the "idna" trie from UnicodeData.txt,
// extracted/DerivedJoiningType.txt and IdnaMappingTable.txt and writes it,
// together with the mappings and xorData variables and the Unicode version,
// to tables.go.
func genTables() {
	t := triegen.NewTrie("idna")

	// Pass 1: record virama and mark attributes per rune.
	ucd.Parse(gen.OpenUCDFile("UnicodeData.txt"), func(p *ucd.Parser) {
		const cccVirama = 9

		r := p.Rune(0)
		if p.Int(ucd.CanonicalCombiningClass) == cccVirama {
			runes[r] = viramaModifier
		}
		if unicode.In(r, unicode.Mark) {
			runes[r] |= modifier
		}
	})

	// Pass 2: add the joining type for the L, D, T and R classes.
	ucd.Parse(gen.OpenUCDFile("extracted/DerivedJoiningType.txt"), func(p *ucd.Parser) {
		jt := p.String(1)
		if jt == "L" || jt == "D" || jt == "T" || jt == "R" {
			runes[p.Rune(0)] |= joinType[jt] << joinShift
		}
	})

	// Pass 3: combine category, attributes and mapping into the trie value.
	ucd.Parse(gen.OpenUnicodeFile("idna", "", "IdnaMappingTable.txt"), func(p *ucd.Parser) {
		r := p.Rune(0)

		// The mappings table explicitly defines surrogates as invalid.
		if !utf8.ValidRune(r) {
			return
		}

		cat := catFromEntry(p)
		isMapped := false
		switch cat {
		case mapped, disallowedSTD3Mapped, deviation:
			isMapped = true
		}
		if !isMapped {
			// Only include additional category information for non-mapped
			// runes. The additional information is only used after mapping and
			// the bits would clash with mapping information.
			// TODO: it would be possible to inline this data and avoid
			// additional lookups. This is quite tedious, though, so let's first
			// see if we need this.
			cat |= category(runes[r])
		}

		mapping := string(p.Runes(2))
		if !isMapped && mapping != "" {
			log.Fatalf("%U: Mapping with non-mapping category %d", r, cat)
		}
		t.Insert(r, uint64(makeEntry(r, mapping))+uint64(cat))
	})

	w := gen.NewCodeWriter()
	// The file is written when genTables returns normally; the log.Fatal
	// below exits the process without running this deferred call.
	defer w.WriteGoFile("tables.go", "idna")

	gen.WriteUnicodeVersion(w)

	w.WriteVar("mappings", string(mappings))
	w.WriteVar("xorData", string(xorData))

	size, err := t.Gen(w, triegen.Compact(&normCompacter{}))
	if err != nil {
		log.Fatal(err)
	}
	w.Size += size
}
Beispiel #6
0
// TestConformance runs every entry of IdnaTest.txt, except those listed in
// incorrectTests, as a subtest against the profile(s) selected by the entry's
// first field ("T", "N" or "B" for both). Each subtest checks both ToUnicode
// and ToASCII. The test is gated by testtext.SkipIfNotLong.
func TestConformance(t *testing.T) {
	testtext.SkipIfNotLong(t)

	r := gen.OpenUnicodeFile("idna", "", "IdnaTest.txt")
	defer r.Close()

	// The section label used in subtest names is the lower-cased first word
	// of the most recent comment; comments seen before the first entry has
	// been read leave the initial "main" label in place.
	section, started := "main", false
	onComment := func(s string) {
		if !started {
			return
		}
		section = strings.ToLower(strings.Split(s, " ")[0])
	}
	p := ucd.New(r, ucd.CommentHandler(onComment))

	// errTag turns an expected-error field such as "[A B]" into "AB" and
	// yields "" for a field that does not start with "[".
	errTag := func(field string) string {
		if !strings.HasPrefix(field, "[") {
			return ""
		}
		return strings.Replace(strings.Trim(field, "[]"), " ", "", -1)
	}

	for p.Next() {
		started = true

		// What to test
		var profiles []*Profile
		switch p.String(0) {
		case "T":
			profiles = []*Profile{Transitional}
		case "N":
			profiles = []*Profile{NonTransitional}
		case "B":
			profiles = []*Profile{Transitional, NonTransitional}
		}

		src := unescape(p.String(1))
		if incorrectTests[src] {
			continue
		}

		// An empty ToUnicode field falls back to the source, and an empty
		// ToASCII field falls back to the ToUnicode field.
		wantToUnicode := unescape(p.String(2))
		if wantToUnicode == "" {
			wantToUnicode = src
		}
		wantToASCII := unescape(p.String(3))
		if wantToASCII == "" {
			wantToASCII = wantToUnicode
		}

		test := "err:" + errTag(wantToUnicode) + errTag(wantToASCII)
		if test == "err:" {
			test = "ok"
		}

		// TODO: also do IDNA tests.
		// invalidInIDNA2008 := p.String(4) == "NV8"

		for _, profile := range profiles {
			name := fmt.Sprintf("%s:%s/%s/%+q", section, test, profile, src)
			testtext.Run(t, name, func(t *testing.T) {
				// When an error is expected only its presence is verified;
				// otherwise both the result and the absence of an error are.
				got, err := profile.ToUnicode(src)
				wantErr := strings.HasPrefix(wantToUnicode, "[")
				gotErr := err != nil
				switch {
				case wantErr:
					if !gotErr {
						t.Errorf(`ToUnicode:err got %v; want %v (%s)`,
							gotErr, wantErr, wantToUnicode)
					}
				case gotErr || got != wantToUnicode:
					t.Errorf(`ToUnicode: got %+q, %v (%v); want %+q, %v`,
						got, gotErr, err, wantToUnicode, wantErr)
				}

				got, err = profile.ToASCII(src)
				wantErr = strings.HasPrefix(wantToASCII, "[")
				gotErr = err != nil
				switch {
				case wantErr:
					if !gotErr {
						t.Errorf(`ToASCII:err got %v; want %v (%s)`,
							gotErr, wantErr, wantToASCII)
					}
				case gotErr || got != wantToASCII:
					t.Errorf(`ToASCII: got %+q, %v (%v); want %+q, %v`,
						got, gotErr, err, wantToASCII, wantErr)
				}
			})
		}
	}
}