// getWidthData calls f for every entry for which it is defined.
//
// f may be called multiple times for the same rune. The last call to f is the
// correct value. f is not called for all runes. The default tag type is
// Neutral.
func getWidthData(f func(r rune, tag elem, alt rune)) {
	// Set the default values for Unified Ideographs. In line with Annex 11,
	// we encode full ranges instead of the defined runes in Unified_Ideograph.
	for _, b := range []struct{ lo, hi rune }{
		{0x4E00, 0x9FFF},   // the CJK Unified Ideographs block,
		{0x3400, 0x4DBF},   // the CJK Unified Ideographs Externsion A block,
		{0xF900, 0xFAFF},   // the CJK Compatibility Ideographs block,
		{0x20000, 0x2FFFF}, // the Supplementary Ideographic Plane,
		{0x30000, 0x3FFFF}, // the Tertiary Ideographic Plane,
	} {
		for r := b.lo; r <= b.hi; r++ {
			f(r, tagWide, 0)
		}
	}

	inverse := map[rune]rune{}
	maps := map[string]bool{
		"<wide>":   true,
		"<narrow>": true,
	}

	// We cannot reuse package norm's decomposition, as we need an unexpanded
	// decomposition. We make use of the opportunity to verify that the
	// decomposition type is as expected.
	ucd.Parse(gen.OpenUCDFile("UnicodeData.txt"), func(p *ucd.Parser) {
		r := p.Rune(0)
		s := strings.SplitN(p.String(ucd.DecompMapping), " ", 2)
		if !maps[s[0]] {
			return
		}
		x, err := strconv.ParseUint(s[1], 16, 32)
		if err != nil {
			log.Fatalf("Error parsing rune %q", s[1])
		}
		if inverse[r] != 0 || inverse[rune(x)] != 0 {
			log.Fatalf("Circular dependency in mapping between %U and %U", r, x)
		}
		inverse[r] = rune(x)
		inverse[rune(x)] = r
	})

	// <rune range>;<type>
	ucd.Parse(gen.OpenUCDFile("EastAsianWidth.txt"), func(p *ucd.Parser) {
		tag, ok := typeMap[p.String(1)]
		if !ok {
			log.Fatalf("Unknown width type %q", p.String(1))
		}
		r := p.Rune(0)
		alt, ok := inverse[r]
		if tag == tagFullwidth || tag == tagHalfwidth && r != wonSign {
			tag |= tagNeedsFold
			if !ok {
				log.Fatalf("Narrow or wide rune %U has no decomposition", r)
			}
		}
		f(r, tag, alt)
	})
}
Пример #2
0
func TestName(t *testing.T) {
	testtext.SkipIfNotLong(t)

	wants := make([]string, 1+unicode.MaxRune)
	ucd.Parse(gen.OpenUCDFile("UnicodeData.txt"), func(p *ucd.Parser) {
		r, s := p.Rune(0), p.String(ucd.Name)
		if s == "" {
			return
		}
		if s[0] == '<' {
			const first = ", First>"
			if i := strings.Index(s, first); i >= 0 {
				s = s[:i] + ">"
			}
		}
		wants[r] = s
	})

	nErrors := 0
	for r, want := range wants {
		got := Name(rune(r))
		if got != want {
			t.Errorf("r=%#08x: got %q, want %q", r, got, want)
			nErrors++
			if nErrors == 100 {
				t.Fatal("too many errors")
			}
		}
	}
}
Пример #3
0
// These tables are hand-extracted from:
// http://www.unicode.org/Public/8.0.0/ucd/extracted/DerivedBidiClass.txt
func visitDefaults(fn func(r rune, c Class)) {
	// first write default values for ranges listed above.
	visitRunes(fn, AL, []rune{
		0x0600, 0x07BF, // Arabic
		0x08A0, 0x08FF, // Arabic Extended-A
		0xFB50, 0xFDCF, // Arabic Presentation Forms
		0xFDF0, 0xFDFF,
		0xFE70, 0xFEFF,
		0x0001EE00, 0x0001EEFF, // Arabic Mathematical Alpha Symbols
	})
	visitRunes(fn, R, []rune{
		0x0590, 0x05FF, // Hebrew
		0x07C0, 0x089F, // Nko et al.
		0xFB1D, 0xFB4F,
		0x00010800, 0x00010FFF, // Cypriot Syllabary et. al.
		0x0001E800, 0x0001EDFF,
		0x0001EF00, 0x0001EFFF,
	})
	visitRunes(fn, ET, []rune{ // European Terminator
		0x20A0, 0x20Cf, // Currency symbols
	})
	rangetable.Visit(unicode.Noncharacter_Code_Point, func(r rune) {
		fn(r, BN) // Boundary Neutral
	})
	ucd.Parse(gen.OpenUCDFile("DerivedCoreProperties.txt"), func(p *ucd.Parser) {
		if p.String(1) == "Default_Ignorable_Code_Point" {
			fn(p.Rune(0), BN) // Boundary Neutral
		}
	})
}
Пример #4
0
// TestBidiCore performs the tests in BidiTest.txt.
// See http://www.unicode.org/Public/UCD/latest/ucd/BidiTest.txt.
func TestBidiCore(t *testing.T) {
	testtext.SkipIfNotLong(t)

	r := gen.OpenUCDFile("BidiTest.txt")
	defer r.Close()

	var wantLevels, wantOrder []string
	p := ucd.New(r, ucd.Part(func(p *ucd.Parser) {
		s := strings.Split(p.String(0), ":")
		switch s[0] {
		case "Levels":
			wantLevels = strings.Fields(s[1])
		case "Reorder":
			wantOrder = strings.Fields(s[1])
		default:
			log.Fatalf("Unknown part %q.", s[0])
		}
	}))

	for p.Next() {
		types := []class{}
		for _, s := range p.Strings(0) {
			types = append(types, bidiClass[s])
		}
		// We ignore the bracketing part of the algorithm.
		pairTypes := make([]bracketType, len(types))
		pairValues := make([]rune, len(types))

		for i := uint(0); i < 3; i++ {
			if p.Uint(1)&(1<<i) == 0 {
				continue
			}
			lev := level(int(i) - 1)
			par := newParagraph(types, pairTypes, pairValues, lev)

			if *testLevels {
				levels := par.resultLevels
				for i, s := range wantLevels {
					if s == "x" {
						continue
					}
					l, _ := strconv.ParseUint(s, 10, 8)
					if level(l)&1 != levels[i]&1 {
						t.Errorf("%s:%d:levels: got %v; want %v", p.String(0), lev, levels, wantLevels)
						break
					}
				}
			}

			order := par.getReordering([]int{len(types)})
			gotOrder := filterOrder(types, order)
			if got, want := fmt.Sprint(gotOrder), fmt.Sprint(wantOrder); got != want {
				t.Errorf("%s:%d:order: got %v; want %v\noriginal %v", p.String(0), lev, got, want, order)
			}
		}
	}
	if err := p.Err(); err != nil {
		log.Fatal(err)
	}
}
Пример #5
0
func parse(path string, f func(p *ucd.Parser)) {
	r := gen.OpenUCDFile(path)
	defer r.Close()
	p := ucd.New(r)
	for p.Next() {
		f(p)
	}
	if err := p.Err(); err != nil {
		log.Fatal(err)
	}
}
Пример #6
0
// parse calls f for each entry in the given UCD file.
func (opts ucdParser) parse(filename string, f func(p *ucd.Parser)) {
	r := gen.OpenUCDFile(filename)
	defer r.Close()
	p := ucd.New(r, opts...)
	for p.Next() {
		f(p)
	}
	if err := p.Err(); err != nil {
		log.Fatal(err)
	}
}
Пример #7
0
func TestTables(t *testing.T) {
	testtext.SkipIfNotLong(t)

	ucd.Parse(gen.OpenUCDFile("BidiBrackets.txt"), func(p *ucd.Parser) {
		r1 := p.Rune(0)
		want := p.Rune(1)

		e, _ := LookupRune(r1)
		if got := e.reverseBracket(r1); got != want {
			t.Errorf("Reverse(%U) = %U; want %U", r1, got, want)
		}
	})

	done := map[rune]bool{}
	test := func(name string, r rune, want string) {
		str := string(r)
		e, _ := LookupString(str)
		if got := labels[e.Class()]; got != want {
			t.Errorf("%s:%U: got %s; want %s", name, r, got, want)
		}
		if e2, sz := LookupRune(r); e != e2 || sz != len(str) {
			t.Errorf("LookupRune(%U) = %v, %d; want %v, %d", r, e2, e, sz, len(str))
		}
		if e2, sz := Lookup([]byte(str)); e != e2 || sz != len(str) {
			t.Errorf("Lookup(%U) = %v, %d; want %v, %d", r, e2, e, sz, len(str))
		}
		done[r] = true
	}

	// Insert the derived BiDi properties.
	ucd.Parse(gen.OpenUCDFile("extracted/DerivedBidiClass.txt"), func(p *ucd.Parser) {
		r := p.Rune(0)
		test("derived", r, p.String(1))
	})
	visitDefaults(func(r rune, c Class) {
		if !done[r] {
			test("default", r, labels[c])
		}
	})

}
Пример #8
0
// Use values in DerivedNormalizationProps.txt to compare against the
// values we computed.
// DerivedNormalizationProps.txt has form:
// 00C0..00C5    ; NFD_QC; N # ...
// 0374          ; NFD_QC; N # ...
// See http://unicode.org/reports/tr44/ for full explanation
func testDerived() {
	f := gen.OpenUCDFile("DerivedNormalizationProps.txt")
	defer f.Close()
	p := ucd.New(f)
	for p.Next() {
		r := p.Rune(0)
		c := &chars[r]

		var ftype, mode int
		qt := p.String(1)
		switch qt {
		case "NFC_QC":
			ftype, mode = FCanonical, MComposed
		case "NFD_QC":
			ftype, mode = FCanonical, MDecomposed
		case "NFKC_QC":
			ftype, mode = FCompatibility, MComposed
		case "NFKD_QC":
			ftype, mode = FCompatibility, MDecomposed
		default:
			continue
		}
		var qr QCResult
		switch p.String(2) {
		case "Y":
			qr = QCYes
		case "N":
			qr = QCNo
		case "M":
			qr = QCMaybe
		default:
			log.Fatalf(`Unexpected quick check value "%s"`, p.String(2))
		}
		if got := c.forms[ftype].quickCheck[mode]; got != qr {
			log.Printf("%U: FAILED %s (was %v need %v)\n", r, qt, got, qr)
		}
		c.forms[ftype].verified[mode] = true
	}
	if err := p.Err(); err != nil {
		log.Fatal(err)
	}
	// Any unspecified value must be QCYes. Verify this.
	for i, c := range chars {
		for j, fd := range c.forms {
			for k, qr := range fd.quickCheck {
				if !fd.verified[k] && qr != QCYes {
					m := "%U: FAIL F:%d M:%d (was %v need Yes) %s\n"
					log.Printf(m, i, j, k, qr, c.name)
				}
			}
		}
	}
}
Пример #9
0
func TestTables(t *testing.T) {
	if !*long {
		return
	}

	gen.Init()

	trie := newBidiTrie(0)

	ucd.Parse(gen.OpenUCDFile("BidiBrackets.txt"), func(p *ucd.Parser) {
		r1 := p.Rune(0)
		want := p.Rune(1)

		e, _ := trie.lookupString(string(r1))
		if got := entry(e).reverseBracket(r1); got != want {
			t.Errorf("Reverse(%U) = %U; want %U", r1, got, want)
		}
	})

	done := map[rune]bool{}
	test := func(name string, r rune, want string) {
		e, _ := trie.lookupString(string(r))
		if got := labels[entry(e).class(r)]; got != want {
			t.Errorf("%s:%U: got %s; want %s", name, r, got, want)
		}
		done[r] = true
	}

	// Insert the derived BiDi properties.
	ucd.Parse(gen.OpenUCDFile("extracted/DerivedBidiClass.txt"), func(p *ucd.Parser) {
		r := p.Rune(0)
		test("derived", r, p.String(1))
	})
	visitDefaults(func(r rune, c class) {
		if !done[r] {
			test("default", r, labels[c])
		}
	})

}
Пример #10
0
func main() {
	gen.Init()

	// Load data
	runes := []rune{}
	ucd.Parse(gen.OpenUCDFile("DerivedCoreProperties.txt"), func(p *ucd.Parser) {
		if p.String(1) == "Default_Ignorable_Code_Point" {
			runes = append(runes, p.Rune(0))
		}
	})
	ucd.Parse(gen.OpenUCDFile("HangulSyllableType.txt"), func(p *ucd.Parser) {
		if p.String(1) == "LVT" {
			runes = append(runes, p.Rune(0))
		}
	})

	disallowedRunes = rangetable.New(runes...)
	assigned = rangetable.Assigned(unicode.Version)

	writeTables()
	gen.Repackage("gen_trieval.go", "trieval.go", "precis")
}
Пример #11
0
// Load the data form NormalizationTest.txt
func loadTestData(t *testing.T) {
	f := gen.OpenUCDFile("NormalizationTest.txt")
	defer f.Close()
	scanner := bufio.NewScanner(f)
	for scanner.Scan() {
		line := scanner.Text()
		if len(line) == 0 || line[0] == '#' {
			continue
		}
		m := partRe.FindStringSubmatch(line)
		if m != nil {
			if len(m) < 3 {
				t.Fatal("Failed to parse Part: ", line)
			}
			i, err := strconv.Atoi(m[1])
			if err != nil {
				t.Fatal(err)
			}
			name := m[2]
			part = append(part, Part{name: name[:len(name)-1], number: i})
			continue
		}
		m = testRe.FindStringSubmatch(line)
		if m == nil || len(m) < 7 {
			t.Fatalf(`Failed to parse: "%s" result: %#v`, line, m)
		}
		test := Test{name: m[6], partnr: len(part) - 1, number: counter}
		counter++
		for j := 1; j < len(m)-1; j++ {
			for _, split := range strings.Split(m[j], " ") {
				r, err := strconv.ParseUint(split, 16, 64)
				if err != nil {
					t.Fatal(err)
				}
				if test.r == 0 {
					// save for CharacterByCharacterTests
					test.r = rune(r)
				}
				var buf [utf8.UTFMax]byte
				sz := utf8.EncodeRune(buf[:], rune(r))
				test.cols[j-1] += string(buf[:sz])
			}
		}
		part := &part[len(part)-1]
		part.tests = append(part.tests, test)
	}
	if scanner.Err() != nil {
		t.Fatal(scanner.Err())
	}
}
Пример #12
0
// CompositionExclusions.txt has form:
// 0958    # ...
// See http://unicode.org/reports/tr44/ for full explanation
func loadCompositionExclusions() {
	f := gen.OpenUCDFile("CompositionExclusions.txt")
	defer f.Close()
	p := ucd.New(f)
	for p.Next() {
		c := &chars[p.Rune(0)]
		if c.excludeInComp {
			log.Fatalf("%U: Duplicate entry in exclusions.", c.codePoint)
		}
		c.excludeInComp = true
	}
	if e := p.Err(); e != nil {
		log.Fatal(e)
	}
}
Пример #13
0
func parse() (names []string, counts map[string]int) {
	names = make([]string, 1+unicode.MaxRune)
	counts = map[string]int{}
	ucd.Parse(gen.OpenUCDFile("UnicodeData.txt"), func(p *ucd.Parser) {
		r, s := p.Rune(0), p.String(ucd.Name)
		if s == "" {
			return
		}
		if s[0] == '<' {
			const first = ", First>"
			if i := strings.Index(s, first); i >= 0 {
				s = s[:i] + ">"
			}
		}
		names[r] = s
		counts[s]++
	})
	return names, counts
}
Пример #14
0
func loadUnicodeData() {
	f := gen.OpenUCDFile("UnicodeData.txt")
	defer f.Close()
	p := ucd.New(f)
	for p.Next() {
		r := p.Rune(ucd.CodePoint)
		char := &chars[r]

		char.ccc = uint8(p.Uint(ucd.CanonicalCombiningClass))
		decmap := p.String(ucd.DecompMapping)

		exp, err := parseDecomposition(decmap, false)
		isCompat := false
		if err != nil {
			if len(decmap) > 0 {
				exp, err = parseDecomposition(decmap, true)
				if err != nil {
					log.Fatalf(`%U: bad decomp |%v|: "%s"`, r, decmap, err)
				}
				isCompat = true
			}
		}

		char.name = p.String(ucd.Name)
		char.codePoint = r
		char.forms[FCompatibility].decomp = exp
		if !isCompat {
			char.forms[FCanonical].decomp = exp
		} else {
			char.compatDecomp = true
		}
		if len(decmap) > 0 {
			char.forms[FCompatibility].decomp = exp
		}
	}
	if err := p.Err(); err != nil {
		log.Fatal(err)
	}
}
Пример #15
0
// TestBidiCharacters performs the tests in BidiCharacterTest.txt.
// See http://www.unicode.org/Public/UCD/latest/ucd/BidiCharacterTest.txt
func TestBidiCharacters(t *testing.T) {
	testtext.SkipIfNotLong(t)

	ucd.Parse(gen.OpenUCDFile("BidiCharacterTest.txt"), func(p *ucd.Parser) {
		var (
			types      []class
			pairTypes  []bracketType
			pairValues []rune
			parLevel   level

			wantLevel       = level(p.Int(2))
			wantLevels      = p.Strings(3)
			wantVisualOrder = p.Strings(4)
		)

		switch l := p.Int(1); l {
		case 0, 1:
			parLevel = level(l)
		case 2:
			parLevel = implicitLevel
		default:
			// Spec says to ignore unknown parts.
		}

		trie := newBidiTrie(0)
		runes := p.Runes(0)

		for _, r := range runes {
			// Assign the bracket type.
			if d := norm.NFKD.PropertiesString(string(r)).Decomposition(); d != nil {
				r = []rune(string(d))[0]
			}
			e, _ := trie.lookupString(string(r))
			entry := entry(e)

			// Assign the class for this rune.
			types = append(types, entry.class(r))

			switch {
			case !entry.isBracket():
				pairTypes = append(pairTypes, bpNone)
				pairValues = append(pairValues, 0)
			case entry.isOpen():
				pairTypes = append(pairTypes, bpOpen)
				pairValues = append(pairValues, r)
			default:
				pairTypes = append(pairTypes, bpClose)
				pairValues = append(pairValues, entry.reverseBracket(r))
			}
		}
		par := newParagraph(types, pairTypes, pairValues, parLevel)

		// Test results:
		if got := par.embeddingLevel; got != wantLevel {
			t.Errorf("%v:level: got %d; want %d", string(runes), got, wantLevel)
		}

		if *testLevels {
			gotLevels := getLevelStrings(types, par.resultLevels)
			if got, want := fmt.Sprint(gotLevels), fmt.Sprint(wantLevels); got != want {
				t.Errorf("%04X %q:%d: got %v; want %v\nval: %x\npair: %v", runes, string(runes), parLevel, got, want, pairValues, pairTypes)
			}
		}

		order := par.getReordering([]int{len(types)})
		order = filterOrder(types, order)
		if got, want := fmt.Sprint(order), fmt.Sprint(wantVisualOrder); got != want {
			t.Errorf("%04X %q:%d: got %v; want %v\ngot order: %s", runes, string(runes), parLevel, got, want, reorder(runes, order))
		}
	})
}
Пример #16
0
func genTables() {
	if numClass > 0x0F {
		log.Fatalf("Too many Class constants (%#x > 0x0F).", numClass)
	}
	w := gen.NewCodeWriter()
	defer w.WriteGoFile(*outputFile, "bidi")

	gen.WriteUnicodeVersion(w)

	t := triegen.NewTrie("bidi")

	// Build data about bracket mapping. These bits need to be or-ed with
	// any other bits.
	orMask := map[rune]uint64{}

	xorMap := map[rune]int{}
	xorMasks := []rune{0} // First value is no-op.

	ucd.Parse(gen.OpenUCDFile("BidiBrackets.txt"), func(p *ucd.Parser) {
		r1 := p.Rune(0)
		r2 := p.Rune(1)
		xor := r1 ^ r2
		if _, ok := xorMap[xor]; !ok {
			xorMap[xor] = len(xorMasks)
			xorMasks = append(xorMasks, xor)
		}
		entry := uint64(xorMap[xor]) << xorMaskShift
		switch p.String(2) {
		case "o":
			entry |= openMask
		case "c", "n":
		default:
			log.Fatalf("Unknown bracket class %q.", p.String(2))
		}
		orMask[r1] = entry
	})

	w.WriteComment(`
	xorMasks contains masks to be xor-ed with brackets to get the reverse 
	version.`)
	w.WriteVar("xorMasks", xorMasks)

	done := map[rune]bool{}

	insert := func(r rune, c class) {
		if !done[r] {
			t.Insert(r, orMask[r]|uint64(c))
			done[r] = true
		}
	}

	// Insert the derived BiDi properties.
	ucd.Parse(gen.OpenUCDFile("extracted/DerivedBidiClass.txt"), func(p *ucd.Parser) {
		r := p.Rune(0)
		class, ok := bidiClass[p.String(1)]
		if !ok {
			log.Fatalf("%U: Unknown BiDi class %q", r, p.String(1))
		}
		insert(r, class)
	})
	visitDefaults(insert)

	// TODO: use sparse blocks. This would reduce table size considerably
	// from the looks of it.

	sz, err := t.Gen(w)
	if err != nil {
		log.Fatal(err)
	}
	w.Size += sz
}
Пример #17
0
func TestTables(t *testing.T) {
	testtext.SkipIfNotLong(t)

	lookup := func(r rune) info {
		v, _ := trie.lookupString(string(r))
		return info(v)
	}

	ucd.Parse(gen.OpenUnicodeFile("idna", "", "IdnaMappingTable.txt"), func(p *ucd.Parser) {
		r := p.Rune(0)
		x := lookup(r)
		if got, want := x.category(), catFromEntry(p); got != want {
			t.Errorf("%U:category: got %x; want %x", r, got, want)
		}

		mapped := false
		switch p.String(1) {
		case "mapped", "disallowed_STD3_mapped", "deviation":
			mapped = true
		}
		if x.isMapped() != mapped {
			t.Errorf("%U:isMapped: got %v; want %v", r, x.isMapped(), mapped)
		}
		if !mapped {
			return
		}
		want := string(p.Runes(2))
		got := string(x.appendMapping(nil, string(r)))
		if got != want {
			t.Errorf("%U:mapping: got %+q; want %+q", r, got, want)
		}

		if x.isMapped() {
			return
		}
		wantMark := unicode.In(r, unicode.Mark)
		gotMark := x.isModifier()
		if gotMark != wantMark {
			t.Errorf("IsMark(%U) = %v; want %v", r, gotMark, wantMark)
		}
	})

	ucd.Parse(gen.OpenUCDFile("UnicodeData.txt"), func(p *ucd.Parser) {
		r := p.Rune(0)
		x := lookup(r)
		got := x.isViramaModifier()

		const cccVirama = 9
		want := p.Int(ucd.CanonicalCombiningClass) == cccVirama
		if got != want {
			t.Errorf("IsVirama(%U) = %v; want %v", r, got, want)
		}
	})

	ucd.Parse(gen.OpenUCDFile("extracted/DerivedJoiningType.txt"), func(p *ucd.Parser) {
		r := p.Rune(0)
		x := lookup(r)
		if x.isMapped() {
			return
		}
		got := x.joinType()
		want := joinType[p.String(1)]
		if got != want {
			t.Errorf("JoinType(%U) = %x; want %x", r, got, want)
		}
	})
}
Пример #18
0
func parse(file string, f func(p *ucd.Parser)) {
	ucd.Parse(gen.OpenUCDFile(file), f)
}
Пример #19
0
func main() {
	gen.Init()

	// Load data
	runes := []rune{}
	// PrecisIgnorableProperties: https://tools.ietf.org/html/rfc7564#section-9.13
	ucd.Parse(gen.OpenUCDFile("DerivedCoreProperties.txt"), func(p *ucd.Parser) {
		if p.String(1) == "Default_Ignorable_Code_Point" {
			runes = append(runes, p.Rune(0))
		}
	})
	ucd.Parse(gen.OpenUCDFile("PropList.txt"), func(p *ucd.Parser) {
		switch p.String(1) {
		case "Noncharacter_Code_Point":
			runes = append(runes, p.Rune(0))
		}
	})
	// OldHangulJamo: https://tools.ietf.org/html/rfc5892#section-2.9
	ucd.Parse(gen.OpenUCDFile("HangulSyllableType.txt"), func(p *ucd.Parser) {
		switch p.String(1) {
		case "L", "V", "T":
			runes = append(runes, p.Rune(0))
		}
	})

	disallowedRunes = rangetable.New(runes...)
	assigned = rangetable.Assigned(unicode.Version)

	// Load category data.
	runeCategory['l'] = latinSmallL
	ucd.Parse(gen.OpenUCDFile("UnicodeData.txt"), func(p *ucd.Parser) {
		const cccVirama = 9
		if p.Int(ucd.CanonicalCombiningClass) == cccVirama {
			setCategory(p.Rune(0), viramaModifier)
		}
	})
	ucd.Parse(gen.OpenUCDFile("Scripts.txt"), func(p *ucd.Parser) {
		switch p.String(1) {
		case "Greek":
			setCategory(p.Rune(0), greek)
		case "Hebrew":
			setCategory(p.Rune(0), hebrew)
		case "Hiragana", "Katakana", "Han":
			setCategory(p.Rune(0), japanese)
		}
	})

	// Set the rule categories associated with exceptions. This overrides any
	// previously set categories. The original categories are manually
	// reintroduced in the categoryTransitions table.
	for r, e := range exceptions {
		if e.cat != 0 {
			runeCategory[r] = e.cat
		}
	}
	cat := map[string]category{
		"L": joiningL,
		"D": joiningD,
		"T": joiningT,

		"R": joiningR,
	}
	ucd.Parse(gen.OpenUCDFile("extracted/DerivedJoiningType.txt"), func(p *ucd.Parser) {
		switch v := p.String(1); v {
		case "L", "D", "T", "R":
			setCategory(p.Rune(0), cat[v])
		}
	})

	writeTables()
	gen.Repackage("gen_trieval.go", "trieval.go", "precis")
}
Пример #20
0
func genTables() {
	t := triegen.NewTrie("idna")

	ucd.Parse(gen.OpenUCDFile("UnicodeData.txt"), func(p *ucd.Parser) {
		r := p.Rune(0)

		const cccVirama = 9
		if p.Int(ucd.CanonicalCombiningClass) == cccVirama {
			runes[p.Rune(0)] = viramaModifier
		}
		switch {
		case unicode.In(r, unicode.Mark):
			runes[r] |= modifier
		}
	})

	ucd.Parse(gen.OpenUCDFile("extracted/DerivedJoiningType.txt"), func(p *ucd.Parser) {
		switch v := p.String(1); v {
		case "L", "D", "T", "R":
			runes[p.Rune(0)] |= joinType[v] << joinShift
		}
	})

	ucd.Parse(gen.OpenUnicodeFile("idna", "", "IdnaMappingTable.txt"), func(p *ucd.Parser) {
		r := p.Rune(0)

		// The mappings table explicitly defines surrogates as invalid.
		if !utf8.ValidRune(r) {
			return
		}

		cat := catFromEntry(p)
		isMapped := cat == mapped || cat == disallowedSTD3Mapped || cat == deviation
		if !isMapped {
			// Only include additional category information for non-mapped
			// runes. The additional information is only used after mapping and
			// the bits would clash with mapping information.
			// TODO: it would be possible to inline this data and avoid
			// additional lookups. This is quite tedious, though, so let's first
			// see if we need this.
			cat |= category(runes[r])
		}

		s := string(p.Runes(2))
		if s != "" && !isMapped {
			log.Fatalf("%U: Mapping with non-mapping category %d", r, cat)
		}
		t.Insert(r, uint64(makeEntry(r, s))+uint64(cat))
	})

	w := gen.NewCodeWriter()
	defer w.WriteGoFile("tables.go", "idna")

	gen.WriteUnicodeVersion(w)

	w.WriteVar("mappings", string(mappings))
	w.WriteVar("xorData", string(xorData))

	sz, err := t.Gen(w, triegen.Compact(&normCompacter{}))
	if err != nil {
		log.Fatal(err)
	}
	w.Size += sz
}