// getWidthData calls f for every entry for which it is defined.
//
// f may be called multiple times for the same rune. The last call to f is the
// correct value. f is not called for all runes. The default tag type is
// Neutral.
//
// The data is produced in three steps:
//  1. every rune in the hard-coded ideographic ranges is reported as tagWide
//     with no alternate rune;
//  2. UnicodeData.txt is scanned for <wide> and <narrow> decompositions to
//     build a two-way mapping between a rune and its counterpart;
//  3. EastAsianWidth.txt is scanned and each entry is reported with its width
//     tag and, if known, its counterpart from step 2.
//
// Any inconsistency in the input data terminates the program via log.Fatalf.
func getWidthData(f func(r rune, tag elem, alt rune)) {
	// Set the default values for Unified Ideographs. In line with Annex 11,
	// we encode full ranges instead of the defined runes in Unified_Ideograph.
	for _, b := range []struct{ lo, hi rune }{
		{0x4E00, 0x9FFF},   // the CJK Unified Ideographs block,
		{0x3400, 0x4DBF},   // the CJK Unified Ideographs Extension A block,
		{0xF900, 0xFAFF},   // the CJK Compatibility Ideographs block,
		{0x20000, 0x2FFFF}, // the Supplementary Ideographic Plane,
		{0x30000, 0x3FFFF}, // the Tertiary Ideographic Plane,
	} {
		for r := b.lo; r <= b.hi; r++ {
			f(r, tagWide, 0)
		}
	}

	// inverse maps a rune to its <wide>/<narrow> counterpart, in both
	// directions.
	inverse := map[rune]rune{}
	// maps lists the decomposition types that are of interest here.
	maps := map[string]bool{
		"<wide>":   true,
		"<narrow>": true,
	}

	// We cannot reuse package norm's decomposition, as we need an unexpanded
	// decomposition. We make use of the opportunity to verify that the
	// decomposition type is as expected.
	ucd.Parse(gen.OpenUCDFile("UnicodeData.txt"), func(p *ucd.Parser) {
		r := p.Rune(0)
		// The field has the form "<type> XXXX"; split the type from the
		// target code point.
		s := strings.SplitN(p.String(ucd.DecompMapping), " ", 2)
		if !maps[s[0]] {
			return
		}
		// A decomposition type without a target would otherwise cause an
		// index-out-of-range panic below.
		if len(s) != 2 {
			log.Fatalf("%U: decomposition %q has no target rune", r, s[0])
		}
		x, err := strconv.ParseUint(s[1], 16, 32)
		if err != nil {
			log.Fatalf("Error parsing rune %q: %v", s[1], err)
		}
		// Each rune may take part in at most one pairing.
		if inverse[r] != 0 || inverse[rune(x)] != 0 {
			log.Fatalf("Circular dependency in mapping between %U and %U", r, x)
		}
		inverse[r] = rune(x)
		inverse[rune(x)] = r
	})

	// <rune range>;<type>
	ucd.Parse(gen.OpenUCDFile("EastAsianWidth.txt"), func(p *ucd.Parser) {
		tag, ok := typeMap[p.String(1)]
		if !ok {
			log.Fatalf("Unknown width type %q", p.String(1))
		}
		r := p.Rune(0)
		alt, ok := inverse[r]
		// Fullwidth runes, and Halfwidth runes other than wonSign, are marked
		// as needing a fold and therefore must have a counterpart.
		if tag == tagFullwidth || (tag == tagHalfwidth && r != wonSign) {
			tag |= tagNeedsFold
			if !ok {
				log.Fatalf("Narrow or wide rune %U has no decomposition", r)
			}
		}
		f(r, tag, alt)
	})
}
Example #2
0
// visitDefaults calls fn with the default Bidi class of runes that fall in
// the ranges below, of the noncharacter code points, and of the
// Default_Ignorable_Code_Point entries in DerivedCoreProperties.txt.
//
// These tables are hand-extracted from:
// http://www.unicode.org/Public/8.0.0/ucd/extracted/DerivedBidiClass.txt
func visitDefaults(fn func(r rune, c Class)) {
	// Each table is handed to visitRunes unchanged; the entries are written
	// as pairs (presumably inclusive lo, hi bounds — confirm in visitRunes).
	defaults := []struct {
		class Class
		runes []rune
	}{
		{AL, []rune{
			0x0600, 0x07BF, // Arabic
			0x08A0, 0x08FF, // Arabic Extended-A
			0xFB50, 0xFDCF, // Arabic Presentation Forms
			0xFDF0, 0xFDFF,
			0xFE70, 0xFEFF,
			0x0001EE00, 0x0001EEFF, // Arabic Mathematical Alpha Symbols
		}},
		{R, []rune{
			0x0590, 0x05FF, // Hebrew
			0x07C0, 0x089F, // Nko et al.
			0xFB1D, 0xFB4F,
			0x00010800, 0x00010FFF, // Cypriot Syllabary et al.
			0x0001E800, 0x0001EDFF,
			0x0001EF00, 0x0001EFFF,
		}},
		{ET, []rune{ // European Terminator
			0x20A0, 0x20Cf, // Currency symbols
		}},
	}
	for _, d := range defaults {
		visitRunes(fn, d.class, d.runes)
	}

	// Everything below is reported as Boundary Neutral.
	boundaryNeutral := func(r rune) {
		fn(r, BN)
	}
	rangetable.Visit(unicode.Noncharacter_Code_Point, boundaryNeutral)
	ucd.Parse(gen.OpenUCDFile("DerivedCoreProperties.txt"), func(p *ucd.Parser) {
		if p.String(1) != "Default_Ignorable_Code_Point" {
			return
		}
		boundaryNeutral(p.Rune(0))
	})
}
Example #3
0
// TestName compares Name against the names listed in UnicodeData.txt for
// every code point from 0 through unicode.MaxRune. Code points that have no
// entry in the file are expected to yield the empty string. The test stops
// after 100 mismatches.
func TestName(t *testing.T) {
	testtext.SkipIfNotLong(t)

	// Range entries are named "<..., First>"; this suffix is trimmed so that
	// the expected name becomes "<...>".
	const firstSuffix = ", First>"

	expected := make([]string, 1+unicode.MaxRune)
	ucd.Parse(gen.OpenUCDFile("UnicodeData.txt"), func(p *ucd.Parser) {
		r, name := p.Rune(0), p.String(ucd.Name)
		if name == "" {
			return
		}
		if strings.HasPrefix(name, "<") {
			if cut := strings.Index(name, firstSuffix); cut >= 0 {
				name = name[:cut] + ">"
			}
		}
		expected[r] = name
	})

	mismatches := 0
	for i := range expected {
		got, want := Name(rune(i)), expected[i]
		if got == want {
			continue
		}
		t.Errorf("r=%#08x: got %q, want %q", i, got, want)
		mismatches++
		if mismatches == 100 {
			t.Fatal("too many errors")
		}
	}
}
Example #4
0
// TestTables verifies the Bidi lookup tables against the UCD data files.
//
// It checks that reverseBracket agrees with BidiBrackets.txt, that the class
// label of every rune listed in extracted/DerivedBidiClass.txt matches the
// file, and that runes reported only by visitDefaults carry their default
// class. For each rune, LookupString, LookupRune and Lookup must return the
// same properties, and the reported size must equal the length of the rune's
// string encoding.
func TestTables(t *testing.T) {
	testtext.SkipIfNotLong(t)

	ucd.Parse(gen.OpenUCDFile("BidiBrackets.txt"), func(p *ucd.Parser) {
		r1 := p.Rune(0)
		want := p.Rune(1)

		e, _ := LookupRune(r1)
		if got := e.reverseBracket(r1); got != want {
			t.Errorf("Reverse(%U) = %U; want %U", r1, got, want)
		}
	})

	// done records the runes that have been checked, so that defaults are
	// only tested for runes without an explicit derived entry.
	done := map[rune]bool{}
	test := func(name string, r rune, want string) {
		str := string(r)
		e, _ := LookupString(str)
		if got := labels[e.Class()]; got != want {
			t.Errorf("%s:%U: got %s; want %s", name, r, got, want)
		}
		// The operands follow the order of the verbs: the result obtained
		// (properties, size) first, then the expected pair.
		if e2, sz := LookupRune(r); e != e2 || sz != len(str) {
			t.Errorf("LookupRune(%U) = %v, %d; want %v, %d", r, e2, sz, e, len(str))
		}
		if e2, sz := Lookup([]byte(str)); e != e2 || sz != len(str) {
			t.Errorf("Lookup(%U) = %v, %d; want %v, %d", r, e2, sz, e, len(str))
		}
		done[r] = true
	}

	// Insert the derived BiDi properties.
	ucd.Parse(gen.OpenUCDFile("extracted/DerivedBidiClass.txt"), func(p *ucd.Parser) {
		r := p.Rune(0)
		test("derived", r, p.String(1))
	})
	visitDefaults(func(r rune, c Class) {
		if !done[r] {
			test("default", r, labels[c])
		}
	})
}
Example #5
0
// TestTables verifies the Bidi trie against the UCD data files.
//
// It checks that reverseBracket agrees with BidiBrackets.txt, that the class
// label of every rune listed in extracted/DerivedBidiClass.txt matches the
// file, and that runes reported only by visitDefaults carry their default
// class. The test needs the UCD files and only runs when the long flag is
// set.
func TestTables(t *testing.T) {
	if !*long {
		// Report the test as skipped rather than letting it pass silently
		// without having checked anything.
		t.Skip("skipping: long tests are not enabled")
	}

	gen.Init()

	trie := newBidiTrie(0)

	ucd.Parse(gen.OpenUCDFile("BidiBrackets.txt"), func(p *ucd.Parser) {
		r1 := p.Rune(0)
		want := p.Rune(1)

		e, _ := trie.lookupString(string(r1))
		if got := entry(e).reverseBracket(r1); got != want {
			t.Errorf("Reverse(%U) = %U; want %U", r1, got, want)
		}
	})

	// done records the runes that have been checked, so that defaults are
	// only tested for runes without an explicit derived entry.
	done := map[rune]bool{}
	test := func(name string, r rune, want string) {
		e, _ := trie.lookupString(string(r))
		if got := labels[entry(e).class(r)]; got != want {
			t.Errorf("%s:%U: got %s; want %s", name, r, got, want)
		}
		done[r] = true
	}

	// Insert the derived BiDi properties.
	ucd.Parse(gen.OpenUCDFile("extracted/DerivedBidiClass.txt"), func(p *ucd.Parser) {
		r := p.Rune(0)
		test("derived", r, p.String(1))
	})
	visitDefaults(func(r rune, c class) {
		if !done[r] {
			test("default", r, labels[c])
		}
	})
}
Example #6
0
// main collects the runes that are disallowed outright, initializes the
// disallowedRunes and assigned tables, and then writes the generated tables
// and repackages gen_trieval.go as trieval.go in package precis.
func main() {
	gen.Init()

	// Load data
	var runes []rune
	// PrecisIgnorableProperties: https://tools.ietf.org/html/rfc7564#section-9.13
	// NOTE(review): that section also lists Noncharacter_Code_Point; confirm
	// whether noncharacters are handled elsewhere in this generator.
	ucd.Parse(gen.OpenUCDFile("DerivedCoreProperties.txt"), func(p *ucd.Parser) {
		if p.String(1) == "Default_Ignorable_Code_Point" {
			runes = append(runes, p.Rune(0))
		}
	})
	// OldHangulJamo: https://tools.ietf.org/html/rfc5892#section-2.9
	// The property covers the conjoining jamo types L, V and T. The LVT type
	// denotes precomposed syllables, which must not be disallowed here.
	ucd.Parse(gen.OpenUCDFile("HangulSyllableType.txt"), func(p *ucd.Parser) {
		switch p.String(1) {
		case "L", "V", "T":
			runes = append(runes, p.Rune(0))
		}
	})

	disallowedRunes = rangetable.New(runes...)
	assigned = rangetable.Assigned(unicode.Version)

	writeTables()
	gen.Repackage("gen_trieval.go", "trieval.go", "precis")
}
Example #7
0
// main generates tables.go for package rangetable. For every version
// returned by getVersions it downloads that version's UnicodeData.txt,
// builds a range table of the runes listed in it, and emits the table as a
// variable named assigned<version>, together with an "assigned" map keyed by
// version string. The in-memory size of each table and the total are written
// as comments.
func main() {
	gen.Init()

	versions := getVersions()

	// ident turns a version such as "9.0.0" into an identifier suffix.
	ident := func(version string) string {
		return strings.Replace(version, ".", "_", -1)
	}

	// Sizes of the table header and of its two kinds of range entries.
	headerSize := int(reflect.TypeOf(unicode.RangeTable{}).Size())
	range16Size := int(reflect.TypeOf(unicode.Range16{}).Size())
	range32Size := int(reflect.TypeOf(unicode.Range32{}).Size())

	out := new(bytes.Buffer)

	fmt.Fprintf(out, "//go:generate go run gen.go --versions=%s\n\n", strings.Join(versions, ","))
	fmt.Fprintf(out, "import \"unicode\"\n\n")

	fmt.Fprintf(out, "var assigned = map[string]*unicode.RangeTable{\n")
	for _, version := range versions {
		fmt.Fprintf(out, "\t%q: assigned%s,\n", version, ident(version))
	}
	fmt.Fprintf(out, "}\n\n")

	total := 0
	for _, version := range versions {
		var listed []rune

		data := gen.Open("http://www.unicode.org/Public/", "", version+"/ucd/UnicodeData.txt")
		ucd.Parse(data, func(p *ucd.Parser) {
			listed = append(listed, p.Rune(0))
		})

		table := rangetable.New(listed...)
		bytesUsed := headerSize + range16Size*len(table.R16) + range32Size*len(table.R32)

		fmt.Fprintf(out, "// size %d bytes (%d KiB)\n", bytesUsed, bytesUsed/1024)
		fmt.Fprintf(out, "var assigned%s = ", ident(version))
		// print is this package's table printer, not the builtin
		// (presumably it writes the table literal — confirm).
		print(out, table)

		total += bytesUsed
	}

	fmt.Fprintf(out, "// Total size %d bytes (%d KiB)\n", total, total/1024)

	gen.WriteGoFile("tables.go", "rangetable", out.Bytes())
}
Example #8
0
// parse reads UnicodeData.txt and returns the name of each listed rune,
// indexed by rune value, along with the number of times each name occurs.
// Entries with an empty name are skipped. Names of the form "<..., First>"
// are recorded as "<...>".
func parse() (names []string, counts map[string]int) {
	const firstSuffix = ", First>"

	names = make([]string, 1+unicode.MaxRune)
	counts = make(map[string]int)

	record := func(p *ucd.Parser) {
		r, name := p.Rune(0), p.String(ucd.Name)
		if name == "" {
			return
		}
		if strings.HasPrefix(name, "<") {
			if cut := strings.Index(name, firstSuffix); cut >= 0 {
				name = name[:cut] + ">"
			}
		}
		names[r] = name
		counts[name]++
	}
	ucd.Parse(gen.OpenUCDFile("UnicodeData.txt"), record)

	return names, counts
}
// TestBidiCharacters performs the tests in BidiCharacterTest.txt.
// See http://www.unicode.org/Public/UCD/latest/ucd/BidiCharacterTest.txt
//
// For each entry it builds a paragraph from the runes in field 0 and the
// paragraph direction in field 1, and compares the resulting embedding
// level, the per-rune levels (only if the testLevels flag is set) and the
// visual ordering with fields 2, 3 and 4 respectively.
func TestBidiCharacters(t *testing.T) {
	testtext.SkipIfNotLong(t)

	ucd.Parse(gen.OpenUCDFile("BidiCharacterTest.txt"), func(p *ucd.Parser) {
		// Expected results.
		wantLevel := level(p.Int(2))
		wantLevels := p.Strings(3)
		wantVisualOrder := p.Strings(4)

		// Paragraph direction: 0 and 1 are used as the level itself, 2
		// selects implicitLevel. Spec says to ignore unknown parts, so any
		// other value leaves the zero level in place.
		var parLevel level
		if dir := p.Int(1); dir == 0 || dir == 1 {
			parLevel = level(dir)
		} else if dir == 2 {
			parLevel = implicitLevel
		}

		trie := newBidiTrie(0)
		runes := p.Runes(0)

		// Per-rune inputs for the paragraph, all of the same length.
		var (
			types      []class
			pairTypes  []bracketType
			pairValues []rune
		)
		for _, r := range runes {
			// Look up the first rune of the NFKD decomposition instead of
			// the rune itself, if it has one.
			if decomp := norm.NFKD.PropertiesString(string(r)).Decomposition(); decomp != nil {
				r = []rune(string(decomp))[0]
			}
			v, _ := trie.lookupString(string(r))
			props := entry(v)

			// Assign the class for this rune.
			types = append(types, props.class(r))

			// Assign the bracket type and the value used for pairing.
			if !props.isBracket() {
				pairTypes = append(pairTypes, bpNone)
				pairValues = append(pairValues, 0)
			} else if props.isOpen() {
				pairTypes = append(pairTypes, bpOpen)
				pairValues = append(pairValues, r)
			} else {
				pairTypes = append(pairTypes, bpClose)
				pairValues = append(pairValues, props.reverseBracket(r))
			}
		}
		par := newParagraph(types, pairTypes, pairValues, parLevel)

		// Test results:
		if got := par.embeddingLevel; got != wantLevel {
			t.Errorf("%v:level: got %d; want %d", string(runes), got, wantLevel)
		}

		if *testLevels {
			gotLevels := getLevelStrings(types, par.resultLevels)
			got, want := fmt.Sprint(gotLevels), fmt.Sprint(wantLevels)
			if got != want {
				t.Errorf("%04X %q:%d: got %v; want %v\nval: %x\npair: %v", runes, string(runes), parLevel, got, want, pairValues, pairTypes)
			}
		}

		// The whole paragraph is passed as a single line.
		order := filterOrder(types, par.getReordering([]int{len(types)}))
		got, want := fmt.Sprint(order), fmt.Sprint(wantVisualOrder)
		if got != want {
			t.Errorf("%04X %q:%d: got %v; want %v\ngot order: %s", runes, string(runes), parLevel, got, want, reorder(runes, order))
		}
	})
}
Example #10
0
// TestTables verifies the idna trie against three data files:
// IdnaMappingTable.txt (category, mapped status and mapping of each rune),
// UnicodeData.txt (virama modifier flag) and
// extracted/DerivedJoiningType.txt (joining type of non-mapped runes).
func TestTables(t *testing.T) {
	testtext.SkipIfNotLong(t)

	// lookup returns the trie value for r as an info.
	lookup := func(r rune) info {
		v, _ := trie.lookupString(string(r))
		return info(v)
	}

	ucd.Parse(gen.OpenUnicodeFile("idna", "", "IdnaMappingTable.txt"), func(p *ucd.Parser) {
		r := p.Rune(0)
		x := lookup(r)
		if got, want := x.category(), catFromEntry(p); got != want {
			t.Errorf("%U:category: got %x; want %x", r, got, want)
		}

		// Only these three statuses carry a mapping.
		mapped := false
		switch p.String(1) {
		case "mapped", "disallowed_STD3_mapped", "deviation":
			mapped = true
		}
		if x.isMapped() != mapped {
			t.Errorf("%U:isMapped: got %v; want %v", r, x.isMapped(), mapped)
		}
		if !mapped {
			return
		}
		// Field 2 holds the expected mapping of r.
		want := string(p.Runes(2))
		got := string(x.appendMapping(nil, string(r)))
		if got != want {
			t.Errorf("%U:mapping: got %+q; want %+q", r, got, want)
		}

		// NOTE(review): at this point mapped is true, so the code below only
		// runs when x.isMapped() disagrees with the data file (already
		// reported above). The Mark check is therefore never reached for a
		// consistent table — confirm whether it was meant to run for
		// non-mapped runes.
		if x.isMapped() {
			return
		}
		wantMark := unicode.In(r, unicode.Mark)
		gotMark := x.isModifier()
		if gotMark != wantMark {
			t.Errorf("IsMark(%U) = %v; want %v", r, gotMark, wantMark)
		}
	})

	ucd.Parse(gen.OpenUCDFile("UnicodeData.txt"), func(p *ucd.Parser) {
		r := p.Rune(0)
		x := lookup(r)
		got := x.isViramaModifier()

		// A rune is expected to be a virama modifier exactly when its
		// canonical combining class is 9.
		const cccVirama = 9
		want := p.Int(ucd.CanonicalCombiningClass) == cccVirama
		if got != want {
			t.Errorf("IsVirama(%U) = %v; want %v", r, got, want)
		}
	})

	ucd.Parse(gen.OpenUCDFile("extracted/DerivedJoiningType.txt"), func(p *ucd.Parser) {
		r := p.Rune(0)
		x := lookup(r)
		// The joining type is only checked for runes that are not mapped.
		if x.isMapped() {
			return
		}
		got := x.joinType()
		want := joinType[p.String(1)]
		if got != want {
			t.Errorf("JoinType(%U) = %x; want %x", r, got, want)
		}
	})
}
Example #11
0
// parse opens the named UCD file with gen.OpenUCDFile and passes it to
// ucd.Parse together with f.
func parse(file string, f func(p *ucd.Parser)) {
	data := gen.OpenUCDFile(file)
	ucd.Parse(data, f)
}
Example #12
0
// genTables builds the idna trie and writes it, together with the mappings
// and xorData variables and the Unicode version, to tables.go in package
// idna.
//
// Per-rune attribute bits are first accumulated in runes from UnicodeData.txt
// and extracted/DerivedJoiningType.txt. They are then combined with the
// category from IdnaMappingTable.txt when each rune is inserted in the trie.
func genTables() {
	t := triegen.NewTrie("idna")

	// Modifier bits: a canonical combining class of 9 overwrites the rune's
	// bits with viramaModifier; Marks additionally get the modifier bit.
	ucd.Parse(gen.OpenUCDFile("UnicodeData.txt"), func(p *ucd.Parser) {
		const cccVirama = 9

		r := p.Rune(0)
		if p.Int(ucd.CanonicalCombiningClass) == cccVirama {
			runes[r] = viramaModifier
		}
		if unicode.In(r, unicode.Mark) {
			runes[r] |= modifier
		}
	})

	// Joining type bits, recorded for types L, D, T and R only.
	ucd.Parse(gen.OpenUCDFile("extracted/DerivedJoiningType.txt"), func(p *ucd.Parser) {
		jt := p.String(1)
		if jt == "L" || jt == "D" || jt == "T" || jt == "R" {
			runes[p.Rune(0)] |= joinType[jt] << joinShift
		}
	})

	ucd.Parse(gen.OpenUnicodeFile("idna", "", "IdnaMappingTable.txt"), func(p *ucd.Parser) {
		r := p.Rune(0)

		// The mappings table explicitly defines surrogates as invalid.
		if !utf8.ValidRune(r) {
			return
		}

		cat := catFromEntry(p)
		isMapped := false
		switch cat {
		case mapped, disallowedSTD3Mapped, deviation:
			isMapped = true
		}
		if !isMapped {
			// Only include additional category information for non-mapped
			// runes. The additional information is only used after mapping and
			// the bits would clash with mapping information.
			// TODO: it would be possible to inline this data and avoid
			// additional lookups. This is quite tedious, though, so let's first
			// see if we need this.
			cat |= category(runes[r])
		}

		// A mapping string is only valid for the mapped categories.
		mapping := string(p.Runes(2))
		if mapping != "" && !isMapped {
			log.Fatalf("%U: Mapping with non-mapping category %d", r, cat)
		}
		t.Insert(r, uint64(makeEntry(r, mapping))+uint64(cat))
	})

	w := gen.NewCodeWriter()
	defer w.WriteGoFile("tables.go", "idna")

	gen.WriteUnicodeVersion(w)

	w.WriteVar("mappings", string(mappings))
	w.WriteVar("xorData", string(xorData))

	trieSize, err := t.Gen(w, triegen.Compact(&normCompacter{}))
	if err != nil {
		log.Fatal(err)
	}
	w.Size += trieSize
}
Example #13
0
// main loads the data needed by the PRECIS generator: the set of runes that
// are disallowed outright, the assigned runes for the current Unicode
// version, and the per-rune categories. It then writes the generated tables
// and repackages gen_trieval.go as trieval.go in package precis.
func main() {
	gen.Init()

	// Load data
	var runes []rune
	// collect appends the first rune of every entry of the UCD file whose
	// property field (field 1) satisfies keep.
	collect := func(file string, keep func(prop string) bool) {
		ucd.Parse(gen.OpenUCDFile(file), func(p *ucd.Parser) {
			if keep(p.String(1)) {
				runes = append(runes, p.Rune(0))
			}
		})
	}

	// PrecisIgnorableProperties: https://tools.ietf.org/html/rfc7564#section-9.13
	collect("DerivedCoreProperties.txt", func(prop string) bool {
		return prop == "Default_Ignorable_Code_Point"
	})
	collect("PropList.txt", func(prop string) bool {
		return prop == "Noncharacter_Code_Point"
	})
	// OldHangulJamo: https://tools.ietf.org/html/rfc5892#section-2.9
	collect("HangulSyllableType.txt", func(prop string) bool {
		return prop == "L" || prop == "V" || prop == "T"
	})

	disallowedRunes = rangetable.New(runes...)
	assigned = rangetable.Assigned(unicode.Version)

	// Load category data.
	runeCategory['l'] = latinSmallL
	ucd.Parse(gen.OpenUCDFile("UnicodeData.txt"), func(p *ucd.Parser) {
		const cccVirama = 9
		if p.Int(ucd.CanonicalCombiningClass) != cccVirama {
			return
		}
		setCategory(p.Rune(0), viramaModifier)
	})

	// Scripts that have a category of their own.
	scripts := map[string]category{
		"Greek":    greek,
		"Hebrew":   hebrew,
		"Hiragana": japanese,
		"Katakana": japanese,
		"Han":      japanese,
	}
	ucd.Parse(gen.OpenUCDFile("Scripts.txt"), func(p *ucd.Parser) {
		if c, ok := scripts[p.String(1)]; ok {
			setCategory(p.Rune(0), c)
		}
	})

	// Set the rule categories associated with exceptions. This overrides any
	// previously set categories. The original categories are manually
	// reintroduced in the categoryTransitions table.
	for r, e := range exceptions {
		if e.cat == 0 {
			continue
		}
		runeCategory[r] = e.cat
	}

	// Joining types that have a category; other types are ignored.
	joining := map[string]category{
		"L": joiningL,
		"D": joiningD,
		"T": joiningT,
		"R": joiningR,
	}
	ucd.Parse(gen.OpenUCDFile("extracted/DerivedJoiningType.txt"), func(p *ucd.Parser) {
		if c, ok := joining[p.String(1)]; ok {
			setCategory(p.Rune(0), c)
		}
	})

	writeTables()
	gen.Repackage("gen_trieval.go", "trieval.go", "precis")
}
Example #14
0
// genTables builds the bidi trie and writes it, together with the xorMasks
// variable and the Unicode version, to the file named by the outputFile flag
// in package bidi.
//
// Each trie value is the rune's Bidi class or-ed with its bracket bits. A
// rune is inserted at most once: explicit entries from
// extracted/DerivedBidiClass.txt come first and visitDefaults fills in the
// rest.
func genTables() {
	if numClass > 0x0F {
		log.Fatalf("Too many Class constants (%#x > 0x0F).", numClass)
	}
	w := gen.NewCodeWriter()
	defer w.WriteGoFile(*outputFile, "bidi")

	gen.WriteUnicodeVersion(w)

	t := triegen.NewTrie("bidi")

	// Build data about bracket mapping. These bits need to be or-ed with
	// any other bits.
	bracketBits := map[rune]uint64{}

	// maskIndex maps an xor mask to its position in xorMasks.
	maskIndex := map[rune]int{}
	xorMasks := []rune{0} // First value is no-op.

	ucd.Parse(gen.OpenUCDFile("BidiBrackets.txt"), func(p *ucd.Parser) {
		bracket, paired := p.Rune(0), p.Rune(1)

		// Xor-ing a bracket with the mask yields its paired bracket; each
		// distinct mask is stored once.
		mask := bracket ^ paired
		idx, known := maskIndex[mask]
		if !known {
			idx = len(xorMasks)
			maskIndex[mask] = idx
			xorMasks = append(xorMasks, mask)
		}

		bits := uint64(idx) << xorMaskShift
		switch kind := p.String(2); kind {
		case "o":
			bits |= openMask
		case "c", "n":
			// No extra bits.
		default:
			log.Fatalf("Unknown bracket class %q.", kind)
		}
		bracketBits[bracket] = bits
	})

	w.WriteComment(`
	xorMasks contains masks to be xor-ed with brackets to get the reverse 
	version.`)
	w.WriteVar("xorMasks", xorMasks)

	// inserted records the runes already present in the trie.
	inserted := map[rune]bool{}

	insert := func(r rune, c class) {
		if inserted[r] {
			return
		}
		t.Insert(r, bracketBits[r]|uint64(c))
		inserted[r] = true
	}

	// Insert the derived BiDi properties.
	ucd.Parse(gen.OpenUCDFile("extracted/DerivedBidiClass.txt"), func(p *ucd.Parser) {
		r := p.Rune(0)
		label := p.String(1)
		cls, ok := bidiClass[label]
		if !ok {
			log.Fatalf("%U: Unknown BiDi class %q", r, label)
		}
		insert(r, cls)
	})
	visitDefaults(insert)

	// TODO: use sparse blocks. This would reduce table size considerably
	// from the looks of it.

	trieSize, err := t.Gen(w)
	if err != nil {
		log.Fatal(err)
	}
	w.Size += trieSize
}