Beispiel #1
0
// Ensure that ceratain properties were generated correctly.
func TestTable(t *testing.T) {
	tests := []tableTest{
		tableTest{
			rangetable.Merge(
				unicode.Lt, unicode.Nl, unicode.No, // Other letter digits
				unicode.Me,             // Modifiers
				unicode.Zs,             // Spaces
				unicode.So,             // Symbols
				unicode.Pi, unicode.Pf, // Punctuation
			),
			idDisOrFreePVal,
		},
		tableTest{
			rangetable.New(0x30000, 0x30101, 0xDFFFF),
			unassigned,
		},
	}

	assigned := rangetable.Assigned(UnicodeVersion)

	for _, test := range tests {
		rangetable.Visit(test.rangeTable, func(r rune) {
			if !unicode.In(r, assigned) {
				return
			}
			b := make([]byte, 4)
			n := utf8.EncodeRune(b, r)
			trieval, _ := dpTrie.lookup(b[:n])
			p := entry(trieval).property()
			if p != test.prop && !exceptions.Contains(r) {
				t.Errorf("%U: got %+x; want %+x", r, test.prop, p)
			}
		})
	}
}
Beispiel #2
0
func main() {
	gen.Init()

	versions := getVersions()

	w := &bytes.Buffer{}

	fmt.Fprintf(w, "//go:generate go run gen.go --versions=%s\n\n", strings.Join(versions, ","))
	fmt.Fprintf(w, "import \"unicode\"\n\n")

	vstr := func(s string) string { return strings.Replace(s, ".", "_", -1) }

	fmt.Fprintf(w, "var assigned = map[string]*unicode.RangeTable{\n")
	for _, v := range versions {
		fmt.Fprintf(w, "\t%q: assigned%s,\n", v, vstr(v))
	}
	fmt.Fprintf(w, "}\n\n")

	var size int
	for _, v := range versions {
		assigned := []rune{}

		r := gen.Open("http://www.unicode.org/Public/", "", v+"/ucd/UnicodeData.txt")
		ucd.Parse(r, func(p *ucd.Parser) {
			assigned = append(assigned, p.Rune(0))
		})

		rt := rangetable.New(assigned...)
		sz := int(reflect.TypeOf(unicode.RangeTable{}).Size())
		sz += int(reflect.TypeOf(unicode.Range16{}).Size()) * len(rt.R16)
		sz += int(reflect.TypeOf(unicode.Range32{}).Size()) * len(rt.R32)

		fmt.Fprintf(w, "// size %d bytes (%d KiB)\n", sz, sz/1024)
		fmt.Fprintf(w, "var assigned%s = ", vstr(v))
		print(w, rt)

		size += sz
	}

	fmt.Fprintf(w, "// Total size %d bytes (%d KiB)\n", size, size/1024)

	gen.WriteGoFile("tables.go", "rangetable", w.Bytes())
}
// Ensure that ceratain properties were generated correctly.
func TestTable(t *testing.T) {
	tests := []tableTest{
		tableTest{
			rangetable.Merge(
				unicode.Lt, unicode.Nl, unicode.No, // Other letter digits
				unicode.Me,             // Modifiers
				unicode.Zs,             // Spaces
				unicode.So,             // Symbols
				unicode.Pi, unicode.Pf, // Punctuation
			),
			freePVal | idDis,
		},
		tableTest{
			rangetable.New(0x30000, 0x30101, 0xDFFFF),
			unassigned,
		},
	}

	for _, test := range tests {
		test.run(t)
	}
}
Beispiel #4
0
func main() {
	gen.Init()

	// Load data
	runes := []rune{}
	ucd.Parse(gen.OpenUCDFile("DerivedCoreProperties.txt"), func(p *ucd.Parser) {
		if p.String(1) == "Default_Ignorable_Code_Point" {
			runes = append(runes, p.Rune(0))
		}
	})
	ucd.Parse(gen.OpenUCDFile("HangulSyllableType.txt"), func(p *ucd.Parser) {
		if p.String(1) == "LVT" {
			runes = append(runes, p.Rune(0))
		}
	})

	disallowedRunes = rangetable.New(runes...)
	assigned = rangetable.Assigned(unicode.Version)

	writeTables()
	gen.Repackage("gen_trieval.go", "trieval.go", "precis")
}
Beispiel #5
0
func main() {
	gen.Init()

	// Load data
	runes := []rune{}
	// PrecisIgnorableProperties: https://tools.ietf.org/html/rfc7564#section-9.13
	ucd.Parse(gen.OpenUCDFile("DerivedCoreProperties.txt"), func(p *ucd.Parser) {
		if p.String(1) == "Default_Ignorable_Code_Point" {
			runes = append(runes, p.Rune(0))
		}
	})
	ucd.Parse(gen.OpenUCDFile("PropList.txt"), func(p *ucd.Parser) {
		switch p.String(1) {
		case "Noncharacter_Code_Point":
			runes = append(runes, p.Rune(0))
		}
	})
	// OldHangulJamo: https://tools.ietf.org/html/rfc5892#section-2.9
	ucd.Parse(gen.OpenUCDFile("HangulSyllableType.txt"), func(p *ucd.Parser) {
		switch p.String(1) {
		case "L", "V", "T":
			runes = append(runes, p.Rune(0))
		}
	})

	disallowedRunes = rangetable.New(runes...)
	assigned = rangetable.Assigned(unicode.Version)

	// Load category data.
	runeCategory['l'] = latinSmallL
	ucd.Parse(gen.OpenUCDFile("UnicodeData.txt"), func(p *ucd.Parser) {
		const cccVirama = 9
		if p.Int(ucd.CanonicalCombiningClass) == cccVirama {
			setCategory(p.Rune(0), viramaModifier)
		}
	})
	ucd.Parse(gen.OpenUCDFile("Scripts.txt"), func(p *ucd.Parser) {
		switch p.String(1) {
		case "Greek":
			setCategory(p.Rune(0), greek)
		case "Hebrew":
			setCategory(p.Rune(0), hebrew)
		case "Hiragana", "Katakana", "Han":
			setCategory(p.Rune(0), japanese)
		}
	})

	// Set the rule categories associated with exceptions. This overrides any
	// previously set categories. The original categories are manually
	// reintroduced in the categoryTransitions table.
	for r, e := range exceptions {
		if e.cat != 0 {
			runeCategory[r] = e.cat
		}
	}
	cat := map[string]category{
		"L": joiningL,
		"D": joiningD,
		"T": joiningT,

		"R": joiningR,
	}
	ucd.Parse(gen.OpenUCDFile("extracted/DerivedJoiningType.txt"), func(p *ucd.Parser) {
		switch v := p.String(1); v {
		case "L", "D", "T", "R":
			setCategory(p.Rune(0), cat[v])
		}
	})

	writeTables()
	gen.Repackage("gen_trieval.go", "trieval.go", "precis")
}
	"bytes"
	"unicode"
	"unicode/utf8"

	"github.com/ReanGD/go-web-search/werrors"

	"golang.org/x/net/html"
	"golang.org/x/net/html/atom"
	"golang.org/x/text/unicode/rangetable"
)

var notSeparatorRT = rangetable.New(
	'&', '-', '@', '_', '+', '\'',
	'0', '1', '2', '3', '4', '5', '6', '7', '8', '9',
	'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k',
	'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v',
	'w', 'x', 'y', 'z',
	'а', 'б', 'в', 'г', 'д', 'е', 'ё', 'ж', 'з', 'и', 'й',
	'к', 'л', 'м', 'н', 'о', 'п', 'р', 'с', 'т', 'у', 'ф',
	'х', 'ц', 'ч', 'ш', 'щ', 'ъ', 'ы', 'ь', 'э', 'ю', 'я')

type minificationText struct {
}

func (m *minificationText) isDigit(prev, cur, next rune) bool {
	return (cur == '.' || cur == ',' || cur == ':') &&
		('0' <= prev && prev <= '9') &&
		('0' <= next && next <= '9')
}

func (m *minificationText) processText(in string) string {