Beispiel #1
0
// push pushes the regexp re onto the parse stack and returns the regexp.
func (p *parser) push(re *Regexp) *Regexp {
	if re.Op == OpCharClass && len(re.Rune) == 2 && re.Rune[0] == re.Rune[1] {
		// Single rune.
		if p.maybeConcat(re.Rune[0], p.flags&^FoldCase) {
			return nil
		}
		re.Op = OpLiteral
		re.Rune = re.Rune[:1]
		re.Flags = p.flags &^ FoldCase
	} else if re.Op == OpCharClass && len(re.Rune) == 4 &&
		re.Rune[0] == re.Rune[1] && re.Rune[2] == re.Rune[3] &&
		unicode.SimpleFold(re.Rune[0]) == re.Rune[2] &&
		unicode.SimpleFold(re.Rune[2]) == re.Rune[0] ||
		re.Op == OpCharClass && len(re.Rune) == 2 &&
			re.Rune[0]+1 == re.Rune[1] &&
			unicode.SimpleFold(re.Rune[0]) == re.Rune[1] &&
			unicode.SimpleFold(re.Rune[1]) == re.Rune[0] {
		// Case-insensitive rune like [Aa] or [Δδ].
		if p.maybeConcat(re.Rune[0], p.flags|FoldCase) {
			return nil
		}

		// Rewrite as (case-insensitive) literal.
		re.Op = OpLiteral
		re.Rune = re.Rune[:1]
		re.Flags = p.flags | FoldCase
	} else {
		// Incremental concatenation.
		p.maybeConcat(-1, 0)
	}

	p.stack = append(p.stack, re)
	return re
}
Beispiel #2
0
// appendFoldedRange returns the result of appending the range lo-hi
// and its case folding-equivalent runes to the class r.
func appendFoldedRange(r []int, lo, hi int) []int {
	// Optimizations.
	if lo <= MinFold && hi >= MaxFold {
		// Range is full: folding can't add more.
		return AppendRange(r, lo, hi)
	}
	if hi < MinFold || lo > MaxFold {
		// Range is outside folding possibilities.
		return AppendRange(r, lo, hi)
	}
	if lo < MinFold {
		// [lo, MinFold-1] needs no folding.
		r = AppendRange(r, lo, MinFold-1)
		lo = MinFold
	}
	if hi > MaxFold {
		// [MaxFold+1, hi] needs no folding.
		r = AppendRange(r, MaxFold+1, hi)
		hi = MaxFold
	}

	// Brute force.  Depend on AppendRange to coalesce ranges on the fly.
	for c := lo; c <= hi; c++ {
		r = AppendRange(r, c, c)
		f := unicode.SimpleFold(c)
		for f != c {
			r = AppendRange(r, f, f)
			f = unicode.SimpleFold(f)
		}
	}
	return r
}
Beispiel #3
0
// equalFoldRune compares a and b runes whether they fold equally.
//
// The code comes from strings.EqualFold, but shortened to only one rune.
func equalFoldRune(sr, tr rune) bool {
	if sr == tr {
		return true
	}
	// Make sr < tr to simplify what follows.
	if tr < sr {
		sr, tr = tr, sr
	}
	// Fast check for ASCII.
	if tr < utf8.RuneSelf && 'A' <= sr && sr <= 'Z' {
		// ASCII, and sr is upper case.  tr must be lower case.
		if tr == sr+'a'-'A' {
			return true
		}
		return false
	}

	// General case.  SimpleFold(x) returns the next equivalent rune > x
	// or wraps around to smaller values.
	r := unicode.SimpleFold(sr)
	for r != sr && r < tr {
		r = unicode.SimpleFold(r)
	}
	if r == tr {
		return true
	}
	return false
}
Beispiel #4
0
// appendFoldedRange returns the result of appending the range lo-hi
// and its case folding-equivalent runes to the class r.
func appendFoldedRange(r []rune, lo, hi rune) []rune {
	// Optimizations.
	if lo <= minFold && hi >= maxFold {
		// Range is full: folding can't add more.
		return appendRange(r, lo, hi)
	}
	if hi < minFold || lo > maxFold {
		// Range is outside folding possibilities.
		return appendRange(r, lo, hi)
	}
	if lo < minFold {
		// [lo, minFold-1] needs no folding.
		r = appendRange(r, lo, minFold-1)
		lo = minFold
	}
	if hi > maxFold {
		// [maxFold+1, hi] needs no folding.
		r = appendRange(r, maxFold+1, hi)
		hi = maxFold
	}

	// Brute force.  Depend on appendRange to coalesce ranges on the fly.
	for c := lo; c <= hi; c++ {
		r = appendRange(r, c, c)
		f := unicode.SimpleFold(c)
		for f != c {
			r = appendRange(r, f, f)
			f = unicode.SimpleFold(f)
		}
	}
	return r
}
Beispiel #5
0
// Iterates through all versions (casings etc.) of a rune and compares to the
// other rune. A generalized case insensitive compare.
func EqualRuneFold(a, b rune) float32 {
	if a == b {
		return 0.0
	}
	for c := unicode.SimpleFold(a); c != a; c = unicode.SimpleFold(c) {
		//fmt.Println("Compare", c, "and", b)
		if c == b {
			return 0.0
		}
	}
	return 1.0
}
Beispiel #6
0
// EqualFold reports whether s and t, interpreted as UTF-8 strings,
// are equal under Unicode case-folding.
func EqualFold(s, t string) bool {
	for s != "" && t != "" {
		// Extract first rune from each string.
		var sr, tr rune
		if s[0] < utf8.RuneSelf {
			sr, s = rune(s[0]), s[1:]
		} else {
			r, size := utf8.DecodeRuneInString(s)
			sr, s = r, s[size:]
		}
		if t[0] < utf8.RuneSelf {
			tr, t = rune(t[0]), t[1:]
		} else {
			r, size := utf8.DecodeRuneInString(t)
			tr, t = r, t[size:]
		}

		// If they match, keep going; if not, return false.

		// Easy case.
		if tr == sr {
			continue
		}

		// Make sr < tr to simplify what follows.
		if tr < sr {
			tr, sr = sr, tr
		}
		// Fast check for ASCII.
		if tr < utf8.RuneSelf && 'A' <= sr && sr <= 'Z' {
			// ASCII, and sr is upper case.  tr must be lower case.
			if tr == sr+'a'-'A' {
				continue
			}
			return false
		}

		// General case.  SimpleFold(x) returns the next equivalent rune > x
		// or wraps around to smaller values.
		r := unicode.SimpleFold(sr)
		for r != sr && r < tr {
			r = unicode.SimpleFold(r)
		}
		if r == tr {
			continue
		}
		return false
	}

	// One string is empty.  Are both?
	return s == t
}
Beispiel #7
0
func asciiFold(r rune) bool {
	if r >= utf8.RuneSelf {
		return false
	}
	r1 := unicode.SimpleFold(r)
	if r1 >= utf8.RuneSelf {
		return false
	}
	if r1 == r {
		return true
	}
	return unicode.SimpleFold(r1) == r
}
Beispiel #8
0
// minFoldRune returns the minimum rune fold-equivalent to r.
func minFoldRune(r rune) rune {
	if r < minFold || r > maxFold {
		return r
	}
	min := r
	r0 := r
	for r = unicode.SimpleFold(r); r != r0; r = unicode.SimpleFold(r) {
		if min > r {
			min = r
		}
	}
	return min
}
Beispiel #9
0
func isUpperFold(rune int) bool {
	if unicode.IsUpper(rune) {
		return true
	}
	c := unicode.SimpleFold(rune)
	for c != rune {
		if unicode.IsUpper(c) {
			return true
		}
		c = unicode.SimpleFold(c)
	}
	return false
}
Beispiel #10
0
// minFoldRune returns the minimum rune fold-equivalent to r.
func minFoldRune(r int) int {
	if r < MinFold || r > MaxFold {
		return r
	}
	min := r
	r0 := r
	for r = unicode.SimpleFold(r); r != r0; r = unicode.SimpleFold(r) {
		if min > r {
			min = r
		}
	}
	return min
}
Beispiel #11
0
// rewrite takes a sequence of strings in, adds variants of the these strings
// based on options and removes duplicates.
func (r *rewriter) rewrite(ss []string) []string {
	ns := []string{}
	for _, s := range ss {
		ns = r.insert(ns, s)
		if r.addCases {
			rs := []rune(s)
			rn := rs[0]
			for c := unicode.SimpleFold(rn); c != rn; c = unicode.SimpleFold(c) {
				rs[0] = c
				ns = r.insert(ns, string(rs))
			}
		}
	}
	return ns
}
func TestCaseProperties(t *testing.T) {
	assigned := rangetable.Assigned(UnicodeVersion)
	coreVersion := rangetable.Assigned(unicode.Version)
	for r := rune(0); r <= lastRuneForTesting; r++ {
		if !unicode.In(r, assigned) || !unicode.In(r, coreVersion) {
			continue
		}
		c := contextFromRune(r)
		if got, want := c.info.isCaseIgnorable(), propIgnore(r); got != want {
			t.Errorf("caseIgnorable(%U): got %v; want %v (%x)", r, got, want, c.info)
		}
		// New letters may change case types, but existing case pairings should
		// not change. See Case Pair Stability in
		// http://unicode.org/policies/stability_policy.html.
		if rf := unicode.SimpleFold(r); rf != r && unicode.In(rf, assigned) {
			if got, want := c.info.isCased(), propCased(r); got != want {
				t.Errorf("cased(%U): got %v; want %v (%x)", r, got, want, c.info)
			}
			if got, want := c.caseType() == cUpper, propUpper(r); got != want {
				t.Errorf("upper(%U): got %v; want %v (%x)", r, got, want, c.info)
			}
			if got, want := c.caseType() == cLower, propLower(r); got != want {
				t.Errorf("lower(%U): got %v; want %v (%x)", r, got, want, c.info)
			}
		}
		if got, want := c.info.isBreak(), hasBreakProp(r); got != want {
			t.Errorf("isBreak(%U): got %v; want %v (%x)", r, got, want, c.info)
		}
	}
	// TODO: get title case from unicode file.
}
Beispiel #13
0
func printCaseOrbit() {
	if *test {
		for i := range chars {
			c := &chars[i]
			f := c.caseOrbit
			if f == 0 {
				if c.lowerCase != i && c.lowerCase != 0 {
					f = c.lowerCase
				} else if c.upperCase != i && c.upperCase != 0 {
					f = c.upperCase
				} else {
					f = i
				}
			}
			if g := unicode.SimpleFold(i); g != f {
				fmt.Fprintf(os.Stderr, "unicode.SimpleFold(%#U) = %#U, want %#U\n", i, g, f)
			}
		}
		return
	}

	fmt.Printf("var caseOrbit = []foldPair{\n")
	for i := range chars {
		c := &chars[i]
		if c.caseOrbit != 0 {
			fmt.Printf("\t{0x%04X, 0x%04X},\n", i, c.caseOrbit)
			foldPairCount++
		}
	}
	fmt.Printf("}\n\n")
}
Beispiel #14
0
func printCaseOrbit() {
	if *test {
		for j := range chars {
			i := rune(j)
			c := &chars[i]
			f := c.caseOrbit
			if f == 0 {
				if c.lowerCase != i && c.lowerCase != 0 {
					f = c.lowerCase
				} else if c.upperCase != i && c.upperCase != 0 {
					f = c.upperCase
				} else {
					f = i
				}
			}
			if g := unicode.SimpleFold(i); g != f {
				fmt.Fprintf(os.Stderr, "unicode.SimpleFold(%#U) = %#U, want %#U\n", i, g, f)
			}
		}
		return
	}

	ppt("const fold_pair _case_orbit[] = {\n")
	for i := range chars {
		c := &chars[i]
		if c.caseOrbit != 0 {
			ppt("\t{0x%04X, 0x%04X},\n", i, c.caseOrbit)
			foldPairCount++
		}
	}
	ppt("};\n")
	ppt("const slice<const fold_pair> case_orbit(_case_orbit);\n\n")
}
Beispiel #15
0
func TestCaseProperties(t *testing.T) {
	if unicode.Version != UnicodeVersion {
		t.Skipf("UnicodeVersion=%s, but unicode.Version=%s", UnicodeVersion, unicode.Version)
	}
	assigned := rangetable.Assigned(UnicodeVersion)
	for r := rune(0); r <= lastRuneForTesting; r++ {
		if !unicode.In(r, assigned) || !unicode.In(unicode.SimpleFold(r), assigned) {
			continue
		}
		c := contextFromRune(r)
		if got, want := c.info.isCaseIgnorable(), propIgnore(r); got != want {
			t.Errorf("caseIgnorable(%U): got %v; want %v (%x)", r, got, want, c.info)
		}
		if got, want := c.info.isCased(), propCased(r); got != want {
			t.Errorf("cased(%U): got %v; want %v (%x)", r, got, want, c.info)
		}
		if got, want := c.caseType() == cUpper, propUpper(r); got != want {
			t.Errorf("upper(%U): got %v; want %v (%x)", r, got, want, c.info)
		}
		if got, want := c.caseType() == cLower, propLower(r); got != want {
			t.Errorf("lower(%U): got %v; want %v (%x)", r, got, want, c.info)
		}
		if got, want := c.info.isBreak(), hasBreakProp(r); got != want {
			t.Errorf("isBreak(%U): got %v; want %v (%x)", r, got, want, c.info)
		}
	}
	// TODO: get title case from unicode file.
}
Beispiel #16
0
// toFold returns a string with the property that
//	strings.EqualFold(s, t) iff toFold(s) == toFold(t)
// This lets us test a large set of strings for fold-equivalent
// duplicates without making a quadratic number of calls
// to EqualFold. Note that strings.ToUpper and strings.ToLower
// have the desired property in some corner cases.
func toFold(s string) string {
	// Fast path: all ASCII, no upper case.
	// Most paths look like this already.
	for i := 0; i < len(s); i++ {
		c := s[i]
		if c >= utf8.RuneSelf || 'A' <= c && c <= 'Z' {
			goto Slow
		}
	}
	return s

Slow:
	var buf bytes.Buffer
	for _, r := range s {
		// SimpleFold(x) cycles to the next equivalent rune > x
		// or wraps around to smaller values. Iterate until it wraps,
		// and we've found the minimum value.
		for {
			r0 := r
			r = unicode.SimpleFold(r0)
			if r <= r0 {
				break
			}
		}
		// Exception to allow fast path above: A-Z => a-z
		if 'A' <= r && r <= 'Z' {
			r += 'a' - 'A'
		}
		buf.WriteRune(r)
	}
	return buf.String()
}
Beispiel #17
0
// Fold returns a range containing all the runes from the original range and all the runes that can be obtained from them by using unicode case folding. The original range is not modified.
func Fold(ranges []rune) []rune {
	if len(ranges) == 0 {
		return nil
	}

	rr := make([]rune, len(ranges))
	copy(rr, ranges)
	for i := 0; i < len(ranges); i += 2 {
		for r := ranges[i]; r <= ranges[i+1]; r++ {
			r0 := r
			for r := unicode.SimpleFold(r); r != r0; r = unicode.SimpleFold(r) {
				rr = Add(rr, r)
			}
		}
	}
	return rr
}
Beispiel #18
0
// MatchRunePos checks whether the instruction matches (and consumes) r.
// If so, MatchRunePos returns the index of the matching rune pair
// (or, when len(i.Rune) == 1, rune singleton).
// If not, MatchRunePos returns -1.
// MatchRunePos should only be called when i.Op == InstRune.
func (i *Inst) MatchRunePos(r rune) int {
	rune := i.Rune

	// Special case: single-rune slice is from literal string, not char class.
	if len(rune) == 1 {
		r0 := rune[0]
		if r == r0 {
			return 0
		}
		if Flags(i.Arg)&FoldCase != 0 {
			for r1 := unicode.SimpleFold(r0); r1 != r0; r1 = unicode.SimpleFold(r1) {
				if r == r1 {
					return 0
				}
			}
		}
		return noMatch
	}

	// Peek at the first few pairs.
	// Should handle ASCII well.
	for j := 0; j < len(rune) && j <= 8; j += 2 {
		if r < rune[j] {
			return noMatch
		}
		if r <= rune[j+1] {
			return j / 2
		}
	}

	// Otherwise binary search.
	lo := 0
	hi := len(rune) / 2
	for lo < hi {
		m := lo + (hi-lo)/2
		if c := rune[2*m]; c <= r {
			if r <= rune[2*m+1] {
				return m
			}
			lo = m + 1
		} else {
			hi = m
		}
	}
	return noMatch
}
Beispiel #19
0
Datei: pkg.go Projekt: sreis/go
// simpleFold returns the minimum rune equivalent to r
// under Unicode-defined simple case folding.
func simpleFold(r rune) rune {
	for {
		r1 := unicode.SimpleFold(r)
		if r1 <= r {
			return r1 // wrapped around, found min
		}
		r = r1
	}
}
Beispiel #20
0
func TestMapping(t *testing.T) {
	assigned := rangetable.Assigned(UnicodeVersion)
	coreVersion := rangetable.Assigned(unicode.Version)
	if coreVersion == nil {
		coreVersion = assigned
	}
	apply := func(r rune, f func(c *context) bool) string {
		c := contextFromRune(r)
		f(c)
		return string(c.dst[:c.pDst])
	}

	for r, tt := range special {
		if got, want := apply(r, lower), tt.toLower; got != want {
			t.Errorf("lowerSpecial:(%U): got %+q; want %+q", r, got, want)
		}
		if got, want := apply(r, title), tt.toTitle; got != want {
			t.Errorf("titleSpecial:(%U): got %+q; want %+q", r, got, want)
		}
		if got, want := apply(r, upper), tt.toUpper; got != want {
			t.Errorf("upperSpecial:(%U): got %+q; want %+q", r, got, want)
		}
	}

	for r := rune(0); r <= lastRuneForTesting; r++ {
		if !unicode.In(r, assigned) || !unicode.In(r, coreVersion) {
			continue
		}
		if rf := unicode.SimpleFold(r); rf == r || !unicode.In(rf, assigned) {
			continue
		}
		if _, ok := special[r]; ok {
			continue
		}
		want := string(unicode.ToLower(r))
		if got := apply(r, lower); got != want {
			t.Errorf("lower:%q (%U): got %q %U; want %q %U", r, r, got, []rune(got), want, []rune(want))
		}

		want = string(unicode.ToUpper(r))
		if got := apply(r, upper); got != want {
			t.Errorf("upper:%q (%U): got %q %U; want %q %U", r, r, got, []rune(got), want, []rune(want))
		}

		want = string(unicode.ToTitle(r))
		if got := apply(r, title); got != want {
			t.Errorf("title:%q (%U): got %q %U; want %q %U", r, r, got, []rune(got), want, []rune(want))
		}
	}
}
Beispiel #21
0
func TestFoldConstants(t *testing.T) {
	last := -1
	for i := 0; i <= unicode.MaxRune; i++ {
		if unicode.SimpleFold(i) == i {
			continue
		}
		if last == -1 && minFold != i {
			t.Errorf("minFold=%#U should be %#U", minFold, i)
		}
		last = i
	}
	if maxFold != last {
		t.Errorf("maxFold=%#U should be %#U", maxFold, last)
	}
}
Beispiel #22
0
// ContainsFold is like strings.Contains but uses Unicode case-folding.
func ContainsFold(s, substr string) bool {
	if substr == "" {
		return true
	}
	if s == "" {
		return false
	}
	firstRune := rune(substr[0])
	if firstRune >= utf8.RuneSelf {
		firstRune, _ = utf8.DecodeRuneInString(substr)
	}
	firstLowerRune := unicode.SimpleFold(firstRune)
	for i, rune := range s {
		if len(s)-i < len(substr) {
			return false
		}
		if rune == firstLowerRune || unicode.SimpleFold(rune) == firstLowerRune {
			if HasPrefixFold(s[i:], substr) {
				return true
			}
		}
	}
	return false
}
Beispiel #23
0
// Print the symbol as a suggestion.
func suggestSymbol(tokRoot, tok, name string) (cnt int) {
	// Check if name is exported
	if !unexported {
		i := strings.LastIndex(name, ".")
		r1, _ := utf8.DecodeRuneInString(name)
		r2, _ := utf8.DecodeRuneInString(name[i+1:])
		if !unicode.IsUpper(r1) || !unicode.IsUpper(r2) {
			return 0
		}
	}

	if strings.Index(name, tok) == 0 {
		fmt.Println(tokRoot + name + " ")
	} else if !matchCase {
		// Copied from cmd/doc/pkg.go.
		simpleFold := func(r rune) rune {
			for {
				r1 := unicode.SimpleFold(r)
				if r1 <= r {
					return r1
				}
				r = r1
			}
		}

		// Copied from cmd/doc/pkg.go.
		for _, u := range tok {
			p, w := utf8.DecodeRuneInString(name)
			name = name[w:]
			if u == p {
				continue
			}
			if unicode.IsLower(u) && simpleFold(u) == simpleFold(p) {
				continue
			}
			return 0
		}
		fmt.Println(tokRoot + tok + name + " ")
	}
	return 1
}
Beispiel #24
0
func ExampleSimpleFold() {
	fmt.Printf("%#U\n", unicode.SimpleFold('A'))      // 'a'
	fmt.Printf("%#U\n", unicode.SimpleFold('a'))      // 'A'
	fmt.Printf("%#U\n", unicode.SimpleFold('K'))      // 'k'
	fmt.Printf("%#U\n", unicode.SimpleFold('k'))      // '\u212A' (Kelvin symbol, K)
	fmt.Printf("%#U\n", unicode.SimpleFold('\u212A')) // 'K'
	fmt.Printf("%#U\n", unicode.SimpleFold('1'))      // '1'

	// Output:
	// U+0061 'a'
	// U+0041 'A'
	// U+006B 'k'
	// U+212A 'K'
	// U+004B 'K'
	// U+0031 '1'
}
Beispiel #25
0
func oneByteRange(i *syntax.Inst) (lo, hi byte, fold, ok bool) {
	if i.Op == syntax.InstRune1 {
		r := i.Rune[0]
		if r < utf8.RuneSelf {
			return byte(r), byte(r), false, true
		}
	}
	if i.Op != syntax.InstRune {
		return
	}
	fold = syntax.Flags(i.Arg)&syntax.FoldCase != 0
	if len(i.Rune) == 1 || len(i.Rune) == 2 && i.Rune[0] == i.Rune[1] {
		r := i.Rune[0]
		if r >= utf8.RuneSelf {
			return
		}
		if fold && !asciiFold(r) {
			return
		}
		return byte(r), byte(r), fold, true
	}
	if len(i.Rune) == 2 && i.Rune[1] < utf8.RuneSelf {
		if fold {
			for r := i.Rune[0]; r <= i.Rune[1]; r++ {
				if asciiFold(r) {
					return
				}
			}
		}
		return byte(i.Rune[0]), byte(i.Rune[1]), fold, true
	}
	if len(i.Rune) == 4 && i.Rune[0] == i.Rune[1] && i.Rune[2] == i.Rune[3] && unicode.SimpleFold(i.Rune[0]) == i.Rune[2] && unicode.SimpleFold(i.Rune[2]) == i.Rune[0] {
		return byte(i.Rune[0]), byte(i.Rune[0]), true, true
	}

	return
}
Beispiel #26
0
func (c *compiler) rune(r []rune, flags Flags) frag {
	f := c.inst(InstRune)
	i := &c.p.Inst[f.i]
	i.Rune = r
	flags &= FoldCase // only relevant flag is FoldCase
	if len(r) != 1 || unicode.SimpleFold(r[0]) == r[0] {
		// and sometimes not even that
		flags &^= FoldCase
	}
	i.Arg = uint32(flags)
	f.out = patchList(f.i << 1)

	// Special cases for exec machine.
	switch {
	case flags&FoldCase == 0 && (len(r) == 1 || len(r) == 2 && r[0] == r[1]):
		i.Op = InstRune1
	case len(r) == 2 && r[0] == 0 && r[1] == unicode.MaxRune:
		i.Op = InstRuneAny
	case len(r) == 4 && r[0] == 0 && r[1] == '\n'-1 && r[2] == '\n'+1 && r[3] == unicode.MaxRune:
		i.Op = InstRuneAnyNotNL
	}

	return f
}
Beispiel #27
0
// dumpRegexp writes an encoding of the syntax tree for the regexp re to b.
// It is used during testing to distinguish between parses that might print
// the same using re's String method.
func dumpRegexp(b *bytes.Buffer, re *Regexp) {
	if int(re.Op) >= len(opNames) || opNames[re.Op] == "" {
		fmt.Fprintf(b, "op%d", re.Op)
	} else {
		switch re.Op {
		default:
			b.WriteString(opNames[re.Op])
		case OpStar, OpPlus, OpQuest, OpRepeat:
			if re.Flags&NonGreedy != 0 {
				b.WriteByte('n')
			}
			b.WriteString(opNames[re.Op])
		case OpLiteral:
			if len(re.Rune) > 1 {
				b.WriteString("str")
			} else {
				b.WriteString("lit")
			}
			if re.Flags&FoldCase != 0 {
				for _, r := range re.Rune {
					if unicode.SimpleFold(r) != r {
						b.WriteString("fold")
						break
					}
				}
			}
		}
	}
	b.WriteByte('{')
	switch re.Op {
	case OpEndText:
		if re.Flags&WasDollar == 0 {
			b.WriteString(`\z`)
		}
	case OpLiteral:
		for _, r := range re.Rune {
			b.WriteRune(r)
		}
	case OpConcat, OpAlternate:
		for _, sub := range re.Sub {
			dumpRegexp(b, sub)
		}
	case OpStar, OpPlus, OpQuest:
		dumpRegexp(b, re.Sub[0])
	case OpRepeat:
		fmt.Fprintf(b, "%d,%d ", re.Min, re.Max)
		dumpRegexp(b, re.Sub[0])
	case OpCapture:
		if re.Name != "" {
			b.WriteString(re.Name)
			b.WriteByte(':')
		}
		dumpRegexp(b, re.Sub[0])
	case OpCharClass:
		sep := ""
		for i := 0; i < len(re.Rune); i += 2 {
			b.WriteString(sep)
			sep = " "
			lo, hi := re.Rune[i], re.Rune[i+1]
			if lo == hi {
				fmt.Fprintf(b, "%#x", lo)
			} else {
				fmt.Fprintf(b, "%#x-%#x", lo, hi)
			}
		}
	}
	b.WriteByte('}')
}
Beispiel #28
0
// makeOnePass creates a onepass Prog, if possible. It is possible if at any alt,
// the match engine can always tell which branch to take. The routine may modify
// p if it is turned into a onepass Prog. If it isn't possible for this to be a
// onepass Prog, the Prog syntax.NotOnePass is returned. makeOnePass is resursive
// to the size of the Prog
func (p *Prog) makeOnePass() *Prog {
	var (
		instQueue    = newQueue(len(p.Inst))
		visitQueue   = newQueue(len(p.Inst))
		build        func(uint32, *queue)
		check        func(uint32, map[uint32]bool) bool
		onePassRunes = make([][]rune, len(p.Inst))
	)
	build = func(pc uint32, q *queue) {
		if q.contains(pc) {
			return
		}
		inst := p.Inst[pc]
		switch inst.Op {
		case InstAlt, InstAltMatch:
			q.insert(inst.Out)
			build(inst.Out, q)
			q.insert(inst.Arg)
		case InstMatch, InstFail:
		default:
			q.insert(inst.Out)
		}
	}

	// check that paths from Alt instructions are unambiguous, and rebuild the new
	// program as a onepass program
	check = func(pc uint32, m map[uint32]bool) (ok bool) {
		ok = true
		inst := &p.Inst[pc]
		if visitQueue.contains(pc) {
			return
		}
		visitQueue.insert(pc)
		switch inst.Op {
		case InstAlt, InstAltMatch:
			ok = check(inst.Out, m) && check(inst.Arg, m)
			// check no-input paths to InstMatch
			matchOut := m[inst.Out]
			matchArg := m[inst.Arg]
			if matchOut && matchArg {
				ok = false
				break
			}
			// Match on empty goes in inst.Out
			if matchArg {
				inst.Out, inst.Arg = inst.Arg, inst.Out
				matchOut, matchArg = matchArg, matchOut
			}
			if matchOut {
				m[pc] = true
				inst.Op = InstAltMatch
			}

			// build a dispatch operator from the two legs of the alt.
			onePassRunes[pc], inst.Next = mergeRuneSets(
				&onePassRunes[inst.Out], &onePassRunes[inst.Arg], inst.Out, inst.Arg)
			if len(inst.Next) > 0 && inst.Next[0] == mergeFailed {
				ok = false
				break
			}
		case InstCapture, InstNop:
			ok = check(inst.Out, m)
			m[pc] = m[inst.Out]
			// pass matching runes back through these no-ops.
			onePassRunes[pc] = append([]rune{}[:], onePassRunes[inst.Out][:]...)
			inst.Next = []uint32{}
			for i := len(onePassRunes[pc]) / 2; i >= 0; i-- {
				inst.Next = append(inst.Next, inst.Out)
			}
		case InstEmptyWidth:
			ok = check(inst.Out, m)
			m[pc] = m[inst.Out]
			onePassRunes[pc] = append([]rune{}[:], onePassRunes[inst.Out][:]...)
			inst.Next = []uint32{}
			for i := len(onePassRunes[pc]) / 2; i >= 0; i-- {
				inst.Next = append(inst.Next, inst.Out)
			}
		case InstMatch, InstFail:
			m[pc] = inst.Op == InstMatch
			break
		case InstRune:
			ok = check(inst.Out, m)
			m[pc] = false
			if len(inst.Next) > 0 {
				break
			}
			if len(inst.Rune) == 0 {
				onePassRunes[pc] = []rune{}[:]
				inst.Next = []uint32{inst.Out}
				break
			}
			runes := make([]rune, 0)
			if len(inst.Rune) == 1 && Flags(inst.Arg)&FoldCase != 0 {
				r0 := inst.Rune[0]
				runes = append(runes, r0, r0)
				for r1 := unicode.SimpleFold(r0); r1 != r0; r1 = unicode.SimpleFold(r1) {
					runes = append(runes, r1, r1)
				}
				sort.Sort(runeSlice(runes))
			} else {
				runes = append(runes, inst.Rune...)
			}
			onePassRunes[pc] = runes
			inst.Next = []uint32{}
			for i := len(onePassRunes[pc]) / 2; i >= 0; i-- {
				inst.Next = append(inst.Next, inst.Out)
			}
			inst.Op = InstRune
		case InstRune1:
			ok = check(inst.Out, m)
			m[pc] = false
			if len(inst.Next) > 0 {
				break
			}
			runes := []rune{}[:]
			// expand case-folded runes
			if Flags(inst.Arg)&FoldCase != 0 {
				r0 := inst.Rune[0]
				runes = append(runes, r0, r0)
				for r1 := unicode.SimpleFold(r0); r1 != r0; r1 = unicode.SimpleFold(r1) {
					runes = append(runes, r1, r1)
				}
				sort.Sort(runeSlice(runes))
			} else {
				runes = append(runes, inst.Rune[0], inst.Rune[0])
			}
			onePassRunes[pc] = runes
			inst.Next = []uint32{}
			for i := len(onePassRunes[pc]) / 2; i >= 0; i-- {
				inst.Next = append(inst.Next, inst.Out)
			}
			inst.Op = InstRune
		case InstRuneAny:
			ok = check(inst.Out, m)
			m[pc] = false
			if len(inst.Next) > 0 {
				break
			}
			onePassRunes[pc] = append([]rune{}[:], anyRune[:]...)
			inst.Next = []uint32{inst.Out}
		case InstRuneAnyNotNL:
			ok = check(inst.Out, m)
			m[pc] = false
			if len(inst.Next) > 0 {
				break
			}
			onePassRunes[pc] = append([]rune{}[:], anyRuneNotNL[:]...)
			inst.Next = []uint32{}
			for i := len(onePassRunes[pc]) / 2; i >= 0; i-- {
				inst.Next = append(inst.Next, inst.Out)
			}
		}
		return
	}

	instQueue.clear()
	instQueue.insert(uint32(p.Start))
	m := make(map[uint32]bool, len(p.Inst))
	for !instQueue.empty() {
		pc := instQueue.next()
		inst := p.Inst[pc]
		visitQueue.clear()
		if !check(uint32(pc), m) {
			p = NotOnePass
			break
		}
		switch inst.Op {
		case InstAlt, InstAltMatch:
			instQueue.insert(inst.Out)
			instQueue.insert(inst.Arg)
		case InstCapture, InstEmptyWidth, InstNop:
			instQueue.insert(inst.Out)
		case InstMatch:
		case InstFail:
		case InstRune, InstRune1, InstRuneAny, InstRuneAnyNotNL:
		default:
		}
	}
	if p != NotOnePass {
		for i, _ := range p.Inst {
			p.Inst[i].Rune = onePassRunes[i][:]
		}
	}
	return p
}
Beispiel #29
0
// makeOnePass creates a onepass Prog, if possible. It is possible if at any alt,
// the match engine can always tell which branch to take. The routine may modify
// p if it is turned into a onepass Prog. If it isn't possible for this to be a
// onepass Prog, the Prog notOnePass is returned. makeOnePass is recursive
// to the size of the Prog.
func makeOnePass(p *onePassProg) *onePassProg {
	// If the machine is very long, it's not worth the time to check if we can use one pass.
	if len(p.Inst) >= 1000 {
		return notOnePass
	}

	var (
		instQueue    = newQueue(len(p.Inst))
		visitQueue   = newQueue(len(p.Inst))
		check        func(uint32, map[uint32]bool) bool
		onePassRunes = make([][]rune, len(p.Inst))
	)

	// check that paths from Alt instructions are unambiguous, and rebuild the new
	// program as a onepass program
	check = func(pc uint32, m map[uint32]bool) (ok bool) {
		ok = true
		inst := &p.Inst[pc]
		if visitQueue.contains(pc) {
			return
		}
		visitQueue.insert(pc)
		switch inst.Op {
		case syntax.InstAlt, syntax.InstAltMatch:
			ok = check(inst.Out, m) && check(inst.Arg, m)
			// check no-input paths to InstMatch
			matchOut := m[inst.Out]
			matchArg := m[inst.Arg]
			if matchOut && matchArg {
				ok = false
				break
			}
			// Match on empty goes in inst.Out
			if matchArg {
				inst.Out, inst.Arg = inst.Arg, inst.Out
				matchOut, matchArg = matchArg, matchOut
			}
			if matchOut {
				m[pc] = true
				inst.Op = syntax.InstAltMatch
			}

			// build a dispatch operator from the two legs of the alt.
			onePassRunes[pc], inst.Next = mergeRuneSets(
				&onePassRunes[inst.Out], &onePassRunes[inst.Arg], inst.Out, inst.Arg)
			if len(inst.Next) > 0 && inst.Next[0] == mergeFailed {
				ok = false
				break
			}
		case syntax.InstCapture, syntax.InstNop:
			ok = check(inst.Out, m)
			m[pc] = m[inst.Out]
			// pass matching runes back through these no-ops.
			onePassRunes[pc] = append([]rune{}, onePassRunes[inst.Out]...)
			inst.Next = []uint32{}
			for i := len(onePassRunes[pc]) / 2; i >= 0; i-- {
				inst.Next = append(inst.Next, inst.Out)
			}
		case syntax.InstEmptyWidth:
			ok = check(inst.Out, m)
			m[pc] = m[inst.Out]
			onePassRunes[pc] = append([]rune{}, onePassRunes[inst.Out]...)
			inst.Next = []uint32{}
			for i := len(onePassRunes[pc]) / 2; i >= 0; i-- {
				inst.Next = append(inst.Next, inst.Out)
			}
		case syntax.InstMatch, syntax.InstFail:
			m[pc] = inst.Op == syntax.InstMatch
			break
		case syntax.InstRune:
			m[pc] = false
			if len(inst.Next) > 0 {
				break
			}
			instQueue.insert(inst.Out)
			if len(inst.Rune) == 0 {
				onePassRunes[pc] = []rune{}
				inst.Next = []uint32{inst.Out}
				break
			}
			runes := make([]rune, 0)
			if len(inst.Rune) == 1 && syntax.Flags(inst.Arg)&syntax.FoldCase != 0 {
				r0 := inst.Rune[0]
				runes = append(runes, r0, r0)
				for r1 := unicode.SimpleFold(r0); r1 != r0; r1 = unicode.SimpleFold(r1) {
					runes = append(runes, r1, r1)
				}
				sort.Sort(runeSlice(runes))
			} else {
				runes = append(runes, inst.Rune...)
			}
			onePassRunes[pc] = runes
			inst.Next = []uint32{}
			for i := len(onePassRunes[pc]) / 2; i >= 0; i-- {
				inst.Next = append(inst.Next, inst.Out)
			}
			inst.Op = syntax.InstRune
		case syntax.InstRune1:
			m[pc] = false
			if len(inst.Next) > 0 {
				break
			}
			instQueue.insert(inst.Out)
			runes := []rune{}
			// expand case-folded runes
			if syntax.Flags(inst.Arg)&syntax.FoldCase != 0 {
				r0 := inst.Rune[0]
				runes = append(runes, r0, r0)
				for r1 := unicode.SimpleFold(r0); r1 != r0; r1 = unicode.SimpleFold(r1) {
					runes = append(runes, r1, r1)
				}
				sort.Sort(runeSlice(runes))
			} else {
				runes = append(runes, inst.Rune[0], inst.Rune[0])
			}
			onePassRunes[pc] = runes
			inst.Next = []uint32{}
			for i := len(onePassRunes[pc]) / 2; i >= 0; i-- {
				inst.Next = append(inst.Next, inst.Out)
			}
			inst.Op = syntax.InstRune
		case syntax.InstRuneAny:
			m[pc] = false
			if len(inst.Next) > 0 {
				break
			}
			instQueue.insert(inst.Out)
			onePassRunes[pc] = append([]rune{}, anyRune...)
			inst.Next = []uint32{inst.Out}
		case syntax.InstRuneAnyNotNL:
			m[pc] = false
			if len(inst.Next) > 0 {
				break
			}
			instQueue.insert(inst.Out)
			onePassRunes[pc] = append([]rune{}, anyRuneNotNL...)
			inst.Next = []uint32{}
			for i := len(onePassRunes[pc]) / 2; i >= 0; i-- {
				inst.Next = append(inst.Next, inst.Out)
			}
		}
		return
	}

	instQueue.clear()
	instQueue.insert(uint32(p.Start))
	m := make(map[uint32]bool, len(p.Inst))
	for !instQueue.empty() {
		visitQueue.clear()
		pc := instQueue.next()
		if !check(uint32(pc), m) {
			p = notOnePass
			break
		}
	}
	if p != notOnePass {
		for i := range p.Inst {
			p.Inst[i].Rune = onePassRunes[i]
		}
	}
	return p
}
Beispiel #30
0
// analyze returns the regexpInfo for the regexp re.
func analyze(re *syntax.Regexp) (ret regexpInfo) {
	var info regexpInfo
	switch re.Op {
	case syntax.OpNoMatch:
		return noMatch()

	case syntax.OpEmptyMatch,
		syntax.OpBeginLine, syntax.OpEndLine,
		syntax.OpBeginText, syntax.OpEndText,
		syntax.OpWordBoundary, syntax.OpNoWordBoundary:
		return emptyString()

	case syntax.OpLiteral:
		if re.Flags&syntax.FoldCase != 0 {
			switch len(re.Rune) {
			case 0:
				return emptyString()
			case 1:
				// Single-letter case-folded string:
				// rewrite into char class and analyze.
				re1 := &syntax.Regexp{
					Op: syntax.OpCharClass,
				}
				re1.Rune = re1.Rune0[:0]
				r0 := re.Rune[0]
				re1.Rune = append(re1.Rune, r0, r0)
				for r1 := unicode.SimpleFold(r0); r1 != r0; r1 = unicode.SimpleFold(r1) {
					re1.Rune = append(re1.Rune, r1, r1)
				}
				info = analyze(re1)
				return info
			}
			// Multi-letter case-folded string:
			// treat as concatenation of single-letter case-folded strings.
			re1 := &syntax.Regexp{
				Op:    syntax.OpLiteral,
				Flags: syntax.FoldCase,
			}
			info = emptyString()
			for i := range re.Rune {
				re1.Rune = re.Rune[i : i+1]
				info = concat(info, analyze(re1))
			}
			return info
		}
		info.exact = stringSet{string(re.Rune)}

	case syntax.OpAnyCharNotNL, syntax.OpAnyChar:
		return anyChar()

	case syntax.OpCapture:
		return analyze(re.Sub[0])

	case syntax.OpConcat:
		return fold(concat, re.Sub, emptyString())

	case syntax.OpAlternate:
		return fold(alternate, re.Sub, noMatch())

	case syntax.OpQuest:
		return alternate(analyze(re.Sub[0]), emptyString())

	case syntax.OpStar:
		// We don't know anything, so assume the worst.
		return anyMatch()

	case syntax.OpRepeat:
		if re.Min == 0 {
			// Like OpStar
			return anyMatch()
		}
		fallthrough
	case syntax.OpPlus:
		// x+
		// Since there has to be at least one x, the prefixes and suffixes
		// stay the same.  If x was exact, it isn't anymore.
		info = analyze(re.Sub[0])
		if info.exact.have() {
			info.prefix = info.exact
			info.suffix = info.exact.copy()
			info.exact = nil
		}

	case syntax.OpCharClass:
		// Special case.
		if len(re.Rune) == 0 {
			return noMatch()
		}

		// Special case.
		if len(re.Rune) == 1 {
			info.exact = stringSet{string(re.Rune[0])}
			break
		}

		n := 0
		for i := 0; i < len(re.Rune); i += 2 {
			n += int(re.Rune[i+1] - re.Rune[i])
		}
		// If the class is too large, it's okay to overestimate.
		if n > 100 {
			return anyChar()
		}

		info.exact = []string{}
		for i := 0; i < len(re.Rune); i += 2 {
			lo, hi := re.Rune[i], re.Rune[i+1]
			for rr := lo; rr <= hi; rr++ {
				info.exact.add(string(rr))
			}
		}
	}

	info.simplify(false)
	return info
}