// push pushes the regexp re onto the parse stack and returns the regexp. func (p *parser) push(re *Regexp) *Regexp { if re.Op == OpCharClass && len(re.Rune) == 2 && re.Rune[0] == re.Rune[1] { // Single rune. if p.maybeConcat(re.Rune[0], p.flags&^FoldCase) { return nil } re.Op = OpLiteral re.Rune = re.Rune[:1] re.Flags = p.flags &^ FoldCase } else if re.Op == OpCharClass && len(re.Rune) == 4 && re.Rune[0] == re.Rune[1] && re.Rune[2] == re.Rune[3] && unicode.SimpleFold(re.Rune[0]) == re.Rune[2] && unicode.SimpleFold(re.Rune[2]) == re.Rune[0] || re.Op == OpCharClass && len(re.Rune) == 2 && re.Rune[0]+1 == re.Rune[1] && unicode.SimpleFold(re.Rune[0]) == re.Rune[1] && unicode.SimpleFold(re.Rune[1]) == re.Rune[0] { // Case-insensitive rune like [Aa] or [Δδ]. if p.maybeConcat(re.Rune[0], p.flags|FoldCase) { return nil } // Rewrite as (case-insensitive) literal. re.Op = OpLiteral re.Rune = re.Rune[:1] re.Flags = p.flags | FoldCase } else { // Incremental concatenation. p.maybeConcat(-1, 0) } p.stack = append(p.stack, re) return re }
// appendFoldedRange returns the result of appending the range lo-hi
// and its case folding-equivalent runes to the class r.
//
// Runes outside [MinFold, MaxFold] are appended as plain ranges; only the
// part of lo-hi that overlaps [MinFold, MaxFold] is expanded rune by rune.
func appendFoldedRange(r []int, lo, hi int) []int {
	// Optimizations.
	if lo <= MinFold && hi >= MaxFold {
		// Range is full: folding can't add more.
		return AppendRange(r, lo, hi)
	}
	if hi < MinFold || lo > MaxFold {
		// Range is outside folding possibilities.
		return AppendRange(r, lo, hi)
	}
	if lo < MinFold {
		// [lo, MinFold-1] needs no folding.
		r = AppendRange(r, lo, MinFold-1)
		lo = MinFold
	}
	if hi > MaxFold {
		// [MaxFold+1, hi] needs no folding.
		r = AppendRange(r, MaxFold+1, hi)
		hi = MaxFold
	}
	// Brute force. Depend on AppendRange to coalesce ranges on the fly.
	for c := lo; c <= hi; c++ {
		r = AppendRange(r, c, c)
		// Walk the rest of c's fold orbit: keep calling SimpleFold until it
		// comes back around to c, appending each rune seen on the way.
		f := unicode.SimpleFold(c)
		for f != c {
			r = AppendRange(r, f, f)
			f = unicode.SimpleFold(f)
		}
	}
	return r
}
// equalFoldRune reports whether the runes sr and tr are equal under Unicode
// simple case folding.
//
// The code comes from strings.EqualFold, but shortened to only one rune.
func equalFoldRune(sr, tr rune) bool {
	if sr == tr {
		return true
	}

	// Order the pair so that lo < hi; everything below relies on it.
	lo, hi := sr, tr
	if hi < lo {
		lo, hi = hi, lo
	}

	// ASCII fast path: when both are ASCII and lo is upper case, the only
	// possible partner is lo's lower-case form.
	if hi < utf8.RuneSelf && 'A' <= lo && lo <= 'Z' {
		return hi == lo+'a'-'A'
	}

	// General case. SimpleFold(x) returns the next equivalent rune > x
	// or wraps around to smaller values, so step through lo's orbit until
	// it wraps or reaches/passes hi.
	f := unicode.SimpleFold(lo)
	for f != lo && f < hi {
		f = unicode.SimpleFold(f)
	}
	return f == hi
}
// appendFoldedRange returns the result of appending the range lo-hi // and its case folding-equivalent runes to the class r. func appendFoldedRange(r []rune, lo, hi rune) []rune { // Optimizations. if lo <= minFold && hi >= maxFold { // Range is full: folding can't add more. return appendRange(r, lo, hi) } if hi < minFold || lo > maxFold { // Range is outside folding possibilities. return appendRange(r, lo, hi) } if lo < minFold { // [lo, minFold-1] needs no folding. r = appendRange(r, lo, minFold-1) lo = minFold } if hi > maxFold { // [maxFold+1, hi] needs no folding. r = appendRange(r, maxFold+1, hi) hi = maxFold } // Brute force. Depend on appendRange to coalesce ranges on the fly. for c := lo; c <= hi; c++ { r = appendRange(r, c, c) f := unicode.SimpleFold(c) for f != c { r = appendRange(r, f, f) f = unicode.SimpleFold(f) } } return r }
// EqualRuneFold is a generalized case-insensitive rune comparison. It walks
// every rune in a's simple-fold orbit (all casings etc. of a) and compares
// each to b. It returns 0.0 when b equals a or one of those variants, and 1.0
// otherwise.
func EqualRuneFold(a, b rune) float32 {
	for c := a; ; {
		if c == b {
			return 0.0
		}
		c = unicode.SimpleFold(c)
		if c == a {
			// Back at the start of the orbit without meeting b.
			return 1.0
		}
	}
}
// EqualFold reports whether s and t, interpreted as UTF-8 strings,
// are equal under Unicode case-folding.
func EqualFold(s, t string) bool {
	for len(s) > 0 && len(t) > 0 {
		// Pull the leading rune off each string, taking the single-byte
		// shortcut for ASCII.
		a, an := rune(s[0]), 1
		if a >= utf8.RuneSelf {
			a, an = utf8.DecodeRuneInString(s)
		}
		b, bn := rune(t[0]), 1
		if b >= utf8.RuneSelf {
			b, bn = utf8.DecodeRuneInString(t)
		}
		s, t = s[an:], t[bn:]

		// Easy case.
		if a == b {
			continue
		}

		// Order the pair so that lo < hi to simplify what follows.
		lo, hi := a, b
		if hi < lo {
			lo, hi = hi, lo
		}

		// Fast check for ASCII: if lo is upper case, hi must be its
		// lower-case form.
		if hi < utf8.RuneSelf && 'A' <= lo && lo <= 'Z' {
			if hi != lo+'a'-'A' {
				return false
			}
			continue
		}

		// General case. SimpleFold(x) returns the next equivalent rune > x
		// or wraps around to smaller values.
		f := unicode.SimpleFold(lo)
		for f != lo && f < hi {
			f = unicode.SimpleFold(f)
		}
		if f != hi {
			return false
		}
	}

	// One string is empty. Are both?
	return s == t
}
// asciiFold reports whether r is below utf8.RuneSelf and its simple-fold
// orbit stays there: either r folds to itself, or it folds to another rune
// below utf8.RuneSelf that folds straight back to r.
func asciiFold(r rune) bool {
	if r >= utf8.RuneSelf {
		return false
	}
	switch next := unicode.SimpleFold(r); {
	case next >= utf8.RuneSelf:
		// e.g. 'k' folds to U+212A.
		return false
	case next == r:
		// No case variants at all.
		return true
	default:
		// A two-rune orbit must close immediately; otherwise a third,
		// not yet inspected rune is involved.
		return unicode.SimpleFold(next) == r
	}
}
// minFoldRune returns the minimum rune fold-equivalent to r. func minFoldRune(r rune) rune { if r < minFold || r > maxFold { return r } min := r r0 := r for r = unicode.SimpleFold(r); r != r0; r = unicode.SimpleFold(r) { if min > r { min = r } } return min }
// isUpperFold reports whether rune, or any rune reached from it by repeated
// unicode.SimpleFold calls, satisfies unicode.IsUpper.
func isUpperFold(rune int) bool {
	if unicode.IsUpper(rune) {
		return true
	}
	// Check the remaining members of the fold orbit; the loop stops when
	// SimpleFold comes back around to the starting rune.
	c := unicode.SimpleFold(rune)
	for c != rune {
		if unicode.IsUpper(c) {
			return true
		}
		c = unicode.SimpleFold(c)
	}
	return false
}
// minFoldRune returns the minimum rune fold-equivalent to r.
// Runes outside [MinFold, MaxFold] are returned unchanged.
func minFoldRune(r int) int {
	if r < MinFold || r > MaxFold {
		return r
	}
	min := r
	// r0 remembers the starting rune so the orbit walk knows when it has
	// come full circle; r itself is reused as the loop cursor.
	r0 := r
	for r = unicode.SimpleFold(r); r != r0; r = unicode.SimpleFold(r) {
		if min > r {
			min = r
		}
	}
	return min
}
// rewrite takes a sequence of strings in, adds variants of the these strings // based on options and removes duplicates. func (r *rewriter) rewrite(ss []string) []string { ns := []string{} for _, s := range ss { ns = r.insert(ns, s) if r.addCases { rs := []rune(s) rn := rs[0] for c := unicode.SimpleFold(rn); c != rn; c = unicode.SimpleFold(c) { rs[0] = c ns = r.insert(ns, string(rs)) } } } return ns }
// TestCaseProperties compares the case properties stored in the context info
// of each rune up to lastRuneForTesting with the reference predicates
// propIgnore, propCased, propUpper, propLower and hasBreakProp.
func TestCaseProperties(t *testing.T) {
	assigned := rangetable.Assigned(UnicodeVersion)
	coreVersion := rangetable.Assigned(unicode.Version)
	for r := rune(0); r <= lastRuneForTesting; r++ {
		// Only runes assigned in both UnicodeVersion and the unicode
		// package's own version are checked.
		if !unicode.In(r, assigned) || !unicode.In(r, coreVersion) {
			continue
		}
		c := contextFromRune(r)
		if got, want := c.info.isCaseIgnorable(), propIgnore(r); got != want {
			t.Errorf("caseIgnorable(%U): got %v; want %v (%x)", r, got, want, c.info)
		}
		// New letters may change case types, but existing case pairings should
		// not change. See Case Pair Stability in
		// http://unicode.org/policies/stability_policy.html.
		// The cased/upper/lower checks therefore run only for runes that
		// fold to a different rune which is itself assigned.
		if rf := unicode.SimpleFold(r); rf != r && unicode.In(rf, assigned) {
			if got, want := c.info.isCased(), propCased(r); got != want {
				t.Errorf("cased(%U): got %v; want %v (%x)", r, got, want, c.info)
			}
			if got, want := c.caseType() == cUpper, propUpper(r); got != want {
				t.Errorf("upper(%U): got %v; want %v (%x)", r, got, want, c.info)
			}
			if got, want := c.caseType() == cLower, propLower(r); got != want {
				t.Errorf("lower(%U): got %v; want %v (%x)", r, got, want, c.info)
			}
		}
		if got, want := c.info.isBreak(), hasBreakProp(r); got != want {
			t.Errorf("isBreak(%U): got %v; want %v (%x)", r, got, want, c.info)
		}
	}
	// TODO: get title case from unicode file.
}
// printCaseOrbit either verifies or emits the case-orbit table built from
// chars.
//
// When the test flag is set it checks, for every index i of chars, that
// unicode.SimpleFold(i) agrees with the expected next rune derived from the
// entry, reporting mismatches on standard error, and emits nothing.
// Otherwise it prints a Go "var caseOrbit = []foldPair{...}" literal holding
// one pair per entry with a non-zero caseOrbit and counts those pairs in
// foldPairCount.
func printCaseOrbit() {
	if *test {
		for i := range chars {
			c := &chars[i]
			// Expected fold target: the explicit orbit entry if present,
			// else a differing non-zero lower-case mapping, else a
			// differing non-zero upper-case mapping, else the rune itself.
			f := c.caseOrbit
			if f == 0 {
				if c.lowerCase != i && c.lowerCase != 0 {
					f = c.lowerCase
				} else if c.upperCase != i && c.upperCase != 0 {
					f = c.upperCase
				} else {
					f = i
				}
			}
			if g := unicode.SimpleFold(i); g != f {
				fmt.Fprintf(os.Stderr, "unicode.SimpleFold(%#U) = %#U, want %#U\n", i, g, f)
			}
		}
		return
	}
	fmt.Printf("var caseOrbit = []foldPair{\n")
	for i := range chars {
		c := &chars[i]
		if c.caseOrbit != 0 {
			fmt.Printf("\t{0x%04X, 0x%04X},\n", i, c.caseOrbit)
			foldPairCount++
		}
	}
	fmt.Printf("}\n\n")
}
func printCaseOrbit() { if *test { for j := range chars { i := rune(j) c := &chars[i] f := c.caseOrbit if f == 0 { if c.lowerCase != i && c.lowerCase != 0 { f = c.lowerCase } else if c.upperCase != i && c.upperCase != 0 { f = c.upperCase } else { f = i } } if g := unicode.SimpleFold(i); g != f { fmt.Fprintf(os.Stderr, "unicode.SimpleFold(%#U) = %#U, want %#U\n", i, g, f) } } return } ppt("const fold_pair _case_orbit[] = {\n") for i := range chars { c := &chars[i] if c.caseOrbit != 0 { ppt("\t{0x%04X, 0x%04X},\n", i, c.caseOrbit) foldPairCount++ } } ppt("};\n") ppt("const slice<const fold_pair> case_orbit(_case_orbit);\n\n") }
// TestCaseProperties compares the case properties stored in the context info
// of each rune up to lastRuneForTesting with the reference predicates
// propIgnore, propCased, propUpper, propLower and hasBreakProp. The test is
// skipped when the unicode package's version differs from UnicodeVersion.
func TestCaseProperties(t *testing.T) {
	if unicode.Version != UnicodeVersion {
		t.Skipf("UnicodeVersion=%s, but unicode.Version=%s", UnicodeVersion, unicode.Version)
	}
	assigned := rangetable.Assigned(UnicodeVersion)
	for r := rune(0); r <= lastRuneForTesting; r++ {
		// Skip runes that are unassigned or whose simple fold is unassigned.
		if !unicode.In(r, assigned) || !unicode.In(unicode.SimpleFold(r), assigned) {
			continue
		}
		c := contextFromRune(r)
		if got, want := c.info.isCaseIgnorable(), propIgnore(r); got != want {
			t.Errorf("caseIgnorable(%U): got %v; want %v (%x)", r, got, want, c.info)
		}
		if got, want := c.info.isCased(), propCased(r); got != want {
			t.Errorf("cased(%U): got %v; want %v (%x)", r, got, want, c.info)
		}
		if got, want := c.caseType() == cUpper, propUpper(r); got != want {
			t.Errorf("upper(%U): got %v; want %v (%x)", r, got, want, c.info)
		}
		if got, want := c.caseType() == cLower, propLower(r); got != want {
			t.Errorf("lower(%U): got %v; want %v (%x)", r, got, want, c.info)
		}
		if got, want := c.info.isBreak(), hasBreakProp(r); got != want {
			t.Errorf("isBreak(%U): got %v; want %v (%x)", r, got, want, c.info)
		}
	}
	// TODO: get title case from unicode file.
}
// toFold returns a string with the property that
//
//	strings.EqualFold(s, t) iff toFold(s) == toFold(t)
//
// This lets us test a large set of strings for fold-equivalent
// duplicates without making a quadratic number of calls
// to EqualFold. Note that strings.ToUpper and strings.ToLower
// do not have the desired property in some corner cases.
func toFold(s string) string {
	// Fast path: all ASCII, no upper case.
	// Most paths look like this already.
	needsFold := false
	for i := 0; i < len(s); i++ {
		if b := s[i]; b >= utf8.RuneSelf || ('A' <= b && b <= 'Z') {
			needsFold = true
			break
		}
	}
	if !needsFold {
		return s
	}

	var out bytes.Buffer
	for _, r := range s {
		// SimpleFold(x) cycles to the next equivalent rune > x
		// or wraps around to smaller values. Keep stepping while the
		// result still grows; the first result that does not grow is the
		// rune the orbit wrapped around to, i.e. its minimum.
		prev := r
		r = unicode.SimpleFold(prev)
		for r > prev {
			prev = r
			r = unicode.SimpleFold(prev)
		}
		// Exception to allow fast path above: A-Z => a-z
		if 'A' <= r && r <= 'Z' {
			r += 'a' - 'A'
		}
		out.WriteRune(r)
	}
	return out.String()
}
// Fold returns a range containing all the runes from the original range and all the runes that can be obtained from them by using unicode case folding. The original range is not modified. func Fold(ranges []rune) []rune { if len(ranges) == 0 { return nil } rr := make([]rune, len(ranges)) copy(rr, ranges) for i := 0; i < len(ranges); i += 2 { for r := ranges[i]; r <= ranges[i+1]; r++ { r0 := r for r := unicode.SimpleFold(r); r != r0; r = unicode.SimpleFold(r) { rr = Add(rr, r) } } } return rr }
// MatchRunePos checks whether the instruction matches (and consumes) r. // If so, MatchRunePos returns the index of the matching rune pair // (or, when len(i.Rune) == 1, rune singleton). // If not, MatchRunePos returns -1. // MatchRunePos should only be called when i.Op == InstRune. func (i *Inst) MatchRunePos(r rune) int { rune := i.Rune // Special case: single-rune slice is from literal string, not char class. if len(rune) == 1 { r0 := rune[0] if r == r0 { return 0 } if Flags(i.Arg)&FoldCase != 0 { for r1 := unicode.SimpleFold(r0); r1 != r0; r1 = unicode.SimpleFold(r1) { if r == r1 { return 0 } } } return noMatch } // Peek at the first few pairs. // Should handle ASCII well. for j := 0; j < len(rune) && j <= 8; j += 2 { if r < rune[j] { return noMatch } if r <= rune[j+1] { return j / 2 } } // Otherwise binary search. lo := 0 hi := len(rune) / 2 for lo < hi { m := lo + (hi-lo)/2 if c := rune[2*m]; c <= r { if r <= rune[2*m+1] { return m } lo = m + 1 } else { hi = m } } return noMatch }
// simpleFold returns the minimum rune equivalent to r
// under Unicode-defined simple case folding.
func simpleFold(r rune) rune {
	// unicode.SimpleFold steps upward through the orbit and then wraps
	// around; the first result that is not larger than its input is the
	// rune it wrapped to, which is the minimum.
	next := unicode.SimpleFold(r)
	for next > r {
		r, next = next, unicode.SimpleFold(next)
	}
	return next
}
// TestMapping checks the lower, title and upper mappers. Runes listed in
// special are compared with the expectations stored there; every other
// eligible rune up to lastRuneForTesting is compared with unicode.ToLower,
// unicode.ToUpper and unicode.ToTitle.
func TestMapping(t *testing.T) {
	assigned := rangetable.Assigned(UnicodeVersion)
	coreVersion := rangetable.Assigned(unicode.Version)
	if coreVersion == nil {
		coreVersion = assigned
	}
	// apply runs mapper f on a context built from r and returns what it
	// wrote to the destination buffer.
	apply := func(r rune, f func(c *context) bool) string {
		c := contextFromRune(r)
		f(c)
		return string(c.dst[:c.pDst])
	}

	for r, tt := range special {
		if got, want := apply(r, lower), tt.toLower; got != want {
			t.Errorf("lowerSpecial:(%U): got %+q; want %+q", r, got, want)
		}
		if got, want := apply(r, title), tt.toTitle; got != want {
			t.Errorf("titleSpecial:(%U): got %+q; want %+q", r, got, want)
		}
		if got, want := apply(r, upper), tt.toUpper; got != want {
			t.Errorf("upperSpecial:(%U): got %+q; want %+q", r, got, want)
		}
	}

	for r := rune(0); r <= lastRuneForTesting; r++ {
		// Only runes assigned in both Unicode versions are compared.
		if !unicode.In(r, assigned) || !unicode.In(r, coreVersion) {
			continue
		}
		// Skip runes without a case pairing, or whose pairing is not
		// assigned in UnicodeVersion.
		if rf := unicode.SimpleFold(r); rf == r || !unicode.In(rf, assigned) {
			continue
		}
		// Special runes were already checked in the loop over special.
		if _, ok := special[r]; ok {
			continue
		}
		want := string(unicode.ToLower(r))
		if got := apply(r, lower); got != want {
			t.Errorf("lower:%q (%U): got %q %U; want %q %U", r, r, got, []rune(got), want, []rune(want))
		}
		want = string(unicode.ToUpper(r))
		if got := apply(r, upper); got != want {
			t.Errorf("upper:%q (%U): got %q %U; want %q %U", r, r, got, []rune(got), want, []rune(want))
		}
		want = string(unicode.ToTitle(r))
		if got := apply(r, title); got != want {
			t.Errorf("title:%q (%U): got %q %U; want %q %U", r, r, got, []rune(got), want, []rune(want))
		}
	}
}
// TestFoldConstants checks minFold and maxFold against unicode.SimpleFold:
// minFold must be the first rune in [0, unicode.MaxRune] that SimpleFold
// changes, and maxFold must be the last.
func TestFoldConstants(t *testing.T) {
	// last is the most recent rune found to have a non-trivial fold;
	// -1 means none has been seen yet.
	last := -1
	for i := 0; i <= unicode.MaxRune; i++ {
		if unicode.SimpleFold(i) == i {
			continue
		}
		// last is still -1 only for the very first folding rune.
		if last == -1 && minFold != i {
			t.Errorf("minFold=%#U should be %#U", minFold, i)
		}
		last = i
	}
	if maxFold != last {
		t.Errorf("maxFold=%#U should be %#U", maxFold, last)
	}
}
// ContainsFold is like strings.Contains but uses Unicode case-folding. func ContainsFold(s, substr string) bool { if substr == "" { return true } if s == "" { return false } firstRune := rune(substr[0]) if firstRune >= utf8.RuneSelf { firstRune, _ = utf8.DecodeRuneInString(substr) } firstLowerRune := unicode.SimpleFold(firstRune) for i, rune := range s { if len(s)-i < len(substr) { return false } if rune == firstLowerRune || unicode.SimpleFold(rune) == firstLowerRune { if HasPrefixFold(s[i:], substr) { return true } } } return false }
// Print the symbol as a suggestion. func suggestSymbol(tokRoot, tok, name string) (cnt int) { // Check if name is exported if !unexported { i := strings.LastIndex(name, ".") r1, _ := utf8.DecodeRuneInString(name) r2, _ := utf8.DecodeRuneInString(name[i+1:]) if !unicode.IsUpper(r1) || !unicode.IsUpper(r2) { return 0 } } if strings.Index(name, tok) == 0 { fmt.Println(tokRoot + name + " ") } else if !matchCase { // Copied from cmd/doc/pkg.go. simpleFold := func(r rune) rune { for { r1 := unicode.SimpleFold(r) if r1 <= r { return r1 } r = r1 } } // Copied from cmd/doc/pkg.go. for _, u := range tok { p, w := utf8.DecodeRuneInString(name) name = name[w:] if u == p { continue } if unicode.IsLower(u) && simpleFold(u) == simpleFold(p) { continue } return 0 } fmt.Println(tokRoot + tok + name + " ") } return 1 }
// ExampleSimpleFold prints the result of unicode.SimpleFold for a few runes,
// including the three-rune orbit K -> k -> U+212A -> K.
func ExampleSimpleFold() {
	inputs := []rune{
		'A',      // 'a'
		'a',      // 'A'
		'K',      // 'k'
		'k',      // '\u212A' (Kelvin symbol, K)
		'\u212A', // 'K'
		'1',      // '1'
	}
	for _, r := range inputs {
		fmt.Printf("%#U\n", unicode.SimpleFold(r))
	}

	// Output:
	// U+0061 'a'
	// U+0041 'A'
	// U+006B 'k'
	// U+212A 'K'
	// U+004B 'K'
	// U+0031 '1'
}
func oneByteRange(i *syntax.Inst) (lo, hi byte, fold, ok bool) { if i.Op == syntax.InstRune1 { r := i.Rune[0] if r < utf8.RuneSelf { return byte(r), byte(r), false, true } } if i.Op != syntax.InstRune { return } fold = syntax.Flags(i.Arg)&syntax.FoldCase != 0 if len(i.Rune) == 1 || len(i.Rune) == 2 && i.Rune[0] == i.Rune[1] { r := i.Rune[0] if r >= utf8.RuneSelf { return } if fold && !asciiFold(r) { return } return byte(r), byte(r), fold, true } if len(i.Rune) == 2 && i.Rune[1] < utf8.RuneSelf { if fold { for r := i.Rune[0]; r <= i.Rune[1]; r++ { if asciiFold(r) { return } } } return byte(i.Rune[0]), byte(i.Rune[1]), fold, true } if len(i.Rune) == 4 && i.Rune[0] == i.Rune[1] && i.Rune[2] == i.Rune[3] && unicode.SimpleFold(i.Rune[0]) == i.Rune[2] && unicode.SimpleFold(i.Rune[2]) == i.Rune[0] { return byte(i.Rune[0]), byte(i.Rune[0]), true, true } return }
func (c *compiler) rune(r []rune, flags Flags) frag { f := c.inst(InstRune) i := &c.p.Inst[f.i] i.Rune = r flags &= FoldCase // only relevant flag is FoldCase if len(r) != 1 || unicode.SimpleFold(r[0]) == r[0] { // and sometimes not even that flags &^= FoldCase } i.Arg = uint32(flags) f.out = patchList(f.i << 1) // Special cases for exec machine. switch { case flags&FoldCase == 0 && (len(r) == 1 || len(r) == 2 && r[0] == r[1]): i.Op = InstRune1 case len(r) == 2 && r[0] == 0 && r[1] == unicode.MaxRune: i.Op = InstRuneAny case len(r) == 4 && r[0] == 0 && r[1] == '\n'-1 && r[2] == '\n'+1 && r[3] == unicode.MaxRune: i.Op = InstRuneAnyNotNL } return f }
// dumpRegexp writes an encoding of the syntax tree for the regexp re to b.
// It is used during testing to distinguish between parses that might print
// the same using re's String method.
//
// Each node is written as an operator name followed by its contents in
// braces; sub-expressions are dumped recursively inside the braces.
func dumpRegexp(b *bytes.Buffer, re *Regexp) {
	// First the operator name.
	if int(re.Op) >= len(opNames) || opNames[re.Op] == "" {
		// No name on record: fall back to the numeric opcode.
		fmt.Fprintf(b, "op%d", re.Op)
	} else {
		switch re.Op {
		default:
			b.WriteString(opNames[re.Op])
		case OpStar, OpPlus, OpQuest, OpRepeat:
			// Non-greedy repetitions get an "n" prefix.
			if re.Flags&NonGreedy != 0 {
				b.WriteByte('n')
			}
			b.WriteString(opNames[re.Op])
		case OpLiteral:
			// "str" for multi-rune literals, "lit" otherwise.
			if len(re.Rune) > 1 {
				b.WriteString("str")
			} else {
				b.WriteString("lit")
			}
			// "fold" is appended only when FoldCase is set and at least
			// one rune really has a different case variant.
			if re.Flags&FoldCase != 0 {
				for _, r := range re.Rune {
					if unicode.SimpleFold(r) != r {
						b.WriteString("fold")
						break
					}
				}
			}
		}
	}
	// Then the contents, wrapped in braces.
	b.WriteByte('{')
	switch re.Op {
	case OpEndText:
		if re.Flags&WasDollar == 0 {
			b.WriteString(`\z`)
		}
	case OpLiteral:
		for _, r := range re.Rune {
			b.WriteRune(r)
		}
	case OpConcat, OpAlternate:
		for _, sub := range re.Sub {
			dumpRegexp(b, sub)
		}
	case OpStar, OpPlus, OpQuest:
		dumpRegexp(b, re.Sub[0])
	case OpRepeat:
		fmt.Fprintf(b, "%d,%d ", re.Min, re.Max)
		dumpRegexp(b, re.Sub[0])
	case OpCapture:
		if re.Name != "" {
			b.WriteString(re.Name)
			b.WriteByte(':')
		}
		dumpRegexp(b, re.Sub[0])
	case OpCharClass:
		// Space-separated lo-hi pairs in hex; a single-rune range is
		// written as just that rune.
		sep := ""
		for i := 0; i < len(re.Rune); i += 2 {
			b.WriteString(sep)
			sep = " "
			lo, hi := re.Rune[i], re.Rune[i+1]
			if lo == hi {
				fmt.Fprintf(b, "%#x", lo)
			} else {
				fmt.Fprintf(b, "%#x-%#x", lo, hi)
			}
		}
	}
	b.WriteByte('}')
}
// makeOnePass creates a onepass Prog, if possible. It is possible if at any alt,
// the match engine can always tell which branch to take. The routine may modify
// p if it is turned into a onepass Prog. If it isn't possible for this to be a
// onepass Prog, the Prog NotOnePass is returned. makeOnePass is recursive
// to the size of the Prog.
func (p *Prog) makeOnePass() *Prog {
	var (
		instQueue    = newQueue(len(p.Inst))
		visitQueue   = newQueue(len(p.Inst))
		build        func(uint32, *queue)
		check        func(uint32, map[uint32]bool) bool
		onePassRunes = make([][]rune, len(p.Inst))
	)
	// build inserts into q the instructions reachable from pc.
	// NOTE(review): build is only referenced from inside its own body in
	// this function — it appears to be unused; confirm before removing.
	build = func(pc uint32, q *queue) {
		if q.contains(pc) {
			return
		}
		inst := p.Inst[pc]
		switch inst.Op {
		case InstAlt, InstAltMatch:
			q.insert(inst.Out)
			build(inst.Out, q)
			q.insert(inst.Arg)
		case InstMatch, InstFail:
		default:
			q.insert(inst.Out)
		}
	}

	// check that paths from Alt instructions are unambiguous, and rebuild the new
	// program as a onepass program
	//
	// m[pc] records whether InstMatch is reachable from pc without
	// consuming input. onePassRunes[pc] receives the rune pairs accepted
	// at pc and inst.Next the matching successor for each dispatch slot.
	check = func(pc uint32, m map[uint32]bool) (ok bool) {
		ok = true
		inst := &p.Inst[pc]
		// visitQueue guards against processing pc twice in one pass.
		if visitQueue.contains(pc) {
			return
		}
		visitQueue.insert(pc)
		switch inst.Op {
		case InstAlt, InstAltMatch:
			ok = check(inst.Out, m) && check(inst.Arg, m)
			// check no-input paths to InstMatch
			matchOut := m[inst.Out]
			matchArg := m[inst.Arg]
			if matchOut && matchArg {
				// Both legs can match on empty input: ambiguous.
				ok = false
				break
			}
			// Match on empty goes in inst.Out
			if matchArg {
				inst.Out, inst.Arg = inst.Arg, inst.Out
				matchOut, matchArg = matchArg, matchOut
			}
			if matchOut {
				m[pc] = true
				inst.Op = InstAltMatch
			}

			// build a dispatch operator from the two legs of the alt.
			onePassRunes[pc], inst.Next = mergeRuneSets(
				&onePassRunes[inst.Out], &onePassRunes[inst.Arg], inst.Out, inst.Arg)
			if len(inst.Next) > 0 && inst.Next[0] == mergeFailed {
				ok = false
				break
			}
		case InstCapture, InstNop:
			ok = check(inst.Out, m)
			m[pc] = m[inst.Out]
			// pass matching runes back through these no-ops.
			onePassRunes[pc] = append([]rune{}[:], onePassRunes[inst.Out][:]...)
			// One Next entry per rune pair plus one extra, all inst.Out.
			inst.Next = []uint32{}
			for i := len(onePassRunes[pc]) / 2; i >= 0; i-- {
				inst.Next = append(inst.Next, inst.Out)
			}
		case InstEmptyWidth:
			ok = check(inst.Out, m)
			m[pc] = m[inst.Out]
			onePassRunes[pc] = append([]rune{}[:], onePassRunes[inst.Out][:]...)
			inst.Next = []uint32{}
			for i := len(onePassRunes[pc]) / 2; i >= 0; i-- {
				inst.Next = append(inst.Next, inst.Out)
			}
		case InstMatch, InstFail:
			m[pc] = inst.Op == InstMatch
			break
		case InstRune:
			ok = check(inst.Out, m)
			m[pc] = false
			// A non-empty Next means this instruction was already rewritten.
			if len(inst.Next) > 0 {
				break
			}
			if len(inst.Rune) == 0 {
				onePassRunes[pc] = []rune{}[:]
				inst.Next = []uint32{inst.Out}
				break
			}
			runes := make([]rune, 0)
			if len(inst.Rune) == 1 && Flags(inst.Arg)&FoldCase != 0 {
				// Case-folded literal: expand it into one (r, r) pair per
				// member of its fold orbit, then sort.
				r0 := inst.Rune[0]
				runes = append(runes, r0, r0)
				for r1 := unicode.SimpleFold(r0); r1 != r0; r1 = unicode.SimpleFold(r1) {
					runes = append(runes, r1, r1)
				}
				sort.Sort(runeSlice(runes))
			} else {
				runes = append(runes, inst.Rune...)
			}
			onePassRunes[pc] = runes
			inst.Next = []uint32{}
			for i := len(onePassRunes[pc]) / 2; i >= 0; i-- {
				inst.Next = append(inst.Next, inst.Out)
			}
			inst.Op = InstRune
		case InstRune1:
			ok = check(inst.Out, m)
			m[pc] = false
			if len(inst.Next) > 0 {
				break
			}
			runes := []rune{}[:]
			// expand case-folded runes
			if Flags(inst.Arg)&FoldCase != 0 {
				r0 := inst.Rune[0]
				runes = append(runes, r0, r0)
				for r1 := unicode.SimpleFold(r0); r1 != r0; r1 = unicode.SimpleFold(r1) {
					runes = append(runes, r1, r1)
				}
				sort.Sort(runeSlice(runes))
			} else {
				runes = append(runes, inst.Rune[0], inst.Rune[0])
			}
			onePassRunes[pc] = runes
			inst.Next = []uint32{}
			for i := len(onePassRunes[pc]) / 2; i >= 0; i-- {
				inst.Next = append(inst.Next, inst.Out)
			}
			// InstRune1 is rewritten to the general InstRune form.
			inst.Op = InstRune
		case InstRuneAny:
			ok = check(inst.Out, m)
			m[pc] = false
			if len(inst.Next) > 0 {
				break
			}
			onePassRunes[pc] = append([]rune{}[:], anyRune[:]...)
			inst.Next = []uint32{inst.Out}
		case InstRuneAnyNotNL:
			ok = check(inst.Out, m)
			m[pc] = false
			if len(inst.Next) > 0 {
				break
			}
			onePassRunes[pc] = append([]rune{}[:], anyRuneNotNL[:]...)
			inst.Next = []uint32{}
			for i := len(onePassRunes[pc]) / 2; i >= 0; i-- {
				inst.Next = append(inst.Next, inst.Out)
			}
		}
		return
	}

	// Work through the program from p.Start, checking each queued
	// instruction and queueing the successors of non-consuming ones.
	instQueue.clear()
	instQueue.insert(uint32(p.Start))
	m := make(map[uint32]bool, len(p.Inst))
	for !instQueue.empty() {
		pc := instQueue.next()
		// inst is a copy taken before check runs, so the switch below
		// sees the instruction as it was prior to any rewriting.
		inst := p.Inst[pc]
		visitQueue.clear()
		if !check(uint32(pc), m) {
			p = NotOnePass
			break
		}
		switch inst.Op {
		case InstAlt, InstAltMatch:
			instQueue.insert(inst.Out)
			instQueue.insert(inst.Arg)
		case InstCapture, InstEmptyWidth, InstNop:
			instQueue.insert(inst.Out)
		case InstMatch:
		case InstFail:
		case InstRune, InstRune1, InstRuneAny, InstRuneAnyNotNL:
		default:
		}
	}
	// On success, install the computed rune sets into the instructions.
	if p != NotOnePass {
		for i, _ := range p.Inst {
			p.Inst[i].Rune = onePassRunes[i][:]
		}
	}
	return p
}
// makeOnePass creates a onepass Prog, if possible. It is possible if at any alt,
// the match engine can always tell which branch to take. The routine may modify
// p if it is turned into a onepass Prog. If it isn't possible for this to be a
// onepass Prog, the Prog notOnePass is returned. makeOnePass is recursive
// to the size of the Prog.
func makeOnePass(p *onePassProg) *onePassProg {
	// If the machine is very long, it's not worth the time to check if we can use one pass.
	if len(p.Inst) >= 1000 {
		return notOnePass
	}

	var (
		instQueue    = newQueue(len(p.Inst))
		visitQueue   = newQueue(len(p.Inst))
		check        func(uint32, map[uint32]bool) bool
		onePassRunes = make([][]rune, len(p.Inst))
	)

	// check that paths from Alt instructions are unambiguous, and rebuild the new
	// program as a onepass program
	//
	// m[pc] records whether InstMatch is reachable from pc without
	// consuming input. onePassRunes[pc] receives the rune pairs accepted
	// at pc and inst.Next the matching successor for each dispatch slot.
	check = func(pc uint32, m map[uint32]bool) (ok bool) {
		ok = true
		inst := &p.Inst[pc]
		// visitQueue guards against processing pc twice in one pass.
		if visitQueue.contains(pc) {
			return
		}
		visitQueue.insert(pc)
		switch inst.Op {
		case syntax.InstAlt, syntax.InstAltMatch:
			ok = check(inst.Out, m) && check(inst.Arg, m)
			// check no-input paths to InstMatch
			matchOut := m[inst.Out]
			matchArg := m[inst.Arg]
			if matchOut && matchArg {
				// Both legs can match on empty input: ambiguous.
				ok = false
				break
			}
			// Match on empty goes in inst.Out
			if matchArg {
				inst.Out, inst.Arg = inst.Arg, inst.Out
				matchOut, matchArg = matchArg, matchOut
			}
			if matchOut {
				m[pc] = true
				inst.Op = syntax.InstAltMatch
			}

			// build a dispatch operator from the two legs of the alt.
			onePassRunes[pc], inst.Next = mergeRuneSets(
				&onePassRunes[inst.Out], &onePassRunes[inst.Arg], inst.Out, inst.Arg)
			if len(inst.Next) > 0 && inst.Next[0] == mergeFailed {
				ok = false
				break
			}
		case syntax.InstCapture, syntax.InstNop:
			ok = check(inst.Out, m)
			m[pc] = m[inst.Out]
			// pass matching runes back through these no-ops.
			onePassRunes[pc] = append([]rune{}, onePassRunes[inst.Out]...)
			// One Next entry per rune pair plus one extra, all inst.Out.
			inst.Next = []uint32{}
			for i := len(onePassRunes[pc]) / 2; i >= 0; i-- {
				inst.Next = append(inst.Next, inst.Out)
			}
		case syntax.InstEmptyWidth:
			ok = check(inst.Out, m)
			m[pc] = m[inst.Out]
			onePassRunes[pc] = append([]rune{}, onePassRunes[inst.Out]...)
			inst.Next = []uint32{}
			for i := len(onePassRunes[pc]) / 2; i >= 0; i-- {
				inst.Next = append(inst.Next, inst.Out)
			}
		case syntax.InstMatch, syntax.InstFail:
			m[pc] = inst.Op == syntax.InstMatch
			break
		case syntax.InstRune:
			m[pc] = false
			// A non-empty Next means this instruction was already rewritten.
			if len(inst.Next) > 0 {
				break
			}
			// The successor is not checked recursively here; it is queued
			// and handled by the driver loop below.
			instQueue.insert(inst.Out)
			if len(inst.Rune) == 0 {
				onePassRunes[pc] = []rune{}
				inst.Next = []uint32{inst.Out}
				break
			}
			runes := make([]rune, 0)
			if len(inst.Rune) == 1 && syntax.Flags(inst.Arg)&syntax.FoldCase != 0 {
				// Case-folded literal: expand it into one (r, r) pair per
				// member of its fold orbit, then sort.
				r0 := inst.Rune[0]
				runes = append(runes, r0, r0)
				for r1 := unicode.SimpleFold(r0); r1 != r0; r1 = unicode.SimpleFold(r1) {
					runes = append(runes, r1, r1)
				}
				sort.Sort(runeSlice(runes))
			} else {
				runes = append(runes, inst.Rune...)
			}
			onePassRunes[pc] = runes
			inst.Next = []uint32{}
			for i := len(onePassRunes[pc]) / 2; i >= 0; i-- {
				inst.Next = append(inst.Next, inst.Out)
			}
			inst.Op = syntax.InstRune
		case syntax.InstRune1:
			m[pc] = false
			if len(inst.Next) > 0 {
				break
			}
			instQueue.insert(inst.Out)
			runes := []rune{}
			// expand case-folded runes
			if syntax.Flags(inst.Arg)&syntax.FoldCase != 0 {
				r0 := inst.Rune[0]
				runes = append(runes, r0, r0)
				for r1 := unicode.SimpleFold(r0); r1 != r0; r1 = unicode.SimpleFold(r1) {
					runes = append(runes, r1, r1)
				}
				sort.Sort(runeSlice(runes))
			} else {
				runes = append(runes, inst.Rune[0], inst.Rune[0])
			}
			onePassRunes[pc] = runes
			inst.Next = []uint32{}
			for i := len(onePassRunes[pc]) / 2; i >= 0; i-- {
				inst.Next = append(inst.Next, inst.Out)
			}
			// InstRune1 is rewritten to the general InstRune form.
			inst.Op = syntax.InstRune
		case syntax.InstRuneAny:
			m[pc] = false
			if len(inst.Next) > 0 {
				break
			}
			instQueue.insert(inst.Out)
			onePassRunes[pc] = append([]rune{}, anyRune...)
			inst.Next = []uint32{inst.Out}
		case syntax.InstRuneAnyNotNL:
			m[pc] = false
			if len(inst.Next) > 0 {
				break
			}
			instQueue.insert(inst.Out)
			onePassRunes[pc] = append([]rune{}, anyRuneNotNL...)
			inst.Next = []uint32{}
			for i := len(onePassRunes[pc]) / 2; i >= 0; i-- {
				inst.Next = append(inst.Next, inst.Out)
			}
		}
		return
	}

	// Drive check from p.Start; the rune cases above queue their
	// successors, so the loop runs until nothing new is queued or a check
	// fails.
	instQueue.clear()
	instQueue.insert(uint32(p.Start))
	m := make(map[uint32]bool, len(p.Inst))
	for !instQueue.empty() {
		visitQueue.clear()
		pc := instQueue.next()
		if !check(uint32(pc), m) {
			p = notOnePass
			break
		}
	}
	// On success, install the computed rune sets into the instructions.
	if p != notOnePass {
		for i := range p.Inst {
			p.Inst[i].Rune = onePassRunes[i]
		}
	}
	return p
}
// analyze returns the regexpInfo for the regexp re.
//
// It recurses over the syntax tree and combines the sub-results with
// concat, alternate and fold. Results that are built up in info, rather than
// returned directly from a case, are passed through info.simplify(false)
// before being returned.
func analyze(re *syntax.Regexp) (ret regexpInfo) {
	var info regexpInfo
	switch re.Op {
	case syntax.OpNoMatch:
		return noMatch()

	case syntax.OpEmptyMatch,
		syntax.OpBeginLine, syntax.OpEndLine,
		syntax.OpBeginText, syntax.OpEndText,
		syntax.OpWordBoundary, syntax.OpNoWordBoundary:
		return emptyString()

	case syntax.OpLiteral:
		if re.Flags&syntax.FoldCase != 0 {
			switch len(re.Rune) {
			case 0:
				return emptyString()
			case 1:
				// Single-letter case-folded string:
				// rewrite into char class and analyze.
				re1 := &syntax.Regexp{
					Op: syntax.OpCharClass,
				}
				re1.Rune = re1.Rune0[:0]
				r0 := re.Rune[0]
				re1.Rune = append(re1.Rune, r0, r0)
				// Add one single-rune range per remaining member of
				// r0's fold orbit.
				for r1 := unicode.SimpleFold(r0); r1 != r0; r1 = unicode.SimpleFold(r1) {
					re1.Rune = append(re1.Rune, r1, r1)
				}
				info = analyze(re1)
				return info
			}
			// Multi-letter case-folded string:
			// treat as concatenation of single-letter case-folded strings.
			re1 := &syntax.Regexp{
				Op:    syntax.OpLiteral,
				Flags: syntax.FoldCase,
			}
			info = emptyString()
			for i := range re.Rune {
				re1.Rune = re.Rune[i : i+1]
				info = concat(info, analyze(re1))
			}
			return info
		}
		// Case-sensitive literal: the match is exactly this string.
		info.exact = stringSet{string(re.Rune)}

	case syntax.OpAnyCharNotNL, syntax.OpAnyChar:
		return anyChar()

	case syntax.OpCapture:
		return analyze(re.Sub[0])

	case syntax.OpConcat:
		return fold(concat, re.Sub, emptyString())

	case syntax.OpAlternate:
		return fold(alternate, re.Sub, noMatch())

	case syntax.OpQuest:
		return alternate(analyze(re.Sub[0]), emptyString())

	case syntax.OpStar:
		// We don't know anything, so assume the worst.
		return anyMatch()

	case syntax.OpRepeat:
		if re.Min == 0 {
			// Like OpStar
			return anyMatch()
		}
		fallthrough
	case syntax.OpPlus:
		// x+
		// Since there has to be at least one x, the prefixes and suffixes
		// stay the same. If x was exact, it isn't anymore.
		info = analyze(re.Sub[0])
		if info.exact.have() {
			info.prefix = info.exact
			info.suffix = info.exact.copy()
			info.exact = nil
		}

	case syntax.OpCharClass:
		// Special case.
		if len(re.Rune) == 0 {
			return noMatch()
		}

		// Special case.
		if len(re.Rune) == 1 {
			info.exact = stringSet{string(re.Rune[0])}
			break
		}

		// n sums hi-lo over all ranges as a size estimate for the class.
		n := 0
		for i := 0; i < len(re.Rune); i += 2 {
			n += int(re.Rune[i+1] - re.Rune[i])
		}
		// If the class is too large, it's okay to overestimate.
		if n > 100 {
			return anyChar()
		}

		// Small class: enumerate every rune as its own exact string.
		info.exact = []string{}
		for i := 0; i < len(re.Rune); i += 2 {
			lo, hi := re.Rune[i], re.Rune[i+1]
			for rr := lo; rr <= hi; rr++ {
				info.exact.add(string(rr))
			}
		}
	}

	info.simplify(false)
	return info
}