// OnePassPrefix returns a literal string that all matches for the // regexp must start with. Complete is true if the prefix // is the entire match. Pc is the index of the last rune instruction // in the string. The OnePassPrefix skips over the mandatory // EmptyBeginText func onePassPrefix(p *syntax.Prog) (prefix string, complete bool, pc uint32) { i := &p.Inst[p.Start] if i.Op != syntax.InstEmptyWidth || (syntax.EmptyOp(i.Arg))&syntax.EmptyBeginText == 0 { return "", i.Op == syntax.InstMatch, uint32(p.Start) } pc = i.Out i = &p.Inst[pc] for i.Op == syntax.InstNop { pc = i.Out i = &p.Inst[pc] } // Avoid allocation of buffer if prefix is empty. if iop(i) != syntax.InstRune || len(i.Rune) != 1 { return "", i.Op == syntax.InstMatch, uint32(p.Start) } // Have prefix; gather characters. var buf bytes.Buffer for iop(i) == syntax.InstRune && len(i.Rune) == 1 && syntax.Flags(i.Arg)&syntax.FoldCase == 0 { buf.WriteRune(i.Rune[0]) pc, i = i.Out, &p.Inst[i.Out] } if i.Op == syntax.InstEmptyWidth && syntax.EmptyOp(i.Arg)&syntax.EmptyEndText != 0 && p.Inst[i.Out].Op == syntax.InstMatch { complete = true } return buf.String(), complete, pc }
// sketchOnRegex takes an arbitrary regular expression and:
//
//  1. returns a regular expression that is just like it, but left-anchored,
//     preferring to return the original if possible;
//  2. determines a string literal prefix that all matches of this regular
//     expression have, much like regexp.Regexp.Prefix(). Unfortunately,
//     Prefix() does not work in the presence of anchors, so it is computed
//     here by hand.
//
// To do this the expression is re-parsed from re.String() and compiled with
// regexp/syntax, because regexp.Regexp hides its internal state. The
// abstraction is leaky: every expression is assumed to be perl-style, not
// POSIX.
//
// If the expression cannot be parsed, compiled, or re-compiled with an
// anchor, a warning is logged and the original expression is returned with an
// empty prefix.
func sketchOnRegex(re *regexp.Regexp) (*regexp.Regexp, string) {
	rawRe := re.String()
	sRe, err := syntax.Parse(rawRe, syntax.Perl)
	if err != nil {
		log.Printf("WARN(web): unable to parse regexp %v as perl. "+
			"This route might behave unexpectedly.", re)
		return re, ""
	}
	sRe = sRe.Simplify()
	p, err := syntax.Compile(sRe)
	if err != nil {
		log.Printf("WARN(web): unable to compile regexp %v. This "+
			"route might behave unexpectedly.", re)
		return re, ""
	}
	if p.StartCond()&syntax.EmptyBeginText == 0 {
		// Wrap the expression in a non-capturing group before anchoring.
		// Without the group, a top-level alternation such as `a|b` would
		// become `\Aa|b`, which anchors only the first branch. The group
		// does not change capture numbering.
		newRe, err := regexp.Compile(`\A(?:` + rawRe + `)`)
		if err != nil {
			log.Printf("WARN(web): unable to create a left-"+
				"anchored regexp from %v. This route might "+
				"behave unexpectedly", re)
			return re, ""
		}
		re = newRe
	}

	// Run the regular expression more or less by hand :(
	pc := uint32(p.Start)
	atStart := true
	i := &p.Inst[pc]
	var buf bytes.Buffer
Sadness:
	for {
		switch i.Op {
		case syntax.InstEmptyWidth:
			// Empty-width assertions are only skipped before the first
			// rune; afterwards they end the prefix.
			if !atStart {
				break Sadness
			}
		case syntax.InstCapture, syntax.InstNop:
			// nop!
		case syntax.InstRune, syntax.InstRune1, syntax.InstRuneAny,
			syntax.InstRuneAnyNotNL:
			atStart = false
			// Only an exact, case-sensitive single rune extends the
			// prefix; anything else ends it.
			if len(i.Rune) != 1 ||
				syntax.Flags(i.Arg)&syntax.FoldCase != 0 {
				break Sadness
			}
			buf.WriteRune(i.Rune[0])
		default:
			break Sadness
		}
		pc = i.Out
		i = &p.Inst[pc]
	}
	return re, buf.String()
}
func toByteProg(prog *syntax.Prog) error { var b runeBuilder for pc := range prog.Inst { i := &prog.Inst[pc] switch i.Op { case syntax.InstRune, syntax.InstRune1: // General rune range. PIA. // TODO: Pick off single-byte case. if lo, hi, fold, ok := oneByteRange(i); ok { i.Op = instByteRange i.Arg = uint32(lo)<<8 | uint32(hi) if fold { i.Arg |= argFold } break } r := i.Rune if syntax.Flags(i.Arg)&syntax.FoldCase != 0 { // Build folded list. var rr []rune if len(r) == 1 { rr = appendFoldedRange(rr, r[0], r[0]) } else { for j := 0; j < len(r); j += 2 { rr = appendFoldedRange(rr, r[j], r[j+1]) } } r = rr } b.init(prog, uint32(pc), i.Out) if len(r) == 1 { b.addRange(r[0], r[0], false) } else { for j := 0; j < len(r); j += 2 { b.addRange(r[j], r[j+1], false) } } case syntax.InstRuneAny, syntax.InstRuneAnyNotNL: // All runes. // AnyNotNL should exclude \n but the line-at-a-time // execution takes care of that for us. b.init(prog, uint32(pc), i.Out) b.addRange(0, unicode.MaxRune, false) } } return nil }
func dumpInst(b *bytes.Buffer, i *syntax.Inst) { switch i.Op { case syntax.InstAlt: bw(b, "alt -> ", u32(i.Out), ", ", u32(i.Arg)) case syntax.InstAltMatch: bw(b, "altmatch -> ", u32(i.Out), ", ", u32(i.Arg)) case syntax.InstCapture: bw(b, "cap ", u32(i.Arg), " -> ", u32(i.Out)) case syntax.InstEmptyWidth: bw(b, "empty ", u32(i.Arg), " -> ", u32(i.Out)) case syntax.InstMatch: bw(b, "match") case syntax.InstFail: bw(b, "fail") case syntax.InstNop: bw(b, "nop -> ", u32(i.Out)) case instByteRange: fmt.Fprintf(b, "byte %02x-%02x", (i.Arg>>8)&0xFF, i.Arg&0xFF) if i.Arg&argFold != 0 { bw(b, "/i") } bw(b, " -> ", u32(i.Out)) // Should not happen case syntax.InstRune: if i.Rune == nil { // shouldn't happen bw(b, "rune <nil>") } bw(b, "rune ", strconv.QuoteToASCII(string(i.Rune))) if syntax.Flags(i.Arg)&syntax.FoldCase != 0 { bw(b, "/i") } bw(b, " -> ", u32(i.Out)) case syntax.InstRune1: bw(b, "rune1 ", strconv.QuoteToASCII(string(i.Rune)), " -> ", u32(i.Out)) case syntax.InstRuneAny: bw(b, "any -> ", u32(i.Out)) case syntax.InstRuneAnyNotNL: bw(b, "anynotnl -> ", u32(i.Out)) } }
func oneByteRange(i *syntax.Inst) (lo, hi byte, fold, ok bool) { if i.Op == syntax.InstRune1 { r := i.Rune[0] if r < utf8.RuneSelf { return byte(r), byte(r), false, true } } if i.Op != syntax.InstRune { return } fold = syntax.Flags(i.Arg)&syntax.FoldCase != 0 if len(i.Rune) == 1 || len(i.Rune) == 2 && i.Rune[0] == i.Rune[1] { r := i.Rune[0] if r >= utf8.RuneSelf { return } if fold && !asciiFold(r) { return } return byte(r), byte(r), fold, true } if len(i.Rune) == 2 && i.Rune[1] < utf8.RuneSelf { if fold { for r := i.Rune[0]; r <= i.Rune[1]; r++ { if asciiFold(r) { return } } } return byte(i.Rune[0]), byte(i.Rune[1]), fold, true } if len(i.Rune) == 4 && i.Rune[0] == i.Rune[1] && i.Rune[2] == i.Rune[3] && unicode.SimpleFold(i.Rune[0]) == i.Rune[2] && unicode.SimpleFold(i.Rune[2]) == i.Rune[0] { return byte(i.Rune[0]), byte(i.Rune[0]), true, true } return }
// sketchOnRegex takes an arbitrary regular expression and:
//
//  1. returns a regular expression that is just like it, but left-anchored,
//     preferring to return the original if possible;
//  2. determines a string literal prefix that all matches of this regular
//     expression have, much like regexp.Regexp.Prefix(). Unfortunately,
//     Prefix() does not work in the presence of anchors, so it is computed
//     here by hand.
//
// To do this the expression is re-parsed from re.String() and compiled with
// regexp/syntax, because regexp.Regexp hides its internal state. The
// abstraction is leaky: every expression is assumed to be perl-style, not
// POSIX.
//
// If the expression cannot be parsed, compiled, or re-compiled with an
// anchor, a warning is logged and the original expression is returned with an
// empty prefix.
func sketchOnRegex(re *regexp.Regexp) (*regexp.Regexp, string) {
	// Re-parse the regex from the string representation.
	rawRe := re.String()
	sRe, err := syntax.Parse(rawRe, syntax.Perl)
	if err != nil {
		// TODO: better way to warn?
		log.Printf("WARN(router): unable to parse regexp %v as perl. "+
			"This route might behave unexpectedly.", re)
		return re, ""
	}

	// Simplify and then compile the regex.
	sRe = sRe.Simplify()
	p, err := syntax.Compile(sRe)
	if err != nil {
		// TODO: better way to warn?
		log.Printf("WARN(router): unable to compile regexp %v. This "+
			"route might behave unexpectedly.", re)
		return re, ""
	}

	// If it's not left-anchored, we add that now.
	if p.StartCond()&syntax.EmptyBeginText == 0 {
		// Wrap the expression in a non-capturing group before anchoring.
		// Without the group, a top-level alternation such as `a|b` would
		// become `\Aa|b`, which anchors only the first branch. The group
		// does not change capture numbering.
		newRe, err := regexp.Compile(`\A(?:` + rawRe + `)`)
		if err != nil {
			// TODO: better way to warn?
			log.Printf("WARN(router): unable to create a left-"+
				"anchored regexp from %v. This route might "+
				"behave unexpectedly", re)
			return re, ""
		}
		re = newRe
	}

	// We run the regular expression more or less by hand in order to
	// calculate the prefix.
	pc := uint32(p.Start)
	atStart := true
	i := &p.Inst[pc]
	var buf bytes.Buffer
OuterLoop:
	for {
		switch i.Op {
		// There may be an 'empty' operation at the beginning of every
		// regex, due to OpBeginText. Once a rune has been seen, an empty
		// operation ends the prefix.
		case syntax.InstEmptyWidth:
			if !atStart {
				break OuterLoop
			}

		// Captures and no-ops don't affect the prefix.
		case syntax.InstCapture, syntax.InstNop:
			// nop!

		// We handle runes.
		case syntax.InstRune, syntax.InstRune1, syntax.InstRuneAny,
			syntax.InstRuneAnyNotNL:
			atStart = false

			// If we don't have exactly one rune, or if the 'fold case'
			// flag is set, then we don't count this as part of the
			// prefix. Due to unicode case-craziness, it's too hard to
			// deal with case insensitivity...
			if len(i.Rune) != 1 ||
				syntax.Flags(i.Arg)&syntax.FoldCase != 0 {
				break OuterLoop
			}

			// Add to the prefix, continue.
			buf.WriteRune(i.Rune[0])

		// All other instructions may affect the prefix, so we stop here.
		default:
			break OuterLoop
		}

		// Continue to the next instruction.
		pc = i.Out
		i = &p.Inst[pc]
	}
	return re, buf.String()
}
// makeOnePass creates a onepass Prog, if possible. It is possible if at any alt, // the match engine can always tell which branch to take. The routine may modify // p if it is turned into a onepass Prog. If it isn't possible for this to be a // onepass Prog, the Prog notOnePass is returned. makeOnePass is recursive // to the size of the Prog. func makeOnePass(p *onePassProg) *onePassProg { // If the machine is very long, it's not worth the time to check if we can use one pass. if len(p.Inst) >= 1000 { return notOnePass } var ( instQueue = newQueue(len(p.Inst)) visitQueue = newQueue(len(p.Inst)) check func(uint32, map[uint32]bool) bool onePassRunes = make([][]rune, len(p.Inst)) ) // check that paths from Alt instructions are unambiguous, and rebuild the new // program as a onepass program check = func(pc uint32, m map[uint32]bool) (ok bool) { ok = true inst := &p.Inst[pc] if visitQueue.contains(pc) { return } visitQueue.insert(pc) switch inst.Op { case syntax.InstAlt, syntax.InstAltMatch: ok = check(inst.Out, m) && check(inst.Arg, m) // check no-input paths to InstMatch matchOut := m[inst.Out] matchArg := m[inst.Arg] if matchOut && matchArg { ok = false break } // Match on empty goes in inst.Out if matchArg { inst.Out, inst.Arg = inst.Arg, inst.Out matchOut, matchArg = matchArg, matchOut } if matchOut { m[pc] = true inst.Op = syntax.InstAltMatch } // build a dispatch operator from the two legs of the alt. onePassRunes[pc], inst.Next = mergeRuneSets( &onePassRunes[inst.Out], &onePassRunes[inst.Arg], inst.Out, inst.Arg) if len(inst.Next) > 0 && inst.Next[0] == mergeFailed { ok = false break } case syntax.InstCapture, syntax.InstNop: ok = check(inst.Out, m) m[pc] = m[inst.Out] // pass matching runes back through these no-ops. onePassRunes[pc] = append([]rune{}, onePassRunes[inst.Out]...) 
inst.Next = []uint32{} for i := len(onePassRunes[pc]) / 2; i >= 0; i-- { inst.Next = append(inst.Next, inst.Out) } case syntax.InstEmptyWidth: ok = check(inst.Out, m) m[pc] = m[inst.Out] onePassRunes[pc] = append([]rune{}, onePassRunes[inst.Out]...) inst.Next = []uint32{} for i := len(onePassRunes[pc]) / 2; i >= 0; i-- { inst.Next = append(inst.Next, inst.Out) } case syntax.InstMatch, syntax.InstFail: m[pc] = inst.Op == syntax.InstMatch break case syntax.InstRune: m[pc] = false if len(inst.Next) > 0 { break } instQueue.insert(inst.Out) if len(inst.Rune) == 0 { onePassRunes[pc] = []rune{} inst.Next = []uint32{inst.Out} break } runes := make([]rune, 0) if len(inst.Rune) == 1 && syntax.Flags(inst.Arg)&syntax.FoldCase != 0 { r0 := inst.Rune[0] runes = append(runes, r0, r0) for r1 := unicode.SimpleFold(r0); r1 != r0; r1 = unicode.SimpleFold(r1) { runes = append(runes, r1, r1) } sort.Sort(runeSlice(runes)) } else { runes = append(runes, inst.Rune...) } onePassRunes[pc] = runes inst.Next = []uint32{} for i := len(onePassRunes[pc]) / 2; i >= 0; i-- { inst.Next = append(inst.Next, inst.Out) } inst.Op = syntax.InstRune case syntax.InstRune1: m[pc] = false if len(inst.Next) > 0 { break } instQueue.insert(inst.Out) runes := []rune{} // expand case-folded runes if syntax.Flags(inst.Arg)&syntax.FoldCase != 0 { r0 := inst.Rune[0] runes = append(runes, r0, r0) for r1 := unicode.SimpleFold(r0); r1 != r0; r1 = unicode.SimpleFold(r1) { runes = append(runes, r1, r1) } sort.Sort(runeSlice(runes)) } else { runes = append(runes, inst.Rune[0], inst.Rune[0]) } onePassRunes[pc] = runes inst.Next = []uint32{} for i := len(onePassRunes[pc]) / 2; i >= 0; i-- { inst.Next = append(inst.Next, inst.Out) } inst.Op = syntax.InstRune case syntax.InstRuneAny: m[pc] = false if len(inst.Next) > 0 { break } instQueue.insert(inst.Out) onePassRunes[pc] = append([]rune{}, anyRune...) 
inst.Next = []uint32{inst.Out} case syntax.InstRuneAnyNotNL: m[pc] = false if len(inst.Next) > 0 { break } instQueue.insert(inst.Out) onePassRunes[pc] = append([]rune{}, anyRuneNotNL...) inst.Next = []uint32{} for i := len(onePassRunes[pc]) / 2; i >= 0; i-- { inst.Next = append(inst.Next, inst.Out) } } return } instQueue.clear() instQueue.insert(uint32(p.Start)) m := make(map[uint32]bool, len(p.Inst)) for !instQueue.empty() { visitQueue.clear() pc := instQueue.next() if !check(uint32(pc), m) { p = notOnePass break } } if p != notOnePass { for i := range p.Inst { p.Inst[i].Rune = onePassRunes[i] } } return p }