Exemple #1
0
// onePassPrefix returns a literal string that all matches for the
// regexp must start with. complete is true if the prefix is the entire
// match. pc is the index of the first instruction that is not covered by
// the returned prefix; it is p.Start whenever the program does not begin
// with a begin-text anchor followed by a single-rune instruction.
//
// onePassPrefix skips over the mandatory EmptyBeginText instruction (and
// any no-ops that follow it) before gathering literal runes.
func onePassPrefix(p *syntax.Prog) (prefix string, complete bool, pc uint32) {
	i := &p.Inst[p.Start]
	if i.Op != syntax.InstEmptyWidth || (syntax.EmptyOp(i.Arg))&syntax.EmptyBeginText == 0 {
		return "", i.Op == syntax.InstMatch, uint32(p.Start)
	}
	pc = i.Out
	i = &p.Inst[pc]
	for i.Op == syntax.InstNop {
		pc = i.Out
		i = &p.Inst[pc]
	}
	// Avoid allocation of buffer if prefix is empty.
	if iop(i) != syntax.InstRune || len(i.Rune) != 1 {
		return "", i.Op == syntax.InstMatch, uint32(p.Start)
	}

	// U+FFFD is what decoding an invalid UTF-8 byte yields, so a U+FFFD
	// instruction also matches arbitrary invalid bytes in the input. Its
	// three-byte encoding is therefore not a literal prefix of every match,
	// and gathering must stop in front of it.
	const replacementRune = '\uFFFD'

	// Have prefix; gather characters. Case-folded runes end the prefix
	// because they match more than one literal spelling.
	var buf bytes.Buffer
	for iop(i) == syntax.InstRune && len(i.Rune) == 1 &&
		syntax.Flags(i.Arg)&syntax.FoldCase == 0 &&
		i.Rune[0] != replacementRune {
		buf.WriteRune(i.Rune[0])
		pc, i = i.Out, &p.Inst[i.Out]
	}
	// The prefix is the whole match only when it is immediately followed by
	// an end-of-text assertion that leads straight to the match instruction.
	if i.Op == syntax.InstEmptyWidth &&
		syntax.EmptyOp(i.Arg)&syntax.EmptyEndText != 0 &&
		p.Inst[i.Out].Op == syntax.InstMatch {
		complete = true
	}
	return buf.String(), complete, pc
}
Exemple #2
0
// sketchOnRegex takes an arbitrary regular expression and returns:
//
//  1. a regular expression that is just like it, but left-anchored. The
//     original is returned unchanged when it is already left-anchored.
//  2. a string literal prefix that all matches of the regular expression
//     have, much like regexp.Regexp.Prefix(). Prefix() does not work in the
//     presence of anchors, so the prefix is computed here by hand.
//
// To do this the expression is re-parsed from re.String() and compiled with
// regexp/syntax so that the instruction stream can be inspected directly.
//
// regexp.Regexp hides a lot of its state, so the abstraction is leaky. The
// biggest leak is that every expression is assumed to be Perl-style rather
// than POSIX. If the expression cannot be parsed, compiled, or anchored, a
// warning is logged and the original expression is returned with an empty
// prefix.
func sketchOnRegex(re *regexp.Regexp) (*regexp.Regexp, string) {
	rawRe := re.String()
	sRe, err := syntax.Parse(rawRe, syntax.Perl)
	if err != nil {
		log.Printf("WARN(web): unable to parse regexp %v as perl. "+
			"This route might behave unexpectedly.", re)
		return re, ""
	}
	sRe = sRe.Simplify()
	p, err := syntax.Compile(sRe)
	if err != nil {
		log.Printf("WARN(web): unable to compile regexp %v. This "+
			"route might behave unexpectedly.", re)
		return re, ""
	}
	if p.StartCond()&syntax.EmptyBeginText == 0 {
		// The expression must be wrapped in a non-capturing group before
		// \A is prepended: alternation binds more loosely than
		// concatenation, so `\A` + `a|b` would anchor only the first
		// alternative. A non-capturing group leaves submatch numbering
		// untouched.
		newRe, err := regexp.Compile(`\A(?:` + rawRe + `)`)
		if err != nil {
			log.Printf("WARN(web): unable to create a left-"+
				"anchored regexp from %v. This route might "+
				"behave unexpectedly", re)
			return re, ""
		}
		re = newRe
	}

	// Run the regular expression more or less by hand, following the single
	// Out edge of each instruction for as long as the path is unambiguous.
	atStart := true
	i := &p.Inst[p.Start]
	var buf bytes.Buffer
Scan:
	for {
		switch i.Op {
		case syntax.InstEmptyWidth:
			// Zero-width assertions are only skipped before the first rune.
			if !atStart {
				break Scan
			}
		case syntax.InstCapture, syntax.InstNop:
			// These consume no input and do not affect the prefix.
		case syntax.InstRune, syntax.InstRune1, syntax.InstRuneAny,
			syntax.InstRuneAnyNotNL:

			atStart = false
			// Only a single, case-sensitive rune extends the prefix.
			// Character classes, the "any" instructions (which carry no
			// rune) and case-folded runes all end it.
			if len(i.Rune) != 1 ||
				syntax.Flags(i.Arg)&syntax.FoldCase != 0 {
				break Scan
			}
			buf.WriteRune(i.Rune[0])
		default:
			// Alternation, match, fail: the prefix cannot grow further.
			break Scan
		}
		i = &p.Inst[i.Out]
	}
	return re, buf.String()
}
Exemple #3
0
// toByteProg walks every instruction that is present in prog when the call
// starts and rewrites the rune-matching ones in place:
//
//   - a rune instruction that oneByteRange reports as a single byte range
//     becomes an instByteRange whose Arg packs lo<<8|hi, with argFold set
//     when the range is case-folded;
//   - any other rune instruction has its ranges (case-folded first when the
//     FoldCase flag is set) handed to a runeBuilder;
//   - the "any rune" instructions hand the full range 0..unicode.MaxRune to
//     the runeBuilder.
//
// The returned error is always nil.
func toByteProg(prog *syntax.Prog) error {
	// forEachRange calls visit once per range in rs. A one-element slice is
	// a single rune; otherwise rs is read as consecutive lo, hi pairs.
	forEachRange := func(rs []rune, visit func(lo, hi rune)) {
		if len(rs) == 1 {
			visit(rs[0], rs[0])
			return
		}
		for j := 0; j < len(rs); j += 2 {
			visit(rs[j], rs[j+1])
		}
	}

	var b runeBuilder
	for pc := range prog.Inst {
		inst := &prog.Inst[pc]
		switch inst.Op {
		case syntax.InstRuneAny, syntax.InstRuneAnyNotNL:
			// All runes.
			// AnyNotNL should exclude \n but the line-at-a-time
			// execution takes care of that for us.
			b.init(prog, uint32(pc), inst.Out)
			b.addRange(0, unicode.MaxRune, false)

		case syntax.InstRune, syntax.InstRune1:
			// Cheap case first: the whole instruction is one byte range.
			lo, hi, fold, ok := oneByteRange(inst)
			if ok {
				packed := uint32(lo)<<8 | uint32(hi)
				if fold {
					packed |= argFold
				}
				inst.Op = instByteRange
				inst.Arg = packed
				continue
			}

			// General rune ranges.
			ranges := inst.Rune
			if syntax.Flags(inst.Arg)&syntax.FoldCase != 0 {
				// Replace the ranges with their case-folded expansion.
				var folded []rune
				forEachRange(ranges, func(lo, hi rune) {
					folded = appendFoldedRange(folded, lo, hi)
				})
				ranges = folded
			}

			b.init(prog, uint32(pc), inst.Out)
			forEachRange(ranges, func(lo, hi rune) {
				b.addRange(lo, hi, false)
			})
		}
	}
	return nil
}
Exemple #4
0
// dumpInst appends a one-line, human-readable rendering of instruction i to
// b: the opcode name, its operands, and the target(s) it continues to.
// Opcodes that are not listed produce no output.
func dumpInst(b *bytes.Buffer, i *syntax.Inst) {
	switch op := i.Op; op {
	case syntax.InstMatch:
		bw(b, "match")
	case syntax.InstFail:
		bw(b, "fail")
	case syntax.InstNop:
		bw(b, "nop -> ", u32(i.Out))

	case syntax.InstAlt, syntax.InstAltMatch:
		// Both alternation forms print their two targets the same way.
		label := "alt -> "
		if op == syntax.InstAltMatch {
			label = "altmatch -> "
		}
		bw(b, label, u32(i.Out), ", ", u32(i.Arg))

	case syntax.InstCapture:
		bw(b, "cap ", u32(i.Arg), " -> ", u32(i.Out))
	case syntax.InstEmptyWidth:
		bw(b, "empty ", u32(i.Arg), " -> ", u32(i.Out))

	case instByteRange:
		// Arg holds the low bound in bits 8-15 and the high bound in bits 0-7.
		loByte := (i.Arg >> 8) & 0xFF
		hiByte := i.Arg & 0xFF
		fmt.Fprintf(b, "byte %02x-%02x", loByte, hiByte)
		if i.Arg&argFold != 0 {
			bw(b, "/i")
		}
		bw(b, " -> ", u32(i.Out))

	// The rune opcodes below should not happen here.
	case syntax.InstRune:
		if i.Rune == nil {
			// shouldn't happen; note that the regular rendering below
			// is still appended after this marker.
			bw(b, "rune <nil>")
		}
		bw(b, "rune ", strconv.QuoteToASCII(string(i.Rune)))
		if syntax.Flags(i.Arg)&syntax.FoldCase != 0 {
			bw(b, "/i")
		}
		bw(b, " -> ", u32(i.Out))
	case syntax.InstRune1:
		bw(b, "rune1 ", strconv.QuoteToASCII(string(i.Rune)), " -> ", u32(i.Out))
	case syntax.InstRuneAny:
		bw(b, "any -> ", u32(i.Out))
	case syntax.InstRuneAnyNotNL:
		bw(b, "anynotnl -> ", u32(i.Out))
	}
}
Exemple #5
0
// oneByteRange reports whether rune instruction i can be expressed as a
// single inclusive byte range lo-hi, optionally case-folded. ok is false
// when it cannot; lo and hi are then zero and must be ignored.
//
// The accepted shapes are:
//   - an InstRune1 whose rune is ASCII;
//   - an InstRune holding one ASCII rune (given as one element or as a
//     lo == hi pair); with the FoldCase flag set, asciiFold must also
//     accept the rune;
//   - an InstRune holding one range that ends below utf8.RuneSelf; with the
//     FoldCase flag set, the range is rejected if asciiFold accepts any
//     rune in it;
//   - an InstRune holding exactly two ASCII runes that are each other's
//     simple case fold (such as 'A' and 'a'), reported as the first rune
//     with fold set.
func oneByteRange(i *syntax.Inst) (lo, hi byte, fold, ok bool) {
	if i.Op == syntax.InstRune1 {
		r := i.Rune[0]
		if r < utf8.RuneSelf {
			return byte(r), byte(r), false, true
		}
	}
	if i.Op != syntax.InstRune {
		return
	}
	fold = syntax.Flags(i.Arg)&syntax.FoldCase != 0
	if len(i.Rune) == 1 || len(i.Rune) == 2 && i.Rune[0] == i.Rune[1] {
		r := i.Rune[0]
		if r >= utf8.RuneSelf {
			return
		}
		if fold && !asciiFold(r) {
			return
		}
		return byte(r), byte(r), fold, true
	}
	if len(i.Rune) == 2 && i.Rune[1] < utf8.RuneSelf {
		if fold {
			for r := i.Rune[0]; r <= i.Rune[1]; r++ {
				if asciiFold(r) {
					return
				}
			}
		}
		return byte(i.Rune[0]), byte(i.Rune[1]), fold, true
	}
	// A two-rune case pair. Both runes must be ASCII: without that check a
	// pair such as 'É'/'é' or 'Ā'/'ā' would be truncated by the byte
	// conversion and reported as a bogus single-byte range.
	if len(i.Rune) == 4 && i.Rune[0] == i.Rune[1] && i.Rune[2] == i.Rune[3] &&
		i.Rune[0] < utf8.RuneSelf && i.Rune[2] < utf8.RuneSelf &&
		unicode.SimpleFold(i.Rune[0]) == i.Rune[2] && unicode.SimpleFold(i.Rune[2]) == i.Rune[0] {
		return byte(i.Rune[0]), byte(i.Rune[0]), true, true
	}

	return
}
Exemple #6
0
// sketchOnRegex takes an arbitrary regular expression and returns:
//
//  1. a regular expression that is just like it, but left-anchored. The
//     original is returned unchanged when it is already left-anchored.
//  2. a string literal prefix that all matches of the regular expression
//     have, much like regexp.Regexp.Prefix(). Prefix() does not work in the
//     presence of anchors, so the prefix is computed here by hand.
//
// To do this the expression is re-parsed from re.String() and compiled with
// regexp/syntax so that the instruction stream can be inspected directly.
//
// regexp.Regexp hides a lot of its state, so the abstraction is leaky. The
// biggest leak is that every expression is assumed to be Perl-style rather
// than POSIX. If the expression cannot be parsed, compiled, or anchored, a
// warning is logged and the original expression is returned with an empty
// prefix.
func sketchOnRegex(re *regexp.Regexp) (*regexp.Regexp, string) {
	// Re-parse the regex from the string representation.
	rawRe := re.String()
	sRe, err := syntax.Parse(rawRe, syntax.Perl)
	if err != nil {
		// TODO: better way to warn?
		log.Printf("WARN(router): unable to parse regexp %v as perl. "+
			"This route might behave unexpectedly.", re)
		return re, ""
	}

	// Simplify and then compile the regex.
	sRe = sRe.Simplify()
	p, err := syntax.Compile(sRe)
	if err != nil {
		// TODO: better way to warn?
		log.Printf("WARN(router): unable to compile regexp %v. This "+
			"route might behave unexpectedly.", re)
		return re, ""
	}

	// If it's not left-anchored, we add that now.
	if p.StartCond()&syntax.EmptyBeginText == 0 {
		// The expression must be wrapped in a non-capturing group before
		// \A is prepended: alternation binds more loosely than
		// concatenation, so `\A` + `a|b` would anchor only the first
		// alternative. A non-capturing group leaves submatch numbering
		// untouched.
		newRe, err := regexp.Compile(`\A(?:` + rawRe + `)`)
		if err != nil {
			// TODO: better way to warn?
			log.Printf("WARN(router): unable to create a left-"+
				"anchored regexp from %v. This route might "+
				"behave unexpectedly", re)
			return re, ""
		}
		re = newRe
	}

	// We run the regular expression more or less by hand in order to
	// calculate the prefix, following the single Out edge of each
	// instruction for as long as the path is unambiguous.
	atStart := true
	i := &p.Inst[p.Start]
	var buf bytes.Buffer
OuterLoop:
	for {
		switch i.Op {

		// There may be an 'empty' operation at the beginning of a regex,
		// due to OpBeginText. Zero-width assertions are only skipped
		// before the first rune.
		case syntax.InstEmptyWidth:
			if !atStart {
				break OuterLoop
			}

		// Captures and no-ops don't affect the prefix.
		case syntax.InstCapture, syntax.InstNop:
			// nop!

		// We handle runes.
		case syntax.InstRune, syntax.InstRune1, syntax.InstRuneAny,
			syntax.InstRuneAnyNotNL:

			atStart = false

			// If we don't have exactly one rune, or if the 'fold case'
			// flag is set, then we don't count this as part of the
			// prefix. Due to Unicode case-folding rules, it's too hard
			// to deal with case insensitivity here.
			if len(i.Rune) != 1 ||
				syntax.Flags(i.Arg)&syntax.FoldCase != 0 {
				break OuterLoop
			}

			// Add to the prefix, continue.
			buf.WriteRune(i.Rune[0])

		// All other instructions may affect the prefix, so we stop.
		default:
			break OuterLoop
		}

		// Continue to the next instruction.
		i = &p.Inst[i.Out]
	}

	return re, buf.String()
}
Exemple #7
0
// makeOnePass creates a onepass Prog, if possible. It is possible if at any alt,
// the match engine can always tell which branch to take. The routine may modify
// p if it is turned into a onepass Prog. If it isn't possible for this to be a
// onepass Prog, the Prog notOnePass is returned. makeOnePass is recursive
// to the size of the Prog.
//
// Instructions of p are rewritten in place while they are being checked, so
// some of them may already have been modified by the time notOnePass is
// returned. Programs with 1000 or more instructions are rejected without
// being inspected.
func makeOnePass(p *onePassProg) *onePassProg {
	// If the machine is very long, it's not worth the time to check if we can use one pass.
	if len(p.Inst) >= 1000 {
		return notOnePass
	}

	var (
		// instQueue is the work list of instruction indexes that still need a
		// top-level check; rune instructions add their successor to it.
		instQueue    = newQueue(len(p.Inst))
		// visitQueue records the indexes already visited during the current
		// top-level check; it is cleared before each one.
		visitQueue   = newQueue(len(p.Inst))
		check        func(uint32, map[uint32]bool) bool
		// onePassRunes holds the rune set computed for each instruction. It
		// is copied into p.Inst[i].Rune only once the whole program has
		// passed.
		onePassRunes = make([][]rune, len(p.Inst))
	)

	// check that paths from Alt instructions are unambiguous, and rebuild the new
	// program as a onepass program
	//
	// m records, per instruction index, whether InstMatch can be reached from
	// it without consuming input. The result is false as soon as an ambiguity
	// is found.
	check = func(pc uint32, m map[uint32]bool) (ok bool) {
		ok = true
		inst := &p.Inst[pc]
		// Already handled during this top-level check: nothing more to do.
		if visitQueue.contains(pc) {
			return
		}
		visitQueue.insert(pc)
		switch inst.Op {
		case syntax.InstAlt, syntax.InstAltMatch:
			// Both legs must be unambiguous themselves. The && short-circuits,
			// so the Arg leg is not examined once the Out leg has failed.
			ok = check(inst.Out, m) && check(inst.Arg, m)
			// check no-input paths to InstMatch
			matchOut := m[inst.Out]
			matchArg := m[inst.Arg]
			// If both legs can match without input the engine cannot choose.
			if matchOut && matchArg {
				ok = false
				break
			}
			// Match on empty goes in inst.Out
			if matchArg {
				inst.Out, inst.Arg = inst.Arg, inst.Out
				matchOut, matchArg = matchArg, matchOut
			}
			if matchOut {
				m[pc] = true
				inst.Op = syntax.InstAltMatch
			}

			// build a dispatch operator from the two legs of the alt.
			// mergeRuneSets signals failure by placing mergeFailed in the
			// first slot of the returned Next slice.
			onePassRunes[pc], inst.Next = mergeRuneSets(
				&onePassRunes[inst.Out], &onePassRunes[inst.Arg], inst.Out, inst.Arg)
			if len(inst.Next) > 0 && inst.Next[0] == mergeFailed {
				ok = false
				break
			}
		case syntax.InstCapture, syntax.InstNop:
			ok = check(inst.Out, m)
			m[pc] = m[inst.Out]
			// pass matching runes back through these no-ops.
			onePassRunes[pc] = append([]rune{}, onePassRunes[inst.Out]...)
			// Next gets len(runes)/2 + 1 entries, all pointing at inst.Out.
			inst.Next = []uint32{}
			for i := len(onePassRunes[pc]) / 2; i >= 0; i-- {
				inst.Next = append(inst.Next, inst.Out)
			}
		case syntax.InstEmptyWidth:
			// Handled exactly like the capture/no-op case above: the
			// successor's match flag and rune set are passed through.
			ok = check(inst.Out, m)
			m[pc] = m[inst.Out]
			onePassRunes[pc] = append([]rune{}, onePassRunes[inst.Out]...)
			inst.Next = []uint32{}
			for i := len(onePassRunes[pc]) / 2; i >= 0; i-- {
				inst.Next = append(inst.Next, inst.Out)
			}
		case syntax.InstMatch, syntax.InstFail:
			m[pc] = inst.Op == syntax.InstMatch
			break
		case syntax.InstRune:
			// A rune instruction consumes input, so it never reaches
			// InstMatch on empty input.
			m[pc] = false
			// A non-empty Next means this instruction was already rewritten.
			if len(inst.Next) > 0 {
				break
			}
			// The successor is not checked recursively; it is queued for its
			// own top-level check.
			instQueue.insert(inst.Out)
			if len(inst.Rune) == 0 {
				onePassRunes[pc] = []rune{}
				inst.Next = []uint32{inst.Out}
				break
			}
			runes := make([]rune, 0)
			if len(inst.Rune) == 1 && syntax.Flags(inst.Arg)&syntax.FoldCase != 0 {
				// A single case-folded rune is expanded into one [r, r] pair
				// for every rune in its simple-fold orbit, then sorted.
				r0 := inst.Rune[0]
				runes = append(runes, r0, r0)
				for r1 := unicode.SimpleFold(r0); r1 != r0; r1 = unicode.SimpleFold(r1) {
					runes = append(runes, r1, r1)
				}
				sort.Sort(runeSlice(runes))
			} else {
				runes = append(runes, inst.Rune...)
			}
			onePassRunes[pc] = runes
			inst.Next = []uint32{}
			for i := len(onePassRunes[pc]) / 2; i >= 0; i-- {
				inst.Next = append(inst.Next, inst.Out)
			}
			inst.Op = syntax.InstRune
		case syntax.InstRune1:
			m[pc] = false
			if len(inst.Next) > 0 {
				break
			}
			instQueue.insert(inst.Out)
			runes := []rune{}
			// expand case-folded runes
			if syntax.Flags(inst.Arg)&syntax.FoldCase != 0 {
				r0 := inst.Rune[0]
				runes = append(runes, r0, r0)
				for r1 := unicode.SimpleFold(r0); r1 != r0; r1 = unicode.SimpleFold(r1) {
					runes = append(runes, r1, r1)
				}
				sort.Sort(runeSlice(runes))
			} else {
				// Stored as a [r, r] pair, the same form the folded case uses.
				runes = append(runes, inst.Rune[0], inst.Rune[0])
			}
			onePassRunes[pc] = runes
			inst.Next = []uint32{}
			for i := len(onePassRunes[pc]) / 2; i >= 0; i-- {
				inst.Next = append(inst.Next, inst.Out)
			}
			// The single-rune opcode is rewritten to the general rune opcode.
			inst.Op = syntax.InstRune
		case syntax.InstRuneAny:
			m[pc] = false
			if len(inst.Next) > 0 {
				break
			}
			instQueue.insert(inst.Out)
			onePassRunes[pc] = append([]rune{}, anyRune...)
			inst.Next = []uint32{inst.Out}
		case syntax.InstRuneAnyNotNL:
			m[pc] = false
			if len(inst.Next) > 0 {
				break
			}
			instQueue.insert(inst.Out)
			onePassRunes[pc] = append([]rune{}, anyRuneNotNL...)
			inst.Next = []uint32{}
			for i := len(onePassRunes[pc]) / 2; i >= 0; i-- {
				inst.Next = append(inst.Next, inst.Out)
			}
		}
		return
	}

	// Drive the check from the start instruction, then from every successor
	// that a rune instruction queued along the way.
	instQueue.clear()
	instQueue.insert(uint32(p.Start))
	m := make(map[uint32]bool, len(p.Inst))
	for !instQueue.empty() {
		visitQueue.clear()
		pc := instQueue.next()
		if !check(uint32(pc), m) {
			p = notOnePass
			break
		}
	}
	// Only a program that passed every check receives the computed rune sets.
	if p != notOnePass {
		for i := range p.Inst {
			p.Inst[i].Rune = onePassRunes[i]
		}
	}
	return p
}