Example #1
0
// OnePassPrefix returns a literal string that all matches for the
// regexp must start with.  Complete is true if the prefix
// is the entire match. Pc is the index of the last rune instruction
// in the string. The OnePassPrefix skips over the mandatory
// EmptyBeginText
func onePassPrefix(p *syntax.Prog) (prefix string, complete bool, pc uint32) {
	i := &p.Inst[p.Start]
	if i.Op != syntax.InstEmptyWidth || (syntax.EmptyOp(i.Arg))&syntax.EmptyBeginText == 0 {
		return "", i.Op == syntax.InstMatch, uint32(p.Start)
	}
	pc = i.Out
	i = &p.Inst[pc]
	for i.Op == syntax.InstNop {
		pc = i.Out
		i = &p.Inst[pc]
	}
	// Avoid allocation of buffer if prefix is empty.
	if iop(i) != syntax.InstRune || len(i.Rune) != 1 {
		return "", i.Op == syntax.InstMatch, uint32(p.Start)
	}

	// Have prefix; gather characters.
	var buf bytes.Buffer
	for iop(i) == syntax.InstRune && len(i.Rune) == 1 && syntax.Flags(i.Arg)&syntax.FoldCase == 0 {
		buf.WriteRune(i.Rune[0])
		pc, i = i.Out, &p.Inst[i.Out]
	}
	if i.Op == syntax.InstEmptyWidth &&
		syntax.EmptyOp(i.Arg)&syntax.EmptyEndText != 0 &&
		p.Inst[i.Out].Op == syntax.InstMatch {
		complete = true
	}
	return buf.String(), complete, pc
}
Example #2
0
// backtrack runs a backtracking search of prog on the input starting at pos.
func (m *machine) backtrack(i input, pos int, end int, ncap int) bool {
	if !i.canCheckPrefix() {
		panic("backtrack called for a RuneReader")
	}

	startCond := m.re.cond
	if startCond == ^syntax.EmptyOp(0) { // impossible
		return false
	}
	if startCond&syntax.EmptyBeginText != 0 && pos != 0 {
		// Anchored match, past beginning of text.
		return false
	}

	b := m.b
	b.reset(end, ncap)

	m.matchcap = m.matchcap[:ncap]
	for i := range m.matchcap {
		m.matchcap[i] = -1
	}

	// Anchored search must start at the beginning of the input
	if startCond&syntax.EmptyBeginText != 0 {
		if len(b.cap) > 0 {
			b.cap[0] = pos
		}
		return m.tryBacktrack(b, i, uint32(m.p.Start), pos)
	}

	// Unanchored search, starting from each possible text position.
	// Notice that we have to try the empty string at the end of
	// the text, so the loop condition is pos <= end, not pos < end.
	// This looks like it's quadratic in the size of the text,
	// but we are not clearing visited between calls to TrySearch,
	// so no work is duplicated and it ends up still being linear.
	width := -1
	for ; pos <= end && width != 0; pos += width {
		if len(m.re.prefix) > 0 {
			// Match requires literal prefix; fast search for it.
			advance := i.index(m.re, pos)
			if advance < 0 {
				return false
			}
			pos += advance
		}

		if len(b.cap) > 0 {
			b.cap[0] = pos
		}
		if m.tryBacktrack(b, i, uint32(m.p.Start), pos) {
			// Match must be leftmost; done.
			return true
		}
		_, width = i.step(pos)
	}
	return false
}
Example #3
0
// compileOnePass returns a new *syntax.Prog suitable for onePass execution if the original Prog
// can be recharacterized as a one-pass regexp program, or syntax.notOnePass if the
// Prog cannot be converted. For a one pass prog, the fundamental condition that must
// be true is: at any InstAlt, there must be no ambiguity about what branch to  take.
func compileOnePass(prog *syntax.Prog) (p *onePassProg) {
	if prog.Start == 0 {
		return notOnePass
	}
	// onepass regexp is anchored
	if prog.Inst[prog.Start].Op != syntax.InstEmptyWidth ||
		syntax.EmptyOp(prog.Inst[prog.Start].Arg)&syntax.EmptyBeginText != syntax.EmptyBeginText {
		return notOnePass
	}
	// every instruction leading to InstMatch must be EmptyEndText
	for _, inst := range prog.Inst {
		opOut := prog.Inst[inst.Out].Op
		switch inst.Op {
		default:
			if opOut == syntax.InstMatch {
				return notOnePass
			}
		case syntax.InstAlt, syntax.InstAltMatch:
			if opOut == syntax.InstMatch || prog.Inst[inst.Arg].Op == syntax.InstMatch {
				return notOnePass
			}
		case syntax.InstEmptyWidth:
			if opOut == syntax.InstMatch {
				if syntax.EmptyOp(inst.Arg)&syntax.EmptyEndText == syntax.EmptyEndText {
					continue
				}
				return notOnePass
			}
		}
	}
	// Creates a slightly optimized copy of the original Prog
	// that cleans up some Prog idioms that block valid onepass programs
	p = onePassCopy(prog)

	// checkAmbiguity on InstAlts, build onepass Prog if possible
	p = makeOnePass(p)

	if p != notOnePass {
		cleanupOnePass(p, prog)
	}
	return p
}
Example #4
0
File: exec.go Project: agext/regexp
// add adds an entry to q for pc, unless the q already has such an entry.
// It also recursively adds an entry for all instructions reachable from pc by following
// empty-width conditions satisfied by cond.  pos gives the current position
// in the input.
func (m *machine) add(q *queue, pc uint32, pos int, cap []int, cond syntax.EmptyOp, t *thread) *thread {
	if pc == 0 {
		return t
	}
	if j := q.sparse[pc]; j < uint32(len(q.dense)) && q.dense[j].pc == pc {
		return t
	}

	j := len(q.dense)
	q.dense = q.dense[:j+1]
	d := &q.dense[j]
	d.t = nil
	d.pc = pc
	q.sparse[pc] = uint32(j)

	i := &m.p.Inst[pc]
	switch i.Op {
	default:
		panic("unhandled")
	case syntax.InstFail:
		// nothing
	case syntax.InstAlt, syntax.InstAltMatch:
		t = m.add(q, i.Out, pos, cap, cond, t)
		t = m.add(q, i.Arg, pos, cap, cond, t)
	case syntax.InstEmptyWidth:
		if syntax.EmptyOp(i.Arg)&^cond == 0 {
			t = m.add(q, i.Out, pos, cap, cond, t)
		}
	case syntax.InstNop:
		t = m.add(q, i.Out, pos, cap, cond, t)
	case syntax.InstCapture:
		if int(i.Arg) < len(cap) {
			opos := cap[i.Arg]
			cap[i.Arg] = pos
			m.add(q, i.Out, pos, cap, cond, nil)
			cap[i.Arg] = opos
		} else {
			t = m.add(q, i.Out, pos, cap, cond, t)
		}
	case syntax.InstMatch, syntax.InstRune, syntax.InstRune1, syntax.InstRuneAny, syntax.InstRuneAnyNotNL:
		if t == nil {
			t = m.alloc(i)
		} else {
			t.inst = i
		}
		if len(cap) > 0 && &t.cap[0] != &cap[0] {
			copy(t.cap, cap)
		}
		d.t = t
		t = nil
	}
	return t
}
Example #5
0
File: exec.go Project: agext/regexp
// onepass runs the machine over the input starting at pos.
// It reports whether a match was found.
// If so, m.matchcap holds the submatch information.
func (m *machine) onepass(i input, pos int) bool {
	startCond := m.re.cond
	if startCond == ^syntax.EmptyOp(0) { // impossible
		return false
	}
	m.matched = false
	for i := range m.matchcap {
		m.matchcap[i] = -1
	}
	r, r1 := endOfText, endOfText
	width, width1 := 0, 0
	r, width = i.step(pos)
	if r != endOfText {
		r1, width1 = i.step(pos + width)
	}
	var flag syntax.EmptyOp
	if pos == 0 {
		flag = syntax.EmptyOpContext(-1, r)
	} else {
		flag = i.context(pos)
	}
	pc := m.op.Start
	inst := m.op.Inst[pc]
	// If there is a simple literal prefix, skip over it.
	if pos == 0 && syntax.EmptyOp(inst.Arg)&^flag == 0 &&
		len(m.re.prefix) > 0 && i.canCheckPrefix() {
		// Match requires literal prefix; fast search for it.
		if i.hasPrefix(m.re) {
			pos += len(m.re.prefix)
			r, width = i.step(pos)
			r1, width1 = i.step(pos + width)
			flag = i.context(pos)
			pc = int(m.re.prefixEnd)
		} else {
			return m.matched
		}
	}
	for {
		inst = m.op.Inst[pc]
		pc = int(inst.Out)
		switch inst.Op {
		default:
			panic("bad inst")
		case syntax.InstMatch:
			m.matched = true
			if len(m.matchcap) > 0 {
				m.matchcap[0] = 0
				m.matchcap[1] = pos
			}
			return m.matched
		case syntax.InstRune:
			if !inst.MatchRune(r) {
				return m.matched
			}
		case syntax.InstRune1:
			if r != inst.Rune[0] {
				return m.matched
			}
		case syntax.InstRuneAny:
			// Nothing
		case syntax.InstRuneAnyNotNL:
			if r == '\n' {
				return m.matched
			}
		// peek at the input rune to see which branch of the Alt to take
		case syntax.InstAlt, syntax.InstAltMatch:
			pc = int(onePassNext(&inst, r))
			continue
		case syntax.InstFail:
			return m.matched
		case syntax.InstNop:
			continue
		case syntax.InstEmptyWidth:
			if syntax.EmptyOp(inst.Arg)&^flag != 0 {
				return m.matched
			}
			continue
		case syntax.InstCapture:
			if int(inst.Arg) < len(m.matchcap) {
				m.matchcap[inst.Arg] = pos
			}
			continue
		}
		if width == 0 {
			break
		}
		flag = syntax.EmptyOpContext(r, r1)
		pos += width
		r, width = r1, width1
		if r != endOfText {
			r1, width1 = i.step(pos + width)
		}
	}
	return m.matched
}
Example #6
0
File: exec.go Project: agext/regexp
// match runs the machine over the input starting at pos.
// It reports whether a match was found.
// If so, m.matchcap holds the submatch information.
func (m *machine) match(i input, pos int) bool {
	startCond := m.re.cond
	if startCond == ^syntax.EmptyOp(0) { // impossible
		return false
	}
	m.matched = false
	for i := range m.matchcap {
		m.matchcap[i] = -1
	}
	runq, nextq := &m.q0, &m.q1
	r, r1 := endOfText, endOfText
	width, width1 := 0, 0
	r, width = i.step(pos)
	if r != endOfText {
		r1, width1 = i.step(pos + width)
	}
	var flag syntax.EmptyOp
	if pos == 0 {
		flag = syntax.EmptyOpContext(-1, r)
	} else {
		flag = i.context(pos)
	}
	for {
		if len(runq.dense) == 0 {
			if startCond&syntax.EmptyBeginText != 0 && pos != 0 {
				// Anchored match, past beginning of text.
				break
			}
			if m.matched {
				// Have match; finished exploring alternatives.
				break
			}
			if len(m.re.prefix) > 0 && r1 != m.re.prefixRune && i.canCheckPrefix() {
				// Match requires literal prefix; fast search for it.
				advance := i.index(m.re, pos)
				if advance < 0 {
					break
				}
				pos += advance
				r, width = i.step(pos)
				r1, width1 = i.step(pos + width)
			}
		}
		if !m.matched {
			if len(m.matchcap) > 0 {
				m.matchcap[0] = pos
			}
			m.add(runq, uint32(m.p.Start), pos, m.matchcap, flag, nil)
		}
		flag = syntax.EmptyOpContext(r, r1)
		m.step(runq, nextq, pos, pos+width, r, flag)
		if width == 0 {
			break
		}
		if len(m.matchcap) == 0 && m.matched {
			// Found a match and not paying attention
			// to where it is, so any match will do.
			break
		}
		pos += width
		r, width = r1, width1
		if r != endOfText {
			r1, width1 = i.step(pos + width)
		}
		runq, nextq = nextq, runq
	}
	m.clear(nextq)
	return m.matched
}
Example #7
0
// tryBacktrack runs a backtracking search starting at pos.
func (m *machine) tryBacktrack(b *bitState, i input, pc uint32, pos int) bool {
	longest := m.re.longest
	m.matched = false

	b.push(pc, pos, 0)
	for len(b.jobs) > 0 {
		l := len(b.jobs) - 1
		// Pop job off the stack.
		pc := b.jobs[l].pc
		pos := b.jobs[l].pos
		arg := b.jobs[l].arg
		b.jobs = b.jobs[:l]

		// Optimization: rather than push and pop,
		// code that is going to Push and continue
		// the loop simply updates ip, p, and arg
		// and jumps to CheckAndLoop.  We have to
		// do the ShouldVisit check that Push
		// would have, but we avoid the stack
		// manipulation.
		goto Skip
	CheckAndLoop:
		if !b.shouldVisit(pc, pos) {
			continue
		}
	Skip:

		inst := b.prog.Inst[pc]

		switch inst.Op {
		default:
			panic("bad inst")
		case syntax.InstFail:
			panic("unexpected InstFail")
		case syntax.InstAlt:
			// Cannot just
			//   b.push(inst.Out, pos, 0)
			//   b.push(inst.Arg, pos, 0)
			// If during the processing of inst.Out, we encounter
			// inst.Arg via another path, we want to process it then.
			// Pushing it here will inhibit that. Instead, re-push
			// inst with arg==1 as a reminder to push inst.Arg out
			// later.
			switch arg {
			case 0:
				b.push(pc, pos, 1)
				pc = inst.Out
				goto CheckAndLoop
			case 1:
				// Finished inst.Out; try inst.Arg.
				arg = 0
				pc = inst.Arg
				goto CheckAndLoop
			}
			panic("bad arg in InstAlt")

		case syntax.InstAltMatch:
			// One opcode consumes runes; the other leads to match.
			switch b.prog.Inst[inst.Out].Op {
			case syntax.InstRune, syntax.InstRune1, syntax.InstRuneAny, syntax.InstRuneAnyNotNL:
				// inst.Arg is the match.
				b.push(inst.Arg, pos, 0)
				pc = inst.Arg
				pos = b.end
				goto CheckAndLoop
			}
			// inst.Out is the match - non-greedy
			b.push(inst.Out, b.end, 0)
			pc = inst.Out
			goto CheckAndLoop

		case syntax.InstRune:
			r, width := i.step(pos)
			if !inst.MatchRune(r) {
				continue
			}
			pos += width
			pc = inst.Out
			goto CheckAndLoop

		case syntax.InstRune1:
			r, width := i.step(pos)
			if r != inst.Rune[0] {
				continue
			}
			pos += width
			pc = inst.Out
			goto CheckAndLoop

		case syntax.InstRuneAnyNotNL:
			r, width := i.step(pos)
			if r == '\n' || r == endOfText {
				continue
			}
			pos += width
			pc = inst.Out
			goto CheckAndLoop

		case syntax.InstRuneAny:
			r, width := i.step(pos)
			if r == endOfText {
				continue
			}
			pos += width
			pc = inst.Out
			goto CheckAndLoop

		case syntax.InstCapture:
			switch arg {
			case 0:
				if 0 <= inst.Arg && inst.Arg < uint32(len(b.cap)) {
					// Capture pos to register, but save old value.
					b.push(pc, b.cap[inst.Arg], 1) // come back when we're done.
					b.cap[inst.Arg] = pos
				}
				pc = inst.Out
				goto CheckAndLoop
			case 1:
				// Finished inst.Out; restore the old value.
				b.cap[inst.Arg] = pos
				continue

			}
			panic("bad arg in InstCapture")
			continue

		case syntax.InstEmptyWidth:
			if syntax.EmptyOp(inst.Arg)&^i.context(pos) != 0 {
				continue
			}
			pc = inst.Out
			goto CheckAndLoop

		case syntax.InstNop:
			pc = inst.Out
			goto CheckAndLoop

		case syntax.InstMatch:
			// We found a match. If the caller doesn't care
			// where the match is, no point going further.
			if len(b.cap) == 0 {
				m.matched = true
				return m.matched
			}

			// Record best match so far.
			// Only need to check end point, because this entire
			// call is only considering one start position.
			if len(b.cap) > 1 {
				b.cap[1] = pos
			}
			if !m.matched || (longest && pos > 0 && pos > m.matchcap[1]) {
				copy(m.matchcap, b.cap)
			}
			m.matched = true

			// If going for first match, we're done.
			if !longest {
				return m.matched
			}

			// If we used the entire text, no longer match is possible.
			if pos == b.end {
				return m.matched
			}

			// Otherwise, continue on in hope of a longer match.
			continue
		}
		panic("unreachable")
	}

	return m.matched
}