This repository has been archived by the owner on Nov 22, 2018. It is now read-only.
/
lex.go
383 lines (355 loc) · 10.6 KB
/
lex.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
// Copyright (c) 2014 The lex Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Package lex provides support for a *nix (f)lex like tool on .l sources.
// The syntax is similar to a subset of (f)lex, see also: http://flex.sourceforge.net/manual/Format.html#Format
//
// Changelog
//
// 2014-11-18: Add option for marking an accepting state. Required to support
// POSIX longest match.
//
// Some feature examples:
//
// /* Unindented multiline Go comments in the definitions section */
//
// Any indented text in the definitions section
//
// %{
// Any text in the definitions section within %{ and %}
// %}
//
// D [0-9]
//
// %s non-exclusive-start-condition s2 s3
//
// %x exclusive-start-condition e2
//
// %yyt getTopState() // not required when only INITIAL start condition exists
// %yyb last == '\n' || last == '\0'
// %yyc getCurrentChar()
// %yyn move() // get next character
// %yym mark() // now in accepting state
//
// %%
// Indented text before the first rule is presumably treated specially (renderer specific)
//
// {D}+ return(INT)
//
// {D}+\.{D}+
// return(FLOAT)
//
// [a-z][a-z0-9]+
// /* identifier found */
// return(IDENT)
//
// A"[foo]\"bar"Z println(`A[foo]"barZ`)
//
// ^bol|eol$
//
// <non-exclusive-start-condition>foo
// %{
// println("foo found")
// %}
//
// <s2,s3>bar
//
// <INITIAL,e2>abc
//
// <*>"always" println("active in all start conditions")
//
// %%
// The optional user code section. Possibly the place where a lexeme recognition failure will
// be handled (renderer specific).
// Missing/differing functionality of the .l parser/FSM generator (compared to flex):
// - Trailing context (re1/re2).
// - No requirement of an action to start on the same line as the pattern.
// - Processing of actions enclosed in braces. This package mostly treats
// any non blank text following a pattern up to the next pattern as an action source code.
// - All flex % prefixed options except %s and %x.
// - Flex incompatible %yy* options
// - No cclasses ([[:digit:]]).
// - Anything special after '(?'.
// - Matching <<EOF>>. Still \0 is OK in a pattern.
// - And probably more.
package lex
import (
"bytes"
"errors"
"fmt"
"github.com/cznic/lexer"
"go/token"
"io"
"sort"
"strings"
"github.com/cznic/fileutil"
)
// rule holds the parsed form of a single pattern/action rule from the rules
// section of a .l source.
type rule struct {
	conds               []string        // start conditions the rule is active in
	pattern, re, action string          // original pattern, derived regular expression, action source code
	in, bolIn           *lexer.NfaState // NFA entry states: plain and beginning-of-line variants
	bol, eol            bool            // pattern carries a ^ (BOL) and/or $ (EOL) assertion
}
// Package-level parser/generator state. As NewL documents, this package is
// not reentrant; these variables hold the state of the single permitted run.
var (
	defs             = map[string]string{}         // definition name -> definition text
	defPos           = map[string]token.Position{} // definition name -> source position
	defCode          []string                      // code lines collected from the definitions section
	defRE            = map[string]string{}         // definition name -> expanded regular expression
	errors_          []string                      // accumulated error messages (appended by logErr)
	rules            = []rule{{}}                  // rules[0] is a pseudo rule (code before the first rule)
	rulePos          = []token.Position{{}}        // source position of each rule, parallel to rules
	unreachableRules = map[int]bool{}              // rule indexes no DFA accept state maps to
	usrCode          string                        // verbatim user code section
	sStarts          = []string{"INITIAL"}         // non-exclusive (%s) start condition names
	xStarts          []string                      // exclusive (%x) start condition names
	isXStart         = map[string]bool{}           // start condition name -> declared exclusive
	iStarts          = map[string]int{"INITIAL": 0}      // start condition name -> numeric id
	defStarts        = map[string]bool{"INITIAL": true}  // declared start condition names
	unrefStarts      = map[string]bool{}           // declared but never referenced start conditions
	allDfa           *dfa                          // combined DFA covering all start conditions
	_yyt             = "yyt"                       // %yyt source text (see L.YYT)
	_yyb             = "yyb"                       // %yyb source text (see L.YYB)
	_yyc             = "yyc"                       // %yyc source text (see L.YYC)
	_yyn             = "yyn"                       // %yyn source text (see L.YYN)
	_yym             = "yym"                       // %yym source text (see L.YYM)
	nodfaopt         bool                          // disable DFA optimization (NewL's unoptdfa argument)
	bits32           bool                          // enables unicode rune processing, standard is byte
	caseless         bool                          // presumably enables case-insensitive matching — set elsewhere; TODO confirm
)
// logErr appends msg to the package-level list of accumulated error messages.
func logErr(msg string) {
	errors_ = append(errors_, msg)
}
// Rule represents data for a pattern/action
type Rule struct {
	Conds   []string // Start conditions of the rule
	Pattern string   // Original rule's pattern
	BOL     bool     // Pattern starts with beginning of line assertion (^)
	EOL     bool     // Pattern ends with end of line ($) assertion
	RE      string   // Pattern translated to a regular expression
	Action  string   // Rule's associated action source code
}
// L represents selected data structures found in / generated from a .l source.
// A [command line] tool using this package may then render L to some
// programming language source code and/or data table(s).
type L struct {
	// Source code lines for rendering from the definitions section
	DefCode []string
	// Names of declared start conditions with their respective numeric
	// identificators
	StartConditions map[string]int
	// Start conditions numeric identificators with their respective DFA
	// start state
	StartConditionsStates map[int]*lexer.NfaState
	// Beginning of line start conditions numeric identificators with their
	// respective DFA start state
	StartConditionsBolStates map[int]*lexer.NfaState
	// Rule[0] is a pseudo rule. Its action contains the source code for
	// rendering from the rules section before the first rule
	Rules []Rule
	// The generated FSM
	Dfa lexer.Nfa
	// Accept states with their respective rule index
	Accepts map[*lexer.NfaState]int
	// Source code for rendering from the user code section
	UserCode string
	// Source code for rendering of get_current_start_condition. Set by
	// %yyt.
	YYT string
	// Source code for rendering of get_bol, i.e. if we are at the
	// beginning of line right now. Set by %yyb.
	YYB string
	// Source code for rendering of get_peek_char, i.e. the char the lexer
	// will now consider in making of a decision. Set by %yyc.
	YYC string
	// Source code for rendering of move_to_next_char, i.e. "consume" the
	// current peek char and go to the next one. Set by %yyn.
	YYN string
	// Source code for rendering of mark_accepting, support to accept
	// longest matching but reusing the "overflowed" input. Set by %yym.
	YYM string
}
// DfaString returns the textual representation of the Dfa field.
//
// Output is deterministic: start conditions are emitted in name order and
// accept states in state-index order, even though the underlying maps have
// random iteration order.
func (l *L) DfaString() string {
	buf := bytes.NewBuffer(nil)
	buf.WriteString("StartConditions:\n")
	// Stabilize: sort the start condition names before emitting.
	names := make([]string, 0, len(l.StartConditions))
	for name := range l.StartConditions {
		names = append(names, name)
	}
	sort.Strings(names)
	for _, name := range names {
		id := l.StartConditions[name]
		if s := l.StartConditionsStates[id]; s != nil {
			fmt.Fprintf(buf, "\t%s, scId:%d, stateId:%d\n", name, id, s.Index)
		}
		if s := l.StartConditionsBolStates[id]; s != nil {
			fmt.Fprintf(buf, "\t^%s, scId:%d, stateId:%d\n", name, id, s.Index)
		}
	}
	fmt.Fprintf(buf, "DFA:%s\n", l.Dfa.String())
	// Stabilize: collect accept state indexes, then emit in sorted order.
	as := make([]int, 0, len(l.Accepts))
	ar := make(map[int]int, len(l.Accepts))
	for state, rule := range l.Accepts {
		i := int(state.Index)
		as = append(as, i)
		ar[i] = rule
	}
	sort.Ints(as)
	for _, state := range as {
		fmt.Fprintf(buf, "state %d accepts rule %d\n", state, ar[state])
	}
	return buf.String()
}
// String returns a human readable dump of l: definitions code, all rules with
// their start conditions, patterns, assertions and actions, the %yy* option
// sources, the DFA representation and the user code section.
//
// Fix: the original omitted the YYM field (added 2014-11-18 per the package
// changelog) although YYT/YYB/YYC/YYN were printed; YYM is now included.
func (l *L) String() string {
	buf := bytes.NewBuffer(nil)
	if s := l.DefCode; len(s) != 0 {
		fmt.Fprintf(buf, "DefCode: %q\n", strings.Join(s, ""))
	}
	for id, rule := range l.Rules {
		// Rule 0 is a pseudo rule; skip it when it carries no action code.
		if id == 0 && rule.Action == "" {
			continue
		}
		fmt.Fprintf(buf, "Rule %d\n", id)
		if s := rule.Conds; len(s) != 0 {
			fmt.Fprintf(buf, "\tsc:<%s>\n", strings.Join(s, ","))
		}
		if s := rule.Pattern; s != "" {
			fmt.Fprintf(buf, "\tpattern:`%s`\n", s)
			if rule.BOL || rule.EOL {
				buf.WriteString("\tasserts: ")
				if rule.BOL {
					buf.WriteString("BOL ")
				}
				if rule.EOL {
					buf.WriteString("EOL")
				}
				buf.WriteString("\n")
			}
			fmt.Fprintf(buf, "\tre:`%s`\n", rule.RE)
		}
		if s := rule.Action; s != "" {
			fmt.Fprintf(buf, "\taction:%q\n", s)
		}
	}
	if s := l.YYT; s != "" {
		fmt.Fprintf(buf, "YYT: `%s`\n", s)
	}
	if s := l.YYB; s != "" {
		fmt.Fprintf(buf, "YYB: `%s`\n", s)
	}
	if s := l.YYC; s != "" {
		fmt.Fprintf(buf, "YYC: `%s`\n", s)
	}
	if s := l.YYN; s != "" {
		fmt.Fprintf(buf, "YYN: `%s`\n", s)
	}
	if s := l.YYM; s != "" {
		fmt.Fprintf(buf, "YYM: `%s`\n", s)
	}
	buf.WriteString(l.DfaString())
	if s := l.UserCode; s != "" {
		fmt.Fprintf(buf, "UserCode: %q\n", s)
	}
	return buf.String()
}
var hook bool
// NewL parses a .l source fname from src, returns L or an error if any.
//
// Currently it is not reentrant and not invokable more than once in an
// application (which is assumed tolerable for a "lex" tool) — it mutates
// package-level state.
//
// The unoptdfa argument allows to disable optimization of the produced DFA.
// The mode32 parameter is not yet supported and must be false.
func NewL(fname string, src io.RuneReader, unoptdfa, mode32 bool) (l *L, err error) {
	if mode32 {
		return nil, errors.New("lex.NewL: mode32 unsupported yet")
	}
	nodfaopt, bits32 = unoptdfa, mode32
	l = &L{}
	if !hook {
		defer func() {
			if e := recover(); e != nil {
				l = nil
				// The panic value is normally an error, but guard against
				// other values instead of re-panicking on a failed type
				// assertion (the original `err = e.(error)` would panic
				// again for a non-error panic value).
				switch x := e.(type) {
				case error:
					err = x
				default:
					err = fmt.Errorf("%v", x)
				}
			}
		}()
	}
	// Read the whole input; support \r\n line separators too by normalizing
	// them to \n before scanning.
	var in []rune
loop:
	for {
		r, _, err := src.ReadRune()
		switch {
		case err == nil:
			in = append(in, r)
		case fileutil.IsEOF(err):
			break loop
		default:
			return nil, err
		}
	}
	src = bytes.NewBufferString(strings.Replace(string(in), "\r\n", "\n", -1))

	// Parse the .l source; errors are accumulated in errors_ by logErr.
	scanner := lxr.Scanner(fname, src)
	if y := yyParse(newTokenizer(scanner)); y != 0 || len(errors_) != 0 {
		return nil, errors.New(strings.Join(errors_, "\n"))
	}

	computePartialDFAs()
	if len(errors_) != 0 {
		return nil, errors.New(strings.Join(errors_, "\n"))
	}

	computeAllNfa()
	allDfa = allNfa.powerSet()
	// A rule is reachable iff some accept state of the combined DFA maps to
	// it; whatever remains in unreachableRules is reported as an error.
	for _, irule := range allDfa.acceptRule {
		delete(unreachableRules, irule)
	}
	for irule := range unreachableRules {
		logErr(fmt.Sprintf("%s - pattern `%s` unreachable", rulePos[irule], rules[irule].pattern))
	}
	if len(errors_) != 0 {
		return nil, errors.New(strings.Join(errors_, "\n"))
	}

	// Fill in the exported result structure.
	l.DefCode = defCode
	l.UserCode = usrCode
	l.StartConditions = iStarts
	l.StartConditionsStates = make(map[int]*lexer.NfaState)
	l.StartConditionsBolStates = make(map[int]*lexer.NfaState)
	// The DFA's entry edges are keyed by pseudo rune values: a value < 128 is
	// a start condition id, a value >= 128 is id+128 for that condition's
	// beginning-of-line variant.
	for _, edge0 := range allDfa.nfa.in.Consuming {
		switch edge := edge0.(type) {
		default:
			panic(errors.New("internal error"))
		case *lexer.RuneEdge:
			if _, ok := l.StartConditionsStates[int(edge.Rune)]; ok {
				panic(errors.New("internal error"))
			}
			if edge.Rune < 128 {
				l.StartConditionsStates[int(edge.Rune)] = edge.Target()
			} else {
				l.StartConditionsBolStates[int(edge.Rune)-128] = edge.Target()
			}
		case *lexer.RangesEdge:
			for _, rng := range edge.Ranges.R32 {
				for arune := rng.Lo; arune <= rng.Hi; arune += rng.Stride {
					if _, ok := l.StartConditionsStates[int(arune)]; ok {
						panic(errors.New("internal error"))
					}
					if arune < 128 {
						l.StartConditionsStates[int(arune)] = edge.Target()
					} else {
						l.StartConditionsBolStates[int(arune)-128] = edge.Target()
					}
				}
			}
		}
	}
	for _, rule := range rules {
		l.Rules = append(l.Rules, Rule{Conds: rule.conds, Pattern: rule.pattern, RE: rule.re, Action: rule.action, BOL: rule.bol, EOL: rule.eol})
	}
	l.Dfa = allDfa.nfa.nfa[1:]
	l.Accepts = map[*lexer.NfaState]int{}
	for id, state := range allDfa.accept {
		l.Accepts[state] = allDfa.acceptRule[id]
	}
	l.YYT = _yyt
	l.YYB = _yyb
	l.YYC = _yyc
	l.YYN = _yyn
	l.YYM = _yym
	return
}