forked from cznic/lexer
/
lexer.go
418 lines (371 loc) · 11.1 KB
/
lexer.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
// Copyright (c) 2011 CZ.NIC z.s.p.o. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// blame:
// jnml, labs.nic.cz
// Miek Gieben, SIDN, miek@miek.nl
//TODO(jnml) complete package docs
/*
Package lexer provides generating actionless scanners (lexeme recognizers) at run time.
Scanners are defined by regular expressions
and/or lexical grammars, mapping between those definitions, token numeric identifiers and
an optional set of starting id sets, providing simmilar functionality as switching start states in *nix LEX.
The generated FSMs are Unicode arune based and all unicode.Categories and unicode.Scripts are supported by the
regexp syntax using the \p{name} construct.
Syntax supported by ParseRE (ATM a very basic subset of RE2, docs bellow are a mod of: http://code.google.com/p/re2/wiki/Syntax, original docs license unclear)
Single characters:
. any character, excluding newline
[xyz] character class
[^xyz] negated character class
\p{Greek} Unicode character class
\P{Greek} negated Unicode character class
Composites:
xy x followed by y
x|y x or y
Repetitions:
x* zero or more x
x+ one or more x
x? zero or one x
Grouping:
(re) group
Empty strings:
^ at beginning of text or line
$ at end of text or line
\A at beginning of text
\z at end of text
Escape sequences:
\a bell (≡ \007)
\b backspace (≡ \010)
\f form feed (≡ \014)
\n newline (≡ \012)
\r carriage return (≡ \015)
\t horizontal tab (≡ \011)
\v vertical tab character (≡ \013)
\M M is one of metachars \.+*?()|[]^$
\xhh arune \u00hh, h is a hex digit
Character class elements:
x single Unicode character
A-Z Unicode character range (inclusive)
Unicode character class names--general category:
Cc control
Cf format
Co private use
Cs surrogate
letter Lu, Ll, Lt, Lm, or Lo
Ll lowercase letter
Lm modifier letter
Lo other letter
Lt titlecase letter
Lu uppercase letter
Mc spacing mark
Me enclosing mark
Mn non-spacing mark
Nd decimal number
Nl letter number
No other number
Pc connector punctuation
Pd dash punctuation
Pe close punctuation
Pf final punctuation
Pi initial punctuation
Po other punctuation
Ps open punctuation
Sc currency symbol
Sk modifier symbol
Sm math symbol
So other symbol
Zl line separator
Zp paragraph separator
Zs space separator
Unicode character class names--scripts:
Arabic Arabic
Armenian Armenian
Avestan Avestan
Balinese Balinese
Bamum Bamum
Bengali Bengali
Bopomofo Bopomofo
Braille Braille
Buginese Buginese
Buhid Buhid
Canadian_Aboriginal Canadian Aboriginal
Carian Carian
Common Common
Coptic Coptic
Cuneiform Cuneiform
Cypriot Cypriot
Cyrillic Cyrillic
Deseret Deseret
Devanagari Devanagari
Egyptian_Hieroglyphs Egyptian Hieroglyphs
Ethiopic Ethiopic
Georgian Georgian
Glagolitic Glagolitic
Gothic Gothic
Greek Greek
Gujarati Gujarati
Gurmukhi Gurmukhi
Hangul Hangul
Han Han
Hanunoo Hanunoo
Hebrew Hebrew
Hiragana Hiragana
Cham Cham
Cherokee Cherokee
Imperial_Aramaic Imperial Aramaic
Inherited Inherited
Inscriptional_Pahlavi Inscriptional Pahlavi
Inscriptional_Parthian Inscriptional Parthian
Javanese Javanese
Kaithi Kaithi
Kannada Kannada
Katakana Katakana
Kayah_Li Kayah Li
Kharoshthi Kharoshthi
Khmer Khmer
Lao Lao
Latin Latin
Lepcha Lepcha
Limbu Limbu
Linear_B Linear B
Lisu Lisu
Lycian Lycian
Lydian Lydian
Malayalam Malayalam
Meetei_Mayek Meetei Mayek
Mongolian Mongolian
Myanmar Myanmar
New_Tai_Lue New Tai Lue
Nko Nko
Ogham Ogham
Old_Italic Old Italic
Old_Persian Old Persian
Old_South_Arabian Old South Arabian
Old_Turkic Old Turkic
Ol_Chiki Ol Chiki
Oriya Oriya
Osmanya Osmanya
Phags_Pa Phags Pa
Phoenician Phoenician
Rejang Rejang
Runic Runic
Samaritan Samaritan
Saurashtra Saurashtra
Shavian Shavian
Sinhala Sinhala
Sundanese Sundanese
Syloti_Nagri Syloti Nagri
Syriac Syriac
Tagalog Tagalog
Tagbanwa Tagbanwa
Tai_Le Tai Le
Tai_Tham Tai Tham
Tai_Viet Tai Viet
Tamil Tamil
Telugu Telugu
Thaana Thaana
Thai Thai
Tibetan Tibetan
Tifinagh Tifinagh
Ugaritic Ugaritic
Vai Vai
Yi Yi
*/
package lexer
import (
"bytes"
"github.com/cznic/ebnf"
"fmt"
"go/scanner"
"io"
"log"
"os"
"regexp"
"strconv"
"unicode"
)
type Lexer struct {
nfa Nfa
starts []*NfaState
accept *NfaState
}
// StartSetID is a type of a lexer start set identificator.
// It is used by Begin and PushState.
type StartSetID int
//TODO:full docs
func CompileLexer(starts [][]int, tokdefs map[string]int, grammar, start string) (lexer *Lexer, err error) {
lexer = &Lexer{}
defer func() {
if e := recover(); e != nil {
lexer = nil
err = e.(error)
}
}()
var prodnames string
res, xref := map[int]string{}, map[int]string{}
for tokdef, id := range tokdefs {
if _, ok := res[id]; ok {
panic(fmt.Errorf("duplicate id %d for token %q", id, tokdef))
}
xref[id] = fmt.Sprintf("id-%d", id)
if re, ok := isRE(tokdef); ok {
res[id] = re
continue
}
if grammar == "" || !isIdent(tokdef) {
res[id] = regexp.QuoteMeta(tokdef)
continue
}
if prodnames != "" {
prodnames += " | "
}
prodnames += tokdef
res[id] = ""
}
if prodnames != "" {
var g ebnf.Grammar
ebnfSrc := grammar + fmt.Sprintf("\n%s = %s .", start, prodnames)
if g, err = ebnf.Parse(start, bytes.NewBufferString(ebnfSrc)); err != nil {
panic(err)
}
if err = ebnf.Verify(g, start); err != nil {
panic(err)
}
grammarREs := map[*ebnf.Production]string{}
for tokdef, id := range tokdefs {
if isIdent(tokdef) {
res[id], xref[id] = ebnf2RE(g, tokdef, grammarREs), tokdef
}
}
}
if starts == nil { // create the default, all inclusive start set
starts = [][]int{{}}
for id := range res {
starts[0] = append(starts[0], id)
}
}
lexer.accept = lexer.nfa.NewState()
lexer.starts = make([]*NfaState, len(starts))
for i, set := range starts {
state := lexer.nfa.NewState()
lexer.starts[i] = state
for _, id := range set {
var in, out *NfaState
re, ok := res[int(id)]
if !ok {
panic(fmt.Errorf("unknown token id %d in set %d", id, i))
}
if in, out, err = lexer.nfa.ParseRE(fmt.Sprintf("%s-%s", start, xref[int(id)]), re); err != nil {
panic(err)
}
state.AddNonConsuming(&EpsilonEdge{int(id), in})
out.AddNonConsuming(&EpsilonEdge{0, lexer.accept})
}
}
lexer.nfa.reduce()
return
}
// MustCompileLexer is like CompileLexer but panics if the definitions cannot be compiled.
// It simplifies safe initialization of global variables holding compiled Lexers.
func MustCompileLexer(starts [][]int, tokdefs map[string]int, grammar, start string) (lexer *Lexer) {
var err error
if lexer, err = CompileLexer(starts, tokdefs, grammar, start); err != nil {
if list, ok := err.(scanner.ErrorList); ok {
scanner.PrintError(os.Stderr, list)
} else {
log.Fatal(err)
}
panic(err)
}
return
}
func (lx *Lexer) String() (s string) {
s = lx.nfa.String()
for i, set := range lx.starts {
s += fmt.Sprintf("\nstart set %d = {", i)
for _, edge := range set.NonConsuming {
s += " " + strconv.Itoa(int(edge.Target().Index))
}
s += " }"
}
s += "\naccept: " + strconv.Itoa(int(lx.accept.Index))
return
}
func identFirst(arune rune) bool {
return unicode.IsLetter(arune) || arune == '_'
}
func identNext(arune rune) bool {
return identFirst(arune) || unicode.IsDigit(arune)
}
func isIdent(s string) bool {
for i, arune := range s {
if i == 0 && !identFirst(arune) {
return false
}
if !identNext(arune) {
return false
}
}
return true
}
// isRE checks if a string starts and ends in '/'. If so, return the string w/o the leading and trailing '/' and true.
// Otherwise return the original string and false.
func isRE(s string) (string, bool) {
if n := len(s); n > 2 && s[0] == '/' && s[n-1] == '/' {
return s[1 : n-1], true
}
return s, false
}
var pipe = map[bool]string{false: "", true: "|"}
func ebnf2RE(g ebnf.Grammar, name string, res map[*ebnf.Production]string) (re string) {
p := g[name]
if r, ok := res[p]; ok {
return r
}
buf := bytes.NewBuffer(nil)
var compile func(string, interface{}, string)
compile = func(pre string, item interface{}, post string) {
buf.WriteString(pre)
switch x := item.(type) {
default:
panic(fmt.Errorf("unexpected type %T", x))
case ebnf.Alternative:
for i, item := range x {
compile(pipe[i > 0], item, "")
}
case *ebnf.Group:
compile("(", x.Body, ")")
case *ebnf.Name:
buf.WriteString("(" + ebnf2RE(g, x.String, res) + ")")
case *ebnf.Option:
compile("(", x.Body, ")?")
case *ebnf.Range:
buf.WriteString(fmt.Sprintf("[%s-%s]", regexp.QuoteMeta(x.Begin.String), regexp.QuoteMeta(x.End.String)))
case *ebnf.Repetition:
compile("(", x.Body, ")*")
case ebnf.Sequence:
for _, item := range x {
compile("", item, "")
}
case *ebnf.Token:
if s, ok := isRE(x.String); ok {
buf.WriteString(s)
} else {
buf.WriteString(regexp.QuoteMeta(s))
}
}
buf.WriteString(post)
}
compile("", p.Expr, "")
re = buf.String()
res[p] = re
return
}
// Scanner returns a new Scanner which can run the Lexer FSM. A Scanner is not safe for concurent access
// but many Scanners can safely share the same Lexer.
//
// The RuneReader can be nil. Then an EOFReader is supplied and
// the real RuneReader(s) can be Included anytime afterwards.
func (lx *Lexer) Scanner(fname string, r io.RuneReader) *Scanner {
return newScanner(lx, NewScannerSource(fname, r))
}