-
Notifications
You must be signed in to change notification settings - Fork 5
Expand file tree
/
Copy pathvm.nim
More file actions
290 lines (264 loc) · 8.95 KB
/
vm.nim
File metadata and controls
290 lines (264 loc) · 8.95 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
#
#
# Lexim - The Lexer Generator for Nim
# (c) Copyright 2015 Andreas Rumpf
#
# See the file "copying.txt", included in this
# distribution, for details about the copyright.
#
## Translates the DFA into a bytecode and then runs the bytecode.
import nfa, strutils, intsets, regexprs
type
Instr* = distinct uint32
Opcode* = enum
opcRet, # return with some literal
opcTestSet, # test current character against bitset in data section
opcTestChar, # test current character against char embedded in instr
opcTJmp, # jump if comparison was true
opcBegin, # \A match
opcEnd, # \Z match
opcWordBound, # \b match
opcCaptureBegin # begin of capture '('
opcCaptureEnd # end of capture ')'
opcBackref # \\1 match
Bytecode* = object
code*: seq[Instr]
data*: seq[set[char]]
startAt*, captures*: int
template opcode*(x: Instr): Opcode = Opcode(x.uint32 and 0xff'u32)
template regBx*(x: Instr): int = (x.uint32 shr 16'u32).int
proc codeListing(c: Bytecode, result: var string, start=0; last = -1) =
# first iteration: compute all necessary labels:
var jumpTargets = initIntSet()
let last = if last < 0: c.code.len-1 else: min(last, c.code.len-1)
jumpTargets.incl(c.startAt)
for i in start..last:
let x = c.code[i]
if x.opcode == opcTJmp:
jumpTargets.incl(x.regBx)
result.addf("goto L$1:\n", c.startAt)
var i = start
while i <= last:
if i in jumpTargets: result.addf("L$1:\n", i)
let x = c.code[i]
result.add($i)
let opc = opcode(x)
case opc
of opcRet:
result.addf("\t$#\t$#\n", ($opc).substr(3), x.regBx)
of opcTestSet:
result.addf("\t$#\t$#\n", ($opc).substr(3), $c.data[x.regBx])
of opcTestChar:
result.addf("\t$#\t$#\n", ($opc).substr(3), $chr(x.regBx))
of opcTJmp:
result.addf("\t$#\tL$#\n", ($opc).substr(3), x.regBx)
of opcCaptureBegin, opcCaptureEnd, opcBackref:
result.addf("\t$#\tC$#\n", ($opc).substr(3), x.regBx)
of opcBegin, opcEnd, opcWordBound:
result.addf("\t$#\n", ($opc).substr(3))
inc i
proc echoCode*(c: Bytecode; start=0; last = -1) {.deprecated.} =
var buf = ""
codeListing(c, buf, start, last)
echo buf
proc genInstr(opc: Opcode; bx: int): Instr =
# `bx` must be signed and in the range [-32767, 32768]
const a = 0
doAssert bx >= -32767 and bx <= 32768
result = (opc.uint32 or a.uint32 shl 8'u32 or
bx.uint32 shl 16'u32).Instr
proc gABx(c: var Bytecode; opc: Opcode; bx: int) =
c.code.add(genInstr(opc, bx))
proc genData(c: var Bytecode; data: set[char]): int =
assert '\0' notin data
for i in 0 .. c.data.high:
if c.data[i] == data: return i
result = c.data.len
c.data.add data
proc genTest(res: var Bytecode; x: Alphabet; dest: int) =
case x.kind
of reChar:
gABx(res, opcTestChar, int(x.val))
gABx(res, opcTJmp, dest)
of reBegin:
gABx(res, opcBegin, 0)
gABx(res, opcTJmp, dest)
of reEnd:
gABx(res, opcEnd, 0)
gABx(res, opcTJmp, dest)
of reWordBoundary:
gABx(res, opcWordBound, genData(res, wordChars))
gABx(res, opcTJmp, dest)
of reWordBoundaryNot:
gABx(res, opcWordBound, genData(res, {'\1'..'\255'} - wordChars))
gABx(res, opcTJmp, dest)
else: discard
proc genCapture(res: var Bytecode; cs: Alphabet; dest: int) =
case cs.kind
of reCapture:
gABx(res, opcCaptureBegin, cs.val.int-1)
gABx(res, opcTJmp, dest)
inc res.captures
of reCaptureEnd:
gABx(res, opcCaptureEnd, cs.val.int-1)
gABx(res, opcTJmp, dest)
of reBackref:
gABx(res, opcBackref, cs.val.int-1)
gABx(res, opcTJmp, dest)
else: discard
proc genBytecode*(a: DFA; res: var Bytecode) =
var stateToLabel = newSeq[int](a.stateCount)
for src in countup(1, a.stateCount):
stateToLabel[src-1] = res.code.len
let rule = getRule(a, src)
for dest in allDests(a, src):
# this implements the rather strange
# "match longest but only sometimes" rule that regexes seem to have:
if rule == 0 or rule == getRule(a, dest):
let (list, cset) = allTransitions(a, src, dest)
for x in list: genCapture(res, x, dest)
if cset != {}:
gABx(res, opcTestSet, genData(res, cset))
gABx(res, opcTJmp, dest)
for x in list: genTest(res, x, dest)
if stateToLabel[src-1] != res.code.len or rule != 0:
# only generate 'ret' instruction when the state is not empty:
gABx(res, opcRet, rule)
# Fixup the TJmp instructions:
for i in 0 .. res.code.high:
let instr = res.code[i]
if opcode(instr) == opcTJmp:
res.code[i] = genInstr(opcTJmp, stateToLabel[regBx(instr)-1])
res.startAt = stateToLabel[a.startState-1]
type Action* = int #distinct range[1..32_000]
proc backrefMatch(input: string; sp: int; capture: (int, int)): bool =
var i = capture[0]
var k = sp
while true:
if i > capture[1]: return true
if k >= input.len or input[k] != input[i]: return false
inc k
inc i
proc execBytecode*(m: Bytecode; input: string;
captures: var seq[(int, int)],
start=0): tuple[a: Action, endPos: int] =
var pc = m.startAt
var sp = start
#var backtrack: seq[(int,int)]
while true:
let instr = m.code[pc]
let opc = instr.opcode
let arg = instr.regBx
if opc == opcTestSet:
# we *know* the next instruction is a TJmp:
let next = m.code[pc+1]
assert next.opcode == opcTJmp
if sp < input.len and input[sp] in m.data[arg]:
pc = next.regBx
inc sp
else:
inc pc, 2
elif opc == opcTestChar:
# we *know* the next instruction is a TJmp:
let next = m.code[pc+1]
assert next.opcode == opcTJmp
if sp < input.len and input[sp] == chr(arg):
pc = next.regBx
inc sp
else:
inc pc, 2
elif opc == opcRet:
#if arg == 0 and not backtrack.isNil and backtrack.len > 0:
# let (newPc, newSp) = backtrack.pop()
# pc = newPc
# sp = newSp
#else:
return (Action(arg), sp)
else:
case opc
of opcBegin:
if sp == start:
pc = m.code[pc+1].regBx
else:
inc pc, 2
of opcEnd:
if sp >= input.len:
pc = m.code[pc+1].regBx
else:
inc pc, 2
of opcWordBound:
if sp >= input.len or sp == start or input[sp] notin m.data[arg]:
pc = m.code[pc+1].regBx
else:
inc pc, 2
of opcCaptureBegin:
if captures.len <= arg: setLen(captures, arg+1)
captures[arg][0] = sp
captures[arg][1] = -2 # mark as still open
#if backtrack.isNil: backtrack = @[]
#backtrack.add((pc+2, sp))
pc = m.code[pc+1].regBx
of opcCaptureEnd:
captures[arg][1] = sp-1
pc = m.code[pc+1].regBx
of opcBackref:
if arg < captures.len and backrefMatch(input, sp, captures[arg]):
pc = m.code[pc+1].regBx
inc sp, captures[arg][1] - captures[arg][0] + 1
else:
inc pc, 2
else: assert false
proc findMacro(s: string): PRegExpr = nil
proc re*(regex: string; flags: set[RegexFlag] = {reExtended}): Bytecode =
let r = parseRegExpr(regex, findMacro, flags)
r.rule = 1
var n: NFA
regExprToNFA(r, n)
let alph = fullAlphabet(n)
var d, o: DFA
NFA_to_DFA(n, d, alph)
optimizeDFA(d, o, alph)
result.code = @[]
result.data = @[]
genBytecode(o, result)
proc matchLen*(input: string; r: Bytecode;
captures: var seq[(int, int)], start=0): int =
let (isMatch, len) = execBytecode(r, input, captures, start)
result = if isMatch <= 0: -1 else: len
proc match*(input: string; r: Bytecode;
captures: var seq[(int, int)]; start=0): bool =
let (isMatch, len) = execBytecode(r, input, captures, start)
result = isMatch > 0
proc matchLen*(input: string; r: Bytecode, start=0): int =
var captures: seq[(int, int)] = @[]
let (isMatch, len) = execBytecode(r, input, captures, start)
result = if isMatch <= 0: -1 else: len
proc match*(input: string; r: Bytecode; start=0): bool =
var captures: seq[(int, int)] = @[]
let (isMatch, len) = execBytecode(r, input, captures, start)
result = isMatch > 0
template `=~`*(s: string, pattern: Bytecode): untyped =
## This calls ``match`` with an implicit declared ``matches`` seq that
## can be used in the scope of the ``=~`` call:
##
## .. code-block:: nim
##
## if line =~ re"\s*(\w+)\s*\=\s*(\w+)":
## # matches a key=value pair:
## echo("Key: ", matches[0])
## echo("Value: ", matches[1])
## elif line =~ re"\s*(\#.*)":
## # matches a comment
## # note that the implicit ``matches`` array is different from the
## # ``matches`` array of the first branch
## echo("comment: ", matches[0])
## else:
## echo("syntax error")
##
var captures: seq[(int, int)] = @[]
when not declaredInScope(matches):
var matches {.inject.}: seq[string] = @[]
let m = match(s, pattern, captures)
for i in 0..high(captures):
matches.add substr(s, captures[i][0], captures[i][1])
m