-
Notifications
You must be signed in to change notification settings - Fork 0
/
lexer.py
301 lines (245 loc) · 9.09 KB
/
lexer.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
from baseclass import *
from keywords import *
from tools import *
#####################
# Lexer #
#####################
class Lexer:
    """
    OCaml Lexer

    Parameters
    ----------
    text : str
        string that will be tokenized

    Methods
    -------
    get_next_token()
        return the next token found in text
        return EOF when all the text has been lexed
    """

    # WARNING
    # Every eating method that loops until it finds a specific character
    # must **always** check `self.current_char is not None` (isNotEnd()),
    # otherwise an unterminated construct leads to an infinite loop.
    # OCaml example: `let a = 'text;;`

    # Characters silently skipped between tokens
    _WHITESPACE = (' ', '\n', '\t', '\r')

    # (two-character operators, one-character operators) lookup tables.
    # Built on first use by _operator_tables() and then shared by all lexers.
    _OPERATOR_TABLES = None

    def __init__(self, text):
        self.text = text

        # Current character and its position in text
        self.current_pos = -1
        self.current_char = None
        self.current_token = None

        # Generate the first character
        self.advance()

    def advance(self, nb=1):
        """
        Update the current character:
        - modify self.current_pos
        - modify self.current_char
        If there is no next character, self.current_char is set to None

        Parameters
        ----------
        nb : int, optional
            Number of characters to advance (default is 1)
            Values lower than 1 leave the lexer untouched
        """
        if nb <= 0:
            return

        self.current_pos += nb
        if self.current_pos < len(self.text):
            self.current_char = self.text[self.current_pos]
        else:
            self.current_char = None

    def peek(self, nb=1):
        """
        Return the nb characters of text starting at the current position
        (the current character is the first one returned)
        Nothing is consumed
        Return None if fewer than nb characters remain

        Parameters
        ----------
        nb : int, optional
            Number of characters to return (default is 1)
        """
        end = self.current_pos + nb
        if end > len(self.text):
            return None
        return self.text[self.current_pos:end]

    def isEnd(self):
        """
        Return true if the end of text is reached
        """
        return self.current_char is None

    def isNotEnd(self):
        """
        Return not isEnd()
        """
        return not self.isEnd()

    def get_id(self):
        """
        Return an id: an alphanumeric string forming a word
        Accepted characters:
        a-z, A-Z, 0-9, _

        get_next_token only calls this method when the first character
        is a lowercase letter
        """
        start = self.current_pos
        while self.isNotEnd() and (self.current_char.isalnum() or self.current_char == '_'):
            self.advance()
        # The word is exactly the slice that has just been walked over
        return self.text[start:self.current_pos]

    def get_number_token(self):
        """
        Return an INT or a FLOAT token

        A number is a run of decimal digits, optionally followed by a dot
        and a (possibly empty) run of decimal digits: `12`, `1.5`, `1.`
        """
        # Note the position of the beginning of the number for error reporting
        pos = self.current_pos

        # isdecimal() is used instead of isdigit(): isdigit() also accepts
        # characters such as superscripts that int() and float() reject
        while self.isNotEnd() and self.current_char.isdecimal():
            self.advance()

        if self.isEnd() or self.current_char != '.':
            result = self.text[pos:self.current_pos]
            return Token(INT, int(result), pos, len(result))

        # Pass the dot, then read the fractional part
        self.advance()
        while self.isNotEnd() and self.current_char.isdecimal():
            self.advance()

        result = self.text[pos:self.current_pos]
        return Token(FLOAT, float(result), pos, len(result))

    def get_string_token(self):
        """
        Return a string: a text delimited by two quotes or double quotes
        Support character escapement

        The reported token length is the length of the string content
        (delimiters and backslashes are not counted)

        If the end of text is reached before the closing delimiter,
        errorsManager.SyntaxError is called
        """
        # The delimiter can be a simple quote ' or a double quote "
        delimiter = self.current_char
        # Begin position of the string
        pos = self.current_pos

        # Pass the first delimiter
        self.advance()

        chars = []
        # We are expecting a second delimiter
        while self.isNotEnd() and self.current_char != delimiter:
            # NOTE: currently \n and \t are not preserved
            # Escaped character: \" \' ...
            if self.current_char == '\\':
                # We just pass the backslash
                # then add the char even if it's the delimiter
                self.advance()
                if self.isEnd():
                    # A backslash was the last character of the text:
                    # there is nothing to escape, the string is not closed
                    break
            chars.append(self.current_char)
            self.advance()

        # Make sure the remaining character is the delimiter
        if self.current_char == delimiter:
            # Pass the second delimiter
            self.advance()
            result = ''.join(chars)
            return Token(STRING, result, pos, len(result))

        # The code was entirely read but the string was not closed
        errorsManager.SyntaxError("The string was not closed")

    def pass_comment(self):
        """
        Advance in the text while the current char is inside a comment
        Nested comments are required to be each closed

        Must be called when the current position is on a '(*'
        If the end of text is reached before the matching '*)',
        errorsManager.SyntaxError is called
        """
        # Pass the '(*'
        self.advance(2)

        while self.isNotEnd() and self.peek(2) != '*)':
            # Support nested comments
            if self.peek(2) == '(*':
                self.pass_comment()
                # The nested call already consumed its own '*)':
                # re-test the current position instead of skipping a
                # character, which may be the '*' of our own closer
                continue
            self.advance()

        # Make sure the remaining characters are a comment closer
        if self.peek(2) == '*)':
            # Pass the '*)'
            self.advance(2)
        else:
            errorsManager.SyntaxError("The comment was not closed")

    @classmethod
    def _operator_tables(cls):
        """
        Return the (two-character, one-character) operator lookup tables
        Each table maps a lexeme to its token type
        """
        if cls._OPERATOR_TABLES is None:
            two_chars = {
                '->': ARROW,
                '+.': PLUS_FLOAT,
                '-.': MINUS_FLOAT,
                '*.': MUL_FLOAT,
                '/.': DIV_FLOAT,
                ':=': REASSIGN,
                '<>': DIFFERENT,
                '&&': BOOLEANCONJUNCTION,
                '||': BOOLEANDISJUNCTION,
            }
            one_char = {
                '+': PLUS_INT,
                '-': MINUS_INT,
                '*': MUL_INT,
                '/': DIV_INT,
                '(': LPAREN,
                ')': RPAREN,
                ';': SEMI,
                '=': EQUALS,
                '!': EXCLAMATION,
            }
            cls._OPERATOR_TABLES = (two_chars, one_char)
        return cls._OPERATOR_TABLES

    def get_next_token(self):
        """
        Parse and return the next token found in text
        Return EOF when all tokens have been parsed

        Raise
        -----
        SyntaxError
            Reported through errorsManager.SyntaxError
            if a not defined character is found
        """
        # Skip whitespace, new lines, tabs and comments
        # A loop is used rather than recursion so that long runs of blanks
        # cannot exhaust the Python call stack
        while self.isNotEnd():
            if self.current_char in self._WHITESPACE:
                self.advance()
            elif self.peek(2) == '(*':
                self.pass_comment()
            else:
                break

        if self.isEnd():
            return Token(EOF, None, -1, -1)  # NOTE: we could use None instead of -1

        current_char = self.current_char
        current_pos = self.current_pos

        ############### NUMBER ###############
        if current_char.isdecimal():
            return self.get_number_token()

        ############### STRING ###############
        if current_char in ('"', "'"):
            return self.get_string_token()

        ########### ID or KEYWORDS ###########
        if current_char.isalpha() and current_char.islower():
            name = self.get_id()
            if name in RESERVED_KEYWORDS:
                return Token(RESERVED_KEYWORDS[name], None, current_pos, len(name))
            return Token(ID, name, current_pos, len(name))

        ######## Operators and miscellaneous ########
        two_chars, one_char = self._operator_tables()

        # Two-character operators first so that '->' or '+.' are never
        # split into '-' or '+' followed by an invalid character
        pair = self.peek(2)
        if pair in two_chars:
            self.advance(2)
            return Token(two_chars[pair], pair, current_pos, 2)

        if current_char in one_char:
            self.advance()
            return Token(one_char[current_char], current_char, current_pos, 1)

        errorsManager.SyntaxError("Invalid character")