lexer.py
from enum import Enum, auto
class LexException(Exception): pass

class TOKENS(Enum):
    OP = auto()
    NUM = auto()
    STR = auto()
    IDENT = auto()
    PARENS = auto()
    KEYWORD = auto()
    NATIVE = auto()
    NONE = auto()
    NL = auto()
    EOF = auto()

# List of all valid operators
OPS = ['+', '-', '*', '/', '!', ';', '=', "...", ":=", '<', '[', "++", "--", "==", ">=", "||", "<=", "in"]
PARENS = ['(', ')', '{', '}', ']', ',']
# List of special keywords
KEYWORDS = ["if", "else", "MACRO", "loop"]
# Returns whether a character forms part of a number
def in_num(c):
    return c.isnumeric() or c == '.'

# Returns whether a character can be used in identifiers
def in_ident(c):
    return c.isalpha() or c == '_'

# Keeps track of tokens that have been peeked
__previous_token = None
# Basic heuristics to check if we're inside a macro or not
# This allows different syntax in the macro whilst still capturing native code properly
__in_macro = False
__block_depth = 0
# The macro char is used to denote the difference between native code and
# macro code within a macro. This can be set at the beginning of a file so the most
# suitable (producing least conflict) character can be chosen.
# TODO
MACRO_CHAR = '$'
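
# For illustration (made-up input): inside a MACRO block, a line such as
#   $print("hello")
# is not tokenized further; the lexer yields (TOKENS.NATIVE, 'print("hello")\n').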
# Returns the next token in the lexer stream.
# If a token was previously peeked, returns it and clears the peek buffer.
def get_next(lexer):
    global __previous_token
    if (__previous_token != None):
        temp = __previous_token
        __previous_token = None
        return temp
    return next(lexer)

# Throws away next token
def consume_next(lexer):
    global __previous_token
    if (__previous_token != None):
        __previous_token = None
    else:
        next(lexer)

# Peeks at, but does not consume the next token in the stream
def peek_next(lexer):
    global __previous_token
    if (__previous_token == None):
        __previous_token = next(lexer)
    return __previous_token
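
# Usage sketch (assuming a token generator from new_lex()/lex_file() below):
#   tok = peek_next(tokens)            # inspect the next token without consuming it
#   assert get_next(tokens) == tok     # the same token is then returned and the peek buffer cleared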
# Returns the operators in ops that contain s as a substring
def in_ops(s, ops):
    return list(filter(lambda op: s in op, ops))
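# For example, with the OPS list above:
#   in_ops('=', OPS)   -> ['=', ':=', '==', '>=', '<=']   (still ambiguous, keep reading)
#   in_ops('==', OPS)  -> ['==']                          (narrowed to one candidate)
#   in_ops('==x', OPS) -> []                              (overconsumed by one character)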
# Turns a text file into a generator for a stream of tokens.
# The four core token types are:
#   1. OP    - operators as defined by the OPS list
#   2. NUM   - numbers, represented as floating point
#   3. STR   - string literals
#   4. IDENT - identifiers that may represent variable names
# PARENS, KEYWORD, NATIVE and NONE tokens are also emitted where relevant,
# and an EOF token signifies the end of the stream.
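# For example (illustrative), lexing `x := 1;` in macro mode yields roughly:
#   (IDENT, 'x'), (OP, ':='), (NUM, 1.0), (OP, ';'), (EOF, None)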
def lex(file, in_macro = None):
    global __in_macro, __block_depth
    # Small workaround so in-memory streams (StringIO and the like) can be lexed directly as macro code
    if (in_macro != None): __in_macro = True
    while (char := file.read(1)):
        # Check to see if this is the beginning of a macro def, otherwise consume the line as native code
        if (not __in_macro):
            initial_whitespace = ""
            while (char.isspace()):
                initial_whitespace += char
                char = file.read(1)
            macro_check = char + file.read(4)
            if (macro_check == "MACRO"):
                __in_macro = True
                yield (TOKENS.KEYWORD, macro_check)
                continue
            yield (TOKENS.NATIVE, initial_whitespace + macro_check + file.readline())
            continue
        # if (char == '\n'): yield (TOKENS.NL, '\n')
        if (char.isspace()): continue
        # Basic python-style comments that just skip over the line
        if (char == '#'):
            file.readline()
            continue
        if (ops := in_ops(char, OPS)):
            s = char
            # Continue consuming characters until we have filtered the possible
            # operators down. We reach 0 once we have read the correct token and overconsumed.
            while (len(ops) > 0):
                s += file.read(1)
                ops = in_ops(s, ops)
            # To compensate for the overconsumption, we roll back one character
            file.seek(file.tell() - 1, 0)
            s = s[:-1]
            if (s in OPS):
                yield (TOKENS.OP, s)
                continue
            # In the case of multi-character tokens, the operator could start right but be interrupted.
            # Consider ".a.", which is recognized as possibly being "..." on the first loop;
            # when the lexer reads 'a' it reaches 0 possible tokens and exits, believing it has
            # parsed all of the "..." operator. However it has actually only parsed '.', an invalid operator.
            # If s is not found in OPS we need to roll back and let it fall through to the rest of the lexer
            file.seek(file.tell() - len(s) + 1, 0)
        if (char == MACRO_CHAR):
            yield (TOKENS.NATIVE, file.readline())
        elif (char in PARENS):
            if (char == '{'):
                __block_depth += 1
            elif (char == '}'):
                __block_depth -= 1
                __in_macro = __block_depth > 0
            yield (TOKENS.PARENS, char)
        # Strings can only be delimited by double quotes (") at the moment and no proper
        # escaping is provided :(
        elif (char == '"'):
            string = ""
            while ((char := file.read(1)) != '"'):
                # Check for EOF so a missing closing " does not hang the lexer
                if (char == ''):
                    raise LexException("Unterminated string literal")
                string += char
            yield (TOKENS.STR, string)
        elif (in_num(char)):
            number = char if char != '.' else '0.'
            while (in_num(char := file.read(1))):
                number += char
            # Move back one character to account for the ending condition of the while loop
            if (char != ''): file.seek(file.tell() - 1, 0)
            yield (TOKENS.NUM, float(number))
        elif (in_ident(char)):
            ident = char
            while (in_ident(char := file.read(1))):
                ident += char
            # Move back one character to account for the ending condition of the while loop
            if (char != ''): file.seek(file.tell() - 1, 0)
            if (ident in KEYWORDS): yield (TOKENS.KEYWORD, ident)
            elif (ident == "none"): yield (TOKENS.NONE, None)
            else: yield (TOKENS.IDENT, ident)
        else:
            raise LexException(f"Character cannot be recognized in token: {char}")
    file.close()
    yield (TOKENS.EOF, None)

# Resets the peeked-token state so a new lex starts cleanly
def new_lex(file, in_macro = None):
    global __previous_token
    __previous_token = None
    return lex(file, in_macro = in_macro)

def lex_file(file_name):
    file = open(file_name, "r")
    return new_lex(file)

# Small test for lexing, very coolio
if (__name__ == "__main__"):
    for token in lex_file("examples/basic.pre"):
        print(token)
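
    # A minimal sketch of lexing an in-memory snippet instead of a file; the sample
    # source below is made up for illustration (io.StringIO supports the read/readline/
    # seek/tell calls used by lex()).
    import io
    sample = 'x := 3.5 + y # trailing comment\nprint("done")\n'
    for token in new_lex(io.StringIO(sample), in_macro = True):
        print(token)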