-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathstruct_parser.py
330 lines (299 loc) · 13.4 KB
/
struct_parser.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
import re
import sys
import json
import pyPEG
from pyPEG import keyword
try:
from collections import OrderedDict
except:
from ordereddict import OrderedDict
# A simplified Pascal unit grammar, specialized for parsing Inno Setup Struct.pas files
def unit():
    """Grammar root: unit head, interface, implementation, ``end`` and '.'."""
    sequence = (
        unit_head,
        unit_interface,
        _unit_implementation,
        _unit_block,
        '.',
    )
    return sequence
def unit_head():
    """Unit header: the ``unit`` keyword, an identifier and ';'."""
    unit_kw = keyword('unit')
    return unit_kw, identifier, ';'
def unit_interface():
    """Interface part: ``interface``, an optional uses clause, then
    any number of declaration sections."""
    interface_kw = keyword('interface')
    return (
        interface_kw,
        0, uses_clause,
        -1, _decl_section,
    )
def _unit_implementation():
    """Implementation part: ``implementation``, an optional uses clause,
    then any number of declaration sections."""
    implementation_kw = keyword('implementation')
    return (
        implementation_kw,
        0, uses_clause,
        -1, _decl_section,
    )
def _unit_block():
    """Closing block of the unit: just the ``end`` keyword."""
    end_kw = keyword('end')
    return end_kw
def uses_clause():
    """Uses clause: ``uses``, an identifier list and ';'."""
    uses_kw = keyword('uses')
    return uses_kw, _ident_list, ';'
def _decl_section():
    """Declaration section: either a const section or a type section."""
    alternatives = [
        const_section,
        type_section,
    ]
    return alternatives
def const_section():
    """Const section: ``const`` followed by one or more const declarations."""
    const_kw = keyword('const')
    return const_kw, -2, const_declaration
def const_declaration():
    """Const declaration: name, optional ``: type``, '=', value and ';'."""
    type_annotation = (':', _type_decl)
    return (
        identifier,
        0, type_annotation,
        '=',
        _const_expression,
        ';',
    )
def type_section():
    """Type section: ``type`` followed by one or more type declarations."""
    type_kw = keyword('type')
    return type_kw, -2, type_declaration
def type_declaration():
    """Type declaration: name, '=', the type itself and ';'."""
    return (
        identifier,
        '=',
        _type_decl,
        ';',
    )
def _type_decl():
    """Any type: structured, pointer, variant, simple, or ``type <type_id>``."""
    alternatives = [
        _struc_type,
        pointer_type,
        _variant_type,
        _simple_type,
        (keyword('type'), type_id),
    ]
    return alternatives
def _struc_type():
    """Structured type: optional ``packed``, then an array, set or record."""
    packed_kw = keyword('packed')
    structured = [array_type, set_type, record_decl]
    return 0, packed_kw, structured
def array_type():
    """Array type: ``array``, an optional bracketed index list, ``of`` and
    the element type."""
    array_kw = keyword('array')
    index_list = ('[', 0, array_index, -1, (',', array_index), ']')
    of_kw = keyword('of')
    return array_kw, 0, index_list, of_kw, array_subtype
def array_index():
    """Array index: an identifier, or a ``low..high`` pair of const expressions."""
    bounds = (_const_expression, '..', _const_expression)
    return [identifier, bounds]
def array_subtype():
    """Array element type: the ``const`` keyword or any type declaration."""
    const_kw = keyword('const')
    return [const_kw, _type_decl]
def set_type():
    """Set type: ``set of`` followed by the base type."""
    set_kw = keyword('set')
    of_kw = keyword('of')
    return set_kw, of_kw, _type_decl
def pointer_type():
    """Pointer type: '^' followed by the pointed-to type."""
    return (
        '^',
        _type_decl,
    )
def _variant_type():
    """Variant type: delegates directly to the ``type_id`` rule."""
    rule = type_id
    return rule
def _simple_type():
    """Simple type: a type id, a subrange, or an enumeration."""
    alternatives = [
        type_id,
        subrange_type,
        enum_type,
    ]
    return alternatives
def subrange_type():
    """Subrange: a const expression, optionally followed by ``.. <const expression>``."""
    upper_bound = ('..', _const_expression)
    return _const_expression, 0, upper_bound
def enum_type():
    """Enumeration: parenthesised, comma-separated identifiers, each with an
    optional ``= expression``."""
    return (
        '(',
        identifier, 0, ('=', expression),
        -1, (',', identifier, 0, ('=', expression)),
        ')',
    )
def record_decl():
    """Record: ``record``, any number of fields, any number of record items,
    then ``end``."""
    record_kw = keyword('record')
    end_kw = keyword('end')
    return record_kw, -1, record_field, -1, record_item, end_kw
def record_item():
    """Record item: a const section, a type section, or a field."""
    alternatives = [
        const_section,
        type_section,
        record_field,
    ]
    return alternatives
def record_field():
    """Record field: identifier list, ':', the field type and ';'."""
    return (
        _ident_list,
        ':',
        _type_decl,
        ';',
    )
def expression():
    """Expression: a term followed by any number of ``add_op term`` pairs."""
    additive_tail = (add_op, _term)
    return _term, -1, additive_tail
def _term():
    """Term: a factor followed by any number of ``mul_op factor`` pairs."""
    multiplicative_tail = (mul_op, _factor)
    return _factor, -1, multiplicative_tail
def _factor():
    """Factor: ordered choice of unary forms, literals, parenthesised
    expressions, strings, set constructors and ``ident(expression)`` forms."""
    alternatives = [
        ('not', _factor),
        ('+', _factor),
        ('-', _factor),
        ('^', identifier),
        int_num,
        real_num,
        hex_num,
        asm_hex_num,
        'true',
        'false',
        'nil',
        ('(', expression, ')'),
        _string_factor,
        set_section,
        (identifier, '(', expression, ')'),
    ]
    return alternatives
def _string_factor():
    """String factor: one or more quoted strings or control strings."""
    string_piece = [quoted_string, control_string]
    return -2, string_piece
def set_section():
    """Set constructor: '[', optional expressions separated by ',' or '..', ']'."""
    members = (expression, -1, ([',', '..'], expression))
    return '[', 0, members, ']'
def colon_construct():
    """Colon construct: ``: expression`` with an optional second ``: expression``."""
    second_part = (':', expression)
    return ':', expression, 0, second_part
def _const_expression():
    """Const expression: a plain expression, or a parenthesised,
    comma-separated list of const expressions."""
    parenthesised = (
        '(',
        _const_expression,
        -1, (',', _const_expression),
        ')',
    )
    return [expression, parenthesised]
def _ident_list():
    """Identifier list: one identifier followed by any number of ``, identifier``."""
    further = (',', identifier)
    return identifier, -1, further
def add_op():
    """Compile the pattern for additive operators: ``+``, ``-``, ``or``, ``xor``."""
    pattern = r'\+|\-|or|xor'
    return re.compile(pattern)
def mul_op():
    """Compile the pattern for multiplicative operators
    (``*``, ``/``, ``div``, ``mod``, ``and``, ``shl``, ``shr``, ``as``)."""
    pattern = r'\*|\/|div|mod|and|shl|shr|as'
    return re.compile(pattern)
def identifier():
    """Compile the identifier pattern: a letter or underscore, then word characters."""
    pattern = r'[a-zA-Z_]\w*'
    return re.compile(pattern)
def type_id():
    """Compile the type-name pattern: a letter or underscore, then word characters."""
    pattern = r'[a-zA-Z_]\w*'
    return re.compile(pattern)
def int_num():
    """Compile the pattern for an unsigned decimal integer."""
    pattern = r'\d+'
    return re.compile(pattern)
def real_num():
    """Compile the pattern for a real-number literal.

    Matches one or more digits, an optional fractional part (``.`` plus
    digits) and an optional exponent (``e``/``E``, optional sign, digits).

    The previous pattern used ``d+`` instead of ``\\d+``, so it matched runs
    of the letter "d" and never matched an actual number.
    """
    return re.compile(r'\d+(\.\d*)?([eE]([+-])?\d+)?')
def hex_num():
    """Compile the pattern for a ``$``-prefixed hexadecimal literal."""
    pattern = r'\$[a-fA-F\d]+'
    return re.compile(pattern)
def asm_hex_num():
    """Compile the pattern for hex digits with an optional ``h``/``H`` suffix."""
    pattern = r'[a-fA-F\d]+[hH]?'
    return re.compile(pattern)
def quoted_string():
    """Compile the pattern for a single-quoted string on one line."""
    pattern = r"'[^'\n]*'"
    return re.compile(pattern)
def control_string():
    """Compile the pattern for a control string: ``#<digits>`` or ``#$<hex digits>``."""
    pattern = r'(\#\d+)|(\#\$[a-fA-F\d]+)'
    return re.compile(pattern)
def comment():
    """Compile the pattern for a ``{ ... }`` comment, which may span lines."""
    pattern = r'{.*?}'
    flags = re.S
    return re.compile(pattern, flags)
def parse(fileinput_files, trace=False):
    """Parse the given input with the ``unit`` grammar and return pyPEG's result.

    fileinput_files -- input object handed straight to ``pyPEG.parse``
    trace           -- value assigned to ``pyPEG.print_trace`` before parsing
    """
    pyPEG.print_trace = trace
    grammar = unit()
    parsed = pyPEG.parse(grammar, fileinput_files,
                         skipWS=True, skipComments=comment)
    return parsed
def pyast_to_dict(ast):
    """Convert a pyPEG AST into nested ``OrderedDict`` objects.

    Each ``pyPEG.Symbol`` becomes an entry keyed by the symbol name. A string
    payload is stored as-is; any other payload becomes a nested
    ``OrderedDict``. When a key occurs more than once under the same parent,
    its values are collected into a list.
    """
    def store(target, key, value):
        # first occurrence: plain value; later occurrences: grow into a list
        if key not in target:
            target[key] = value
            return
        existing = target[key]
        if type(existing) == list:
            existing.append(value)
        else:
            target[key] = [existing, value]

    def walk(node, target):
        # exact type checks are intentional: only plain lists are flattened
        if type(node) == list:
            for child in node:
                walk(child, target)
            return
        if type(node) != pyPEG.Symbol:
            return
        key = node.__name__
        payload = node.what
        if isinstance(payload, basestring):
            store(target, key, payload)
            return
        nested = OrderedDict()
        store(target, key, nested)
        walk(payload, nested)

    result = OrderedDict()
    walk(ast, result)
    return result
def pyast_to_json(ast, raw_dump=False):
    """Serialize a pyPEG AST to a JSON string.

    With ``raw_dump`` set, the dictionary from ``pyast_to_dict`` is dumped
    unchanged. Otherwise it is first run through ``StructFormatter`` and
    dumped with a 4-space indent.
    """
    tree = pyast_to_dict(ast)
    if not raw_dump:
        formatted = StructFormatter(tree).format()
        return json.dumps(formatted, indent=4, separators=(',', ': '))
    return json.dumps(tree)
class StructFormatter(object):
    """
    Format pyAST to easily parseable dictionary collection.

    Note: for structures that contain strings interpret its size as
    minimum size, not actual, since strings are variable in length.
    """

    def __init__(self, pyast_dict):
        """Keep the AST dictionary and start with an empty output mapping."""
        self.pyast_dict = pyast_dict
        # name -> formatted declaration, in declaration order; also looked up
        # when a later declaration refers to an earlier one by name
        self._output = OrderedDict()

    # size definitions
    type_sizes = {
        # integer types
        'byte': 1, 'boolean': 1, 'shortint': 1,
        'smallint': 2, 'word': 2,
        'integer': 4, 'cardinal': 4, 'longint': 4, 'longword': 4, 'dword': 4,
        'integer64': 8, 'qword': 8,
        # real types
        'real': 4, 'single': 4,
        'double': 8, 'extended': 10,
        # string types
        'char': 1, 'ansichar': 1, 'widechar': 2,
        'string': [4, 1],  # has an integer header that stores its length
        'ansistring': [4, 1], 'widestring': [4, 2],
    }
    # number of distinct values of types usable as array index / set base
    type_values = {
        'boolean': 2,
        'byte': 256, 'char': 256, 'ansichar': 256,
        'word': 65536,
    }

    @staticmethod
    def _as_list(value):
        """Return `value` as a list: [] for None, itself if a list, else [value].

        The AST dictionary stores a single child as a bare value and repeated
        children as a list; this hides the difference from callers.
        """
        if value is None:
            return []
        if isinstance(value, list):
            return value
        return [value]

    def _array_type(self, array_dict):
        """Return (subtype, count, size) of array.

        `size` is None when it cannot be computed (string elements, or an
        unknown element size or element count).
        Raises ValueError when the index is neither a named type nor a
        ``low..high`` pair.
        """
        index = array_dict['array_index']
        if 'identifier' in index:
            indextype_id = index['identifier']
            type_data = self._output.get(indextype_id)
            if type_data:
                count = (type_data.get('count')
                         or self.type_values.get(type_data['type']))
            else:
                count = self.type_values[indextype_id.lower()]
        elif 'expression' in index:
            bounds = index['expression']
            if not isinstance(bounds, list) or len(bounds) < 2:
                raise ValueError('array index needs two bounds: %s' % (index,))
            low = int(bounds[0]['int_num'])
            high = int(bounds[1]['int_num'])
            count = high - low + 1
        else:
            raise ValueError('unsupported array index: %s' % (index,))

        subtype_data = self._format_types(array_dict['array_subtype'])
        if subtype_data:
            subtype = subtype_data
            subtype_size = subtype_data.get('size')
        else:
            subtype_id = array_dict['array_subtype']['type_id']
            subtype_size = self.type_sizes[subtype_id.lower()]
            subtype = subtype_id.lower()

        # ignore string size since it is variable; also give up (instead of
        # raising TypeError) when either factor is unknown
        if count is None or subtype_size is None or isinstance(subtype_size, list):
            size = None
        else:
            size = count * subtype_size
        return subtype, count, size

    def _enum_type(self, enum_dict):
        """Return (count, size) of enum"""
        members = enum_dict['identifier']
        if isinstance(members, list):
            count = len(members)
            size = 1 if count < 256 else 2
            return count, size
        # a single member is stored as a bare string
        return 1, 1

    def _set_type(self, set_dict):
        """Return (subtype_dict, size) of set.

        `size` is None when the number of elements of the base type is
        unknown.
        """
        subtype = self._format_types(set_dict)
        cardinality = (subtype.get('count')
                       or self.type_values.get(subtype.get('type')))
        if cardinality is None:
            return subtype, None
        # Set size is calculated as such:
        #   (Max div 8) - (Min div 8) + 1
        # Min is taken as 0, so Max is the highest ordinal, i.e. the number
        # of elements minus one -- for enum counts as well as for the
        # built-in types.
        max_ordinal = cardinality - 1
        size = (max_ordinal // 8) + 1
        return subtype, size

    def _record_type(self, record_dict):
        """Return (fields, size) of record.

        `fields` maps each field name to its formatted type. Every declared
        name contributes its type size; string fields contribute only their
        header size. Fields whose size is unknown are reported on stderr and
        skipped in the total.
        """
        size = 0
        fields = OrderedDict()
        for field in self._as_list(record_dict.get('record_field')):
            t = self._format_types(field)
            if not t:
                continue
            for n in self._as_list(field['identifier']):
                fields[n] = t
                field_size = t.get('size')
                if isinstance(field_size, list):
                    field_size = field_size[0]
                if field_size is None:
                    sys.stderr.write('could not calculate field size for type: %s\n' % n)
                else:
                    size += field_size
        return fields, size

    def _format_types(self, type_dict):
        """Type declaration formatter.

        Returns a dict describing the type found in `type_dict`, or an empty
        dict for pointers, bare const declarations and unrecognized input.
        For a type id already present in the output, the stored dict itself
        is returned (not a copy).
        """
        td = type_dict
        t = {}
        if 'type_id' in td:
            type_id = td['type_id']
            type_data = self._output.get(type_id)
            if type_data:
                t = type_data
            else:
                t['size'] = self.type_sizes.get(type_id.lower())
                # unknown types keep their original spelling
                t['type'] = type_id if not t['size'] else type_id.lower()
        elif 'array_type' in td:
            t['type'] = u'array'
            t['subtype'], t['count'], t['size'] = self._array_type(td['array_type'])
        elif 'enum_type' in td:
            t['type'] = u'enum'
            t['count'], t['size'] = self._enum_type(td['enum_type'])
        elif 'set_type' in td:
            t['type'] = u'set'
            t['subtype'], t['size'] = self._set_type(td['set_type'])
        elif 'record_decl' in td:
            t['type'] = u'record'
            t['fields'], t['size'] = self._record_type(td['record_decl'])
        elif 'pointer_type' in td:
            pass  # ignore pointer types
        elif 'expression' in td:
            pass  # ignore const declarations
        else:
            sys.stderr.write('could not format type: %s\n' % type_dict)
        return t

    def _format_values(self, const_expression):
        """Const declaration formatter.

        Returns the value of `const_expression`: a list for a list of
        expressions, a string for string pieces, an int for ``int_num``, the
        raw text for ``hex_num``, or the result of ``_reduce_values`` when
        operators are present. Returns None (after a stderr note) when
        nothing matches.
        """
        e = const_expression
        v = None
        if isinstance(e, list):
            v = [self._format_values(item) for item in e]
        if 'expression' in e:
            v = self._format_values(e['expression'])
        if 'quoted_string' in e or 'control_string' in e:
            v = ''
            for k, s in e.items():
                if '_string' not in k:
                    continue
                pieces = s if isinstance(s, list) else [s]
                if k == 'quoted_string':
                    # drop the surrounding quotes
                    pieces = [piece[1:-1] for piece in pieces]
                v += ''.join(pieces)  # assumes string concatenation
        elif 'add_op' in e or 'mul_op' in e:
            v = self._reduce_values(e, v)
        elif 'int_num' in e:
            v = int(e['int_num'])
        elif 'hex_num' in e:
            v = e['hex_num']
        elif v is None:
            sys.stderr.write('could not format const: %s\n' % const_expression)
        return v

    def _reduce_values(self, const_expression, current_value):
        """Evaluate the operators of `const_expression`, left to right.

        `current_value` is the already-computed value of a leading nested
        expression, or None when there is none. Only ``shl`` and ``+`` are
        evaluated; anything else is reported on stderr.
        """
        e = const_expression
        v = current_value
        # materialize so that slicing/unpacking works on Python 2 and 3
        items = list(e.items())
        if v is not None:
            # assumptions here:
            # - `e[0]` is an expression
            # - `e[1]` is an operator
            # - `e[2]` is an operand
            # - `v` is output of `e[0]` expression
            # - `v` is same type as `e[2]`
            operators, operands = items[1:]
            if not isinstance(v, list):
                v = [v]
        else:
            operands, operators = items
        operand_type, operands = operands
        operator_type, operators = operators
        if not isinstance(operands, list):
            operands = [operands]
        if not isinstance(operators, list):
            operators = [operators]
        if v:
            operands = v + operands
        if operand_type == 'int_num':
            operands = [int(operand) for operand in operands]
        else:
            sys.stderr.write('unprocessed operand type: %s\n' % operand_type)
        if operator_type in ['mul_op', 'add_op']:
            v = operands[0]
            for i, o in zip(operands[1:], operators):
                if o == 'shl':
                    v = v << i
                elif o == '+':
                    v += i
                else:
                    sys.stderr.write('uhandled operation: %s %s %s\n' % (v, o, i))
        else:
            sys.stderr.write('uprocessed operation type: %s\n' % operator_type)
        return v

    def format(self):
        """Format all interface type and const declarations.

        Returns ``{unit_name: output}`` where `output` maps each declared
        name to its formatted type (for constants: plus a ``value`` entry).
        """
        interface = self.pyast_dict['unit_interface']

        # a section (or a declaration) that occurs once is a bare dict, one
        # that repeats is a list; a missing section is simply skipped
        for ts in self._as_list(interface.get('type_section')):
            for td in self._as_list(ts.get('type_declaration')):
                name = td['identifier']
                t = self._format_types(td)
                if t:
                    self._output[name] = t

        for cs in self._as_list(interface.get('const_section')):
            for cd in self._as_list(cs.get('const_declaration')):
                name = cd['identifier']
                # copy: _format_types may hand back the dict stored for a
                # named type, which must not receive this constant's value
                c = dict(self._format_types(cd))
                c['value'] = self._format_values(cd['expression'])
                self._output[name] = c

        unit_dict = {self.pyast_dict['unit_head']['identifier']: self._output}
        return unit_dict
if __name__ == '__main__':
    # Script entry point: parse the files named on the command line (or
    # stdin, per the fileinput module) and print the formatted JSON.
    import fileinput
    files = fileinput.input()
    result = parse(files)
    # parenthesized single-argument form prints the same on Python 2 and is
    # valid syntax on Python 3
    print(pyast_to_json(result))