-
Notifications
You must be signed in to change notification settings - Fork 1
/
flatten_json.py
419 lines (349 loc) · 16.1 KB
/
flatten_json.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import sys
import json
try:
# 3.8 and up
from collections.abc import Iterable
except ImportError:
from collections import Iterable
from util import check_if_numbers_are_consecutive
import six
import copy
import re
from math import isnan
def _construct_key(previous_key, separator, new_key):
"""
Returns the new_key if no previous key exists, otherwise concatenates
previous key, separator, and new_key
:param previous_key:
:param separator:
:param new_key:
:return: a string if previous_key exists and simply passes through the
new_key otherwise
"""
if previous_key:
return u"{}{}{}".format(previous_key, separator, new_key)
else:
return new_key
def flatten(nested_dict,
separator="_",
root_keys_to_ignore=set(),
ignore_key_func=None):
"""
Flattens a dictionary with nested structure to a dictionary with no
hierarchy
Consider ignoring keys that you are not interested in to prevent
unnecessary processing
This is specially true for very deep objects
:param nested_dict: dictionary we want to flatten
:param separator: string to separate dictionary keys by
:param root_keys_to_ignore: set of root keys to ignore from flattening
:param ignore_key_func: func to ignore key at any level from flattening
:return: flattened dictionary
"""
assert isinstance(nested_dict, dict), "flatten requires a dictionary input"
assert isinstance(separator, six.string_types), "separator must be string"
# This global dictionary stores the flattened keys and values and is
# ultimately returned
flattened_dict = dict()
def _flatten(object_, key):
"""
For dict, list and set objects_ calls itself on the elements and for
other types assigns the object_ to
the corresponding key in the global flattened_dict
:param object_: object to flatten
:param key: carries the concatenated key for the object_
:return: None
"""
# Empty object can't be iterated, take as is
if not object_:
flattened_dict[key] = object_
# These object types support iteration
elif isinstance(object_, dict):
for object_key in object_:
if not key and object_key in root_keys_to_ignore:
continue
if ignore_key_func and ignore_key_func(object_key):
continue
_flatten(object_[object_key], _construct_key(key,
separator,
object_key))
elif isinstance(object_, (list, set, tuple)):
for index, item in enumerate(object_):
_flatten(item, _construct_key(key, separator, index))
# Anything left take as is
else:
flattened_dict[key] = object_
_flatten(nested_dict, None)
return flattened_dict
flatten_json = flatten
def flatten_preserve_lists(nested_dict, separator="_",
root_keys_to_ignore=set(),
max_list_index=3, max_depth=3):
"""
Flattens a dictionary with nested structure to a dictionary with no
hierarchy
Consider ignoring keys that you are not interested in to prevent
unnecessary processing
This is specially true for very deep objects
This preserves list structure, and
you can specify max_list_index and max_depth to limit processing
Child elements with only one value inside
will be unwrapped and become parent's value.
:param nested_dict: dictionary we want to flatten
:param separator: string to separate dictionary keys by
:param root_keys_to_ignore: set of root keys to ignore from flattening
:param max_list_index: maximum list index to process
:param max_depth: maximum nesting depth to process
:return: flattened dictionary
"""
assert isinstance(nested_dict, dict), "flatten requires a dictionary input"
assert isinstance(separator, six.string_types), \
"separator must be a string"
# This global dictionary stores the flattened keys and values and is
# ultimately returned
flattened_dict = dict()
def _flatten(object_, key):
"""
For dict, list and set objects_ calls itself on the elements and for
other types assigns the object_ to
the corresponding key in the global flattened_dict
:param object_: object to flatten
:param key: carries the concatenated key for the object_
:return: None
"""
# Empty object can't be iterated, take as is
if not object_:
flattened_dict[key] = object_
# These object types support iteration
# dict always go into columns
elif isinstance(object_, dict):
first_key = list(object_.keys())[0]
# if only 1 child value, and child value not a dict or list
# flatten immediately
if len(object_) == 1 \
and not (isinstance(object_[first_key], dict)
or isinstance(object_[first_key], list)
):
flattened_dict[key] = object_[first_key]
else:
for object_key in object_:
if not (not key and object_key in root_keys_to_ignore):
_flatten(object_[object_key],
_construct_key(key, separator, object_key)
)
elif isinstance(object_, list) or isinstance(object_, set):
for index, item in enumerate(object_):
_flatten(item, _construct_key(key, separator, index))
else:
flattened_dict[key] = object_
def _flatten_low_entropy(object_, key, cur_depth, max_depth_inner):
"""
For dict, list and set objects_ calls itself on the elements and for
other types assigns the object_ to
the corresponding key in the global flattened_dict
:param object_: object to flatten
:param key: carries the concatenated key for the object_
:return: None
"""
cur_depth = cur_depth + 1 # increase current_depth
debug = 0
# write latest child as value if max_depth exceeded
if cur_depth > max_depth_inner:
global_max_record = int(max(list(
list_prebuilt_flattened_dict.keys())))
for d in list_prebuilt_flattened_dict[str(global_max_record)]:
d[key] = object_
else:
# Empty object can't be iterated, take as is
if not object_:
global_max_record = int(max(list(
list_prebuilt_flattened_dict.keys())))
for d in list_prebuilt_flattened_dict[str(global_max_record)]:
d[key] = object_
# These object types support iteration
# dict always go into columns
elif isinstance(object_, dict):
first_key = list(object_.keys())[0]
# if only 1 child value, and child value
# not a dict or list, flatten immediately
if len(object_) == 1 \
and not (isinstance(object_[first_key], dict)
or isinstance(object_[first_key], list)):
global_max_record = int(max(list(
list_prebuilt_flattened_dict.keys())))
for d in list_prebuilt_flattened_dict[
str(global_max_record)
]:
d[key] = object_[first_key]
else:
for object_key, val in \
sorted(object_.items(),
key=lambda x:
(str(type(x[1])), len(str(x[1]))),
reverse=False):
if not (not key and object_key in root_keys_to_ignore):
_flatten_low_entropy(object_[object_key],
_construct_key(key,
separator,
object_key),
cur_depth, max_depth_inner)
# lists could go into rows, like in a relational database
elif isinstance(object_, list) or isinstance(object_, set):
if debug:
print("\nparent key of list:",
key, "| length: ",
str(len(object_)))
# need to remember global list state when we entered
# this recursion
global_max_record_start = int(max(list(
list_prebuilt_flattened_dict.keys())))
entry = copy.deepcopy(list_prebuilt_flattened_dict[
str(global_max_record_start)
])
for index, item in enumerate(object_):
if debug:
print(" list key:", key,
" index: " + str(index), "vals: ", item)
sub = -1
if isinstance(item, dict):
first_value = list(item.values())[0]
if isinstance(first_value, float):
sub = first_value
if not isnan(sub) and index < max_list_index:
# start from second element, 1st element is like column
if index > 0:
global_max_record = int(max(list(
list_prebuilt_flattened_dict.keys())))
list_prebuilt_flattened_dict[
str(global_max_record + 1)
] = copy.deepcopy(entry)
_flatten_low_entropy(item, key, cur_depth,
max_depth_inner)
else:
pass
list_prebuilt_flattened_dict['0'] = \
[subel for k, v in
sorted(list_prebuilt_flattened_dict.items())
for idx, subel in enumerate(v)]
for key in list(sorted(list_prebuilt_flattened_dict.keys())):
if key != '0':
del list_prebuilt_flattened_dict[key]
if debug:
print("collapsed global list")
# Anything left take as is, assuming you hit the end of the line.
else:
# in this case, there may be
# a list of prebuilt_flattened_dict by now
# so need to update them all.
global_max_record = int(max(list(
list_prebuilt_flattened_dict.keys())))
for d in list_prebuilt_flattened_dict[str(global_max_record)]:
d[key] = object_
# decrease depth counter
cur_depth -= 1
_flatten(nested_dict, None)
# get unique column names, without the integers
# TODO: potential issue: what if column names have digits naturally?
reskeys = list(flattened_dict.keys())
unique_integers = list(set([separator + char for key
in reskeys for char in key if char.isdigit()]))
regex = '|'.join(unique_integers)
regex += "|" + regex.replace(".", "")
unique_columns = list(set([re.sub("(" + regex + ")", "", key)
for key in reskeys]))
# create global dict, now with unique column names
prebuilt_flattened_dict = {column: None for column in unique_columns}
# initialize global record list
list_prebuilt_flattened_dict = {'0': [prebuilt_flattened_dict]}
_flatten_low_entropy(nested_dict, None, cur_depth=0,
max_depth_inner=max_depth)
return list_prebuilt_flattened_dict['0']
def _unflatten_asserts(flat_dict, separator):
assert isinstance(flat_dict, dict), "un_flatten requires dictionary input"
assert isinstance(separator, six.string_types), "separator must be string"
assert all((not value or not isinstance(value, Iterable) or
isinstance(value, six.string_types)
for value in flat_dict.values())), "provided dict is not flat"
def unflatten(flat_dict, separator='_'):
"""
Creates a hierarchical dictionary from a flattened dictionary
Assumes no lists are present
:param flat_dict: a dictionary with no hierarchy
:param separator: a string that separates keys
:return: a dictionary with hierarchy
"""
_unflatten_asserts(flat_dict, separator)
# This global dictionary is mutated and returned
unflattened_dict = dict()
def _unflatten(dic, keys, value):
for key in keys[:-1]:
dic = dic.setdefault(key, {})
dic[keys[-1]] = value
list_keys = sorted(flat_dict.keys())
for i, item in enumerate(list_keys):
if i != len(list_keys) - 1:
split_key = item.split(separator)
next_split_key = list_keys[i + 1].split(separator)
if not split_key == next_split_key[:-1]:
_unflatten(unflattened_dict, item.split(separator),
flat_dict[item])
else:
pass # if key contained in next key, json will be invalid.
else:
# last element
_unflatten(unflattened_dict, item.split(separator),
flat_dict[item])
return unflattened_dict
def unflatten_list(flat_dict, separator='_'):
"""
Unflattens a dictionary, first assuming no lists exist and then tries to
identify lists and replaces them
This is probably not very efficient and has not been tested extensively
Feel free to add test cases or rewrite the logic
Issues that stand out to me:
- Sorting all the keys in the dictionary, which specially for the root
dictionary can be a lot of keys
- Checking that numbers are consecutive is O(N) in number of keys
:param flat_dict: dictionary with no hierarchy
:param separator: a string that separates keys
:return: a dictionary with hierarchy
"""
_unflatten_asserts(flat_dict, separator)
# First unflatten the dictionary assuming no lists exist
unflattened_dict = unflatten(flat_dict, separator)
def _convert_dict_to_list(object_, parent_object, parent_object_key):
if isinstance(object_, dict):
for key in object_:
if isinstance(object_[key], dict):
_convert_dict_to_list(object_[key], object_, key)
try:
keys = [int(key) for key in object_]
keys.sort()
except (ValueError, TypeError):
keys = []
keys_len = len(keys)
if (keys_len > 0 and sum(keys) ==
int(((keys_len - 1) * keys_len) / 2) and keys[0] == 0 and
keys[-1] == keys_len - 1 and
check_if_numbers_are_consecutive(keys)):
# The dictionary looks like a list so we're going to replace it
parent_object[parent_object_key] = []
for key_index, key in enumerate(keys):
parent_object[parent_object_key].append(object_[str(key)])
# The list item we just added might be a list itself
# https://github.com/amirziai/flatten/issues/15
_convert_dict_to_list(parent_object[parent_object_key][-1],
parent_object[parent_object_key],
key_index)
_convert_dict_to_list(unflattened_dict, None, None)
return unflattened_dict
def cli(input_stream=sys.stdin, output_stream=sys.stdout):
raw = input_stream.read()
input_json = json.loads(raw)
output = json.dumps(flatten(input_json))
output_stream.write('{}\n'.format(output))
output_stream.flush()
if __name__ == '__main__':
cli()