From 8d8bda8fa9a0ff5ac590dd47daa85e8a2ccb88a5 Mon Sep 17 00:00:00 2001 From: Eduardo Silva Date: Thu, 5 Dec 2024 17:39:33 -0600 Subject: [PATCH] tests: internal: data: pack: generators: fix json encoding ascii utf8 Signed-off-by: Eduardo Silva --- tests/internal/data/pack/mixed.py | 26 +++++++------ tests/internal/data/pack/mixed_002.json | 2 +- tests/internal/data/pack/mixed_002.mp | Bin 37 -> 36 bytes tests/internal/data/pack/mixed_003.json | 2 +- tests/internal/data/pack/utf8_bell.json | 2 +- tests/internal/data/pack/utf8_copyright.json | 2 +- tests/internal/data/pack/utf8_gen.py | 37 +++++++++++-------- tests/internal/data/pack/utf8_hokke.json | 2 +- tests/internal/data/pack/utf8_relaxed.json | 2 +- 9 files changed, 42 insertions(+), 33 deletions(-) diff --git a/tests/internal/data/pack/mixed.py b/tests/internal/data/pack/mixed.py index 7ad0a6149b6..c8859231d98 100644 --- a/tests/internal/data/pack/mixed.py +++ b/tests/internal/data/pack/mixed.py @@ -6,22 +6,24 @@ import json import msgpack + def gen_json(f): - raw = open(f, 'r') - data = raw.read() - raw.close() + # Open the input file in text mode with UTF-8 encoding + with open(f, 'r', encoding='utf-8') as raw: + data = raw.read() - out_mp = f[:-4] + ".mp" - out_json = f[:-4] + ".json" + # Define output filenames + base_name = os.path.splitext(f)[0] + out_mp = base_name + ".mp" + out_json = base_name + ".json" - # Write messagepack - fmp = open(out_mp, 'w') - fmp.write(msgpack.packb(data)) - fmp.close() + # Write MessagePack-encoded data in binary mode + with open(out_mp, 'wb') as fmp: + fmp.write(msgpack.packb(data)) - fjson = open(out_json, 'w') - fjson.write(json.dumps(data)) - fjson.close() + # Write JSON-encoded data in text mode + with open(out_json, 'w', encoding='utf-8') as fjson: + fjson.write(json.dumps(data)) for fn in os.listdir('.'): if not os.path.isfile(fn): diff --git a/tests/internal/data/pack/mixed_002.json b/tests/internal/data/pack/mixed_002.json index e32314b0aa9..d1afe50ace3 100644 --- a/tests/internal/data/pack/mixed_002.json +++ b/tests/internal/data/pack/mixed_002.json @@ -1 +1 @@ -"mixed_002 =>\n\n áéíóú\n\n\n'\n\\t\n" \ No newline at end of file +"mixed_002 =>\n\n \u00e1\u00e9\u00ed\u00f3\u00fa\n\n\n'\n\\t\n" \ No newline at end of file diff --git a/tests/internal/data/pack/mixed_002.mp b/tests/internal/data/pack/mixed_002.mp index 1bf975535a1c3e87759e44027bf688f83f835be2..4938203d59a1b150b84400a5fa60eb0dcbff5798 100644 GIT binary patch delta 7 OcmY#YVZ1q!Nf`hKB>~z1 delta 8 PcmY#UWxB;Mkx>}{2vGss diff --git a/tests/internal/data/pack/mixed_003.json b/tests/internal/data/pack/mixed_003.json index 167c89b8a06..126945cb3c7 100644 --- a/tests/internal/data/pack/mixed_003.json +++ b/tests/internal/data/pack/mixed_003.json @@ -1 +1 @@ -"á\n" \ No newline at end of file +"\u00e1\n" \ No newline at end of file diff --git a/tests/internal/data/pack/utf8_bell.json b/tests/internal/data/pack/utf8_bell.json index ced4da0cfe5..d0730c4a7ce 100644 --- a/tests/internal/data/pack/utf8_bell.json +++ b/tests/internal/data/pack/utf8_bell.json @@ -1 +1 @@ -"🔔" \ No newline at end of file +"\ud83d\udd14" \ No newline at end of file diff --git a/tests/internal/data/pack/utf8_copyright.json b/tests/internal/data/pack/utf8_copyright.json index 4d52a66f3d0..92d937cf7bd 100644 --- a/tests/internal/data/pack/utf8_copyright.json +++ b/tests/internal/data/pack/utf8_copyright.json @@ -1 +1 @@ -"©" \ No newline at end of file +"\u00a9" \ No newline at end of file diff --git a/tests/internal/data/pack/utf8_gen.py b/tests/internal/data/pack/utf8_gen.py index 606e8cc2d31..9f1eef2b9c2 100644 --- a/tests/internal/data/pack/utf8_gen.py +++ b/tests/internal/data/pack/utf8_gen.py @@ -6,27 +6,34 @@ import msgpack def gen_json(f): + print(f) - print f - - with io.open(f, 'rb') as raw: + with open(f, 'rb') as raw: data = raw.read() - out_mp = f[:-4] + ".mp" - out_json = f[:-4] + ".json" + out_mp = f"{os.path.splitext(f)[0]}.mp" + out_json = f"{os.path.splitext(f)[0]}.json" + + # Decode input bytes to a string + try: + decoded_data = data.decode('utf-8') + except UnicodeDecodeError as e: + print(f"Error: Unable to decode file {f} as UTF-8: {e}") + return # Write messagepack - fmp = open(out_mp, 'w') - fmp.write(msgpack.packb(data)) - fmp.close() + with open(out_mp, 'wb') as fmp: + fmp.write(msgpack.packb(decoded_data)) - fjson = open(out_json, 'w') - fjson.write(json.dumps(data).encode('utf8')) - fjson.close() + # Write JSON with properly encoded Unicode escape sequences + with open(out_json, 'w', encoding='utf-8') as fjson: + # Use json.dumps with ensure_ascii=True for \uXXXX escape sequences + escaped_data = json.dumps(decoded_data, ensure_ascii=True) + fjson.write(escaped_data) for fn in os.listdir('.'): - if not os.path.isfile(fn): - continue + if not os.path.isfile(fn): + continue - if fn.startswith('utf8_') and fn.endswith('.txt'): - gen_json(fn) + if fn.startswith('utf8_') and fn.endswith('.txt'): + gen_json(fn) diff --git a/tests/internal/data/pack/utf8_hokke.json b/tests/internal/data/pack/utf8_hokke.json index d93624bf21f..37f460c4b35 100644 --- a/tests/internal/data/pack/utf8_hokke.json +++ b/tests/internal/data/pack/utf8_hokke.json @@ -1 +1 @@ -"𩸽" \ No newline at end of file +"\ud867\ude3d" \ No newline at end of file diff --git a/tests/internal/data/pack/utf8_relaxed.json b/tests/internal/data/pack/utf8_relaxed.json index 4526bf40faf..2402faf9df4 100644 --- a/tests/internal/data/pack/utf8_relaxed.json +++ b/tests/internal/data/pack/utf8_relaxed.json @@ -1 +1 @@ -"☺" \ No newline at end of file +"\u263a" \ No newline at end of file