From a44c3dcf9cd39691689c2c51a1ee37660ff4a7a3 Mon Sep 17 00:00:00 2001 From: Kevin Yan Date: Fri, 10 Jan 2025 01:28:25 +0000 Subject: [PATCH] Fix unexpected reader_type kwarg error --- .../tests/utils/data/test_file.parquet.gz | Bin 0 -> 630 bytes deltacat/tests/utils/test_pyarrow.py | 23 ++++++++++++++++++ deltacat/utils/pyarrow.py | 4 +-- 3 files changed, 25 insertions(+), 2 deletions(-) create mode 100644 deltacat/tests/utils/data/test_file.parquet.gz diff --git a/deltacat/tests/utils/data/test_file.parquet.gz b/deltacat/tests/utils/data/test_file.parquet.gz new file mode 100644 index 0000000000000000000000000000000000000000..673dc0146e0a009b550e0ff9d0a64909fa582db0 GIT binary patch literal 630 zcmV-+0*U<}iwFoJ#_(hS19W9`bYEs^Y-KKRVRCVGWpn_wRLySFKoDO4#5h$dMOAjS zmV)F%YI;aRT2w$)inLA>1rbV!KyA4olUQyf#|gGecmkfGPr$K9)JNciIB=*?kDNGi zKpZ%*W5ND(-kAUgh?4IIM07d zk*diFRg+kPR#H%ISsgZtAV{)tDipD-ZRC-nQN?@h+D?E)qL9x5`6?JKL%m9Btqg>* z2=ww8V7}kPkJ_9g=(s&rIo5Qo;lOi73imsf?~g)>d)p(2-zs1_7j+-4C~*Ll_%RP! zD8XtVcT5d%BSRkJSzF2&A+|eqb+h$Mgqb=-PS^ zT7Hn_K;8Dj!PxHGDZS4`*XZK8OBeN=E~G!gw@A;=tnpX1tm1rcdQ znq%GWv)#@3x#%C^F4zMG_MFA`gs}-T9oA&bVRa|MOp~#v;M5s=90Bu{#je?y?RQ@A zqz}2%XvW|Kz700BeE<*M4Bc>lcM_F0r|10h66*5$E#Q$H#@47erX*7G9qz5acVFED znhx8JNcE<{9 literal 0 HcmV?d00001 diff --git a/deltacat/tests/utils/test_pyarrow.py b/deltacat/tests/utils/test_pyarrow.py index 90a2a838..58afe5d5 100644 --- a/deltacat/tests/utils/test_pyarrow.py +++ b/deltacat/tests/utils/test_pyarrow.py @@ -16,6 +16,7 @@ import pyarrow as pa PARQUET_FILE_PATH = "deltacat/tests/utils/data/test_file.parquet" +PARQUET_GZIP_COMPRESSED_FILE_PATH = "deltacat/tests/utils/data/test_file.parquet.gz" EMPTY_UTSV_PATH = "deltacat/tests/utils/data/empty.csv" NON_EMPTY_VALID_UTSV_PATH = "deltacat/tests/utils/data/non_empty_valid.csv" OVERFLOWING_DECIMAL_PRECISION_UTSV_PATH = ( @@ -789,3 +790,25 @@ def test_s3_file_to_table_when_parquet_schema_overridden(self): self.assertEqual(field.name, schema.field(index).name) self.assertEqual(result.schema.field(1).type, "string") + + def test_s3_file_to_table_when_parquet_gzip(self): + + pa_kwargs_provider = lambda content_type, kwargs: { + "reader_type": "pyarrow", + **kwargs, + } + + result = s3_file_to_table( + PARQUET_GZIP_COMPRESSED_FILE_PATH, + ContentType.PARQUET.value, + ContentEncoding.GZIP.value, + ["n_legs", "animal"], + ["n_legs"], + pa_read_func_kwargs_provider=pa_kwargs_provider, + ) + + self.assertEqual(len(result), 6) + self.assertEqual(len(result.column_names), 1) + schema = result.schema + schema_index = schema.get_field_index("n_legs") + self.assertEqual(schema.field(schema_index).type, "int64") diff --git a/deltacat/utils/pyarrow.py b/deltacat/utils/pyarrow.py index 653ac614..31167cc6 100644 --- a/deltacat/utils/pyarrow.py +++ b/deltacat/utils/pyarrow.py @@ -569,8 +569,8 @@ def s3_file_to_table( **s3_client_kwargs, ) - if READER_TYPE_KWARG in kwargs: - kwargs.pop(READER_TYPE_KWARG) + if READER_TYPE_KWARG in kwargs: + kwargs.pop(READER_TYPE_KWARG, None) filesystem = io if s3_url.startswith("s3://"):