From a7bd5ec927485e0ee27846fe7c182d362b11658b Mon Sep 17 00:00:00 2001
From: Sammy Sidhu
Date: Thu, 16 Nov 2023 21:38:01 -0800
Subject: [PATCH] [BUG] raise error if non valid parquet file (less than parquet footer size) (#1628)

---
 src/daft-parquet/src/lib.rs           |  6 ++++++
 src/daft-parquet/src/metadata.rs      |  6 ++++++
 src/daft-parquet/src/stream_reader.rs | 14 ++++++++++++++
 tests/table/table_io/test_parquet.py  | 12 ++++++++++++
 4 files changed, 38 insertions(+)

diff --git a/src/daft-parquet/src/lib.rs b/src/daft-parquet/src/lib.rs
index c2e078afa4..ff0dc57691 100644
--- a/src/daft-parquet/src/lib.rs
+++ b/src/daft-parquet/src/lib.rs
@@ -87,6 +87,12 @@ pub enum Error {
         footer
     ))]
     InvalidParquetFile { path: String, footer: Vec<u8> },
+    #[snafu(display(
+        "File: {} is not a valid parquet file and is only {} bytes, smaller than the minimum size of 12 bytes",
+        path,
+        file_size
+    ))]
+    FileTooSmall { path: String, file_size: usize },
     #[snafu(display(
         "File: {} has a footer size: {} greater than the file size: {}",
         path,
diff --git a/src/daft-parquet/src/metadata.rs b/src/daft-parquet/src/metadata.rs
index b2afc79d27..f0a940b147 100644
--- a/src/daft-parquet/src/metadata.rs
+++ b/src/daft-parquet/src/metadata.rs
@@ -20,6 +20,12 @@ pub(crate) async fn read_parquet_metadata(
 ) -> super::Result<FileMetaData> {
     const FOOTER_SIZE: usize = 8;
     const PARQUET_MAGIC: [u8; 4] = [b'P', b'A', b'R', b'1'];
+    if size < 12 {
+        return Err(Error::FileTooSmall {
+            path: uri.into(),
+            file_size: size,
+        });
+    }
 
     /// The number of bytes read at the end of the parquet file on first read
     const DEFAULT_FOOTER_READ_SIZE: usize = 128 * 1024;
diff --git a/src/daft-parquet/src/stream_reader.rs b/src/daft-parquet/src/stream_reader.rs
index 9f65d19210..e0e7261d4c 100644
--- a/src/daft-parquet/src/stream_reader.rs
+++ b/src/daft-parquet/src/stream_reader.rs
@@ -64,6 +64,20 @@ pub(crate) fn local_parquet_read_into_arrow(
     let mut reader = File::open(uri).with_context(|_| super::InternalIOSnafu {
         path: uri.to_string(),
     })?;
+    let size = reader
+        .metadata()
+        .with_context(|_| super::InternalIOSnafu {
+            path: uri.to_string(),
+        })?
+        .len();
+
+    if size < 12 {
+        return Err(super::Error::FileTooSmall {
+            path: uri.into(),
+            file_size: size as usize,
+        });
+    }
+
     let metadata = read::read_metadata(&mut reader).with_context(|_| {
         super::UnableToParseMetadataFromLocalFileSnafu {
             path: uri.to_string(),
diff --git a/tests/table/table_io/test_parquet.py b/tests/table/table_io/test_parquet.py
index 84dc5c12c2..2becf7bd76 100644
--- a/tests/table/table_io/test_parquet.py
+++ b/tests/table/table_io/test_parquet.py
@@ -314,3 +314,15 @@ def test_parquet_read_int96_timestamps_schema_inference(coerce_to, store_schema)
     ) as f:
         schema = Schema.from_parquet(f, coerce_int96_timestamp_unit=coerce_to)
         assert schema == expected, f"Expected:\n{expected}\n\nReceived:\n{schema}"
+
+
+@pytest.mark.parametrize("n_bytes", [0, 1, 2, 7])
+def test_read_empty_parquet_file(tmpdir, n_bytes):
+
+    tmpdir = pathlib.Path(tmpdir)
+    file_path = tmpdir / "file.parquet"
+    with open(file_path, "wb") as f:
+        for _ in range(n_bytes):
+            f.write(b"0")
+    with pytest.raises(ValueError, match="smaller than the minimum size of 12 bytes"):
+        Table.read_parquet(file_path.as_posix())