Skip to content

Commit

Permalink
[BUG] raise error if non valid parquet file (less than parquet footer…
Browse files Browse the repository at this point in the history
… size) (#1628)
  • Loading branch information
samster25 authored Nov 17, 2023
1 parent b88cc8f commit a7bd5ec
Show file tree
Hide file tree
Showing 4 changed files with 38 additions and 0 deletions.
6 changes: 6 additions & 0 deletions src/daft-parquet/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,12 @@ pub enum Error {
footer
))]
InvalidParquetFile { path: String, footer: Vec<u8> },
#[snafu(display(
"File: {} is not a valid parquet file and is only {} bytes, smaller than the minimum size of 12 bytes",
path,
file_size
))]
FileTooSmall { path: String, file_size: usize },
#[snafu(display(
"File: {} has a footer size: {} greater than the file size: {}",
path,
Expand Down
6 changes: 6 additions & 0 deletions src/daft-parquet/src/metadata.rs
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,12 @@ pub(crate) async fn read_parquet_metadata(
) -> super::Result<FileMetaData> {
const FOOTER_SIZE: usize = 8;
const PARQUET_MAGIC: [u8; 4] = [b'P', b'A', b'R', b'1'];
if size < 12 {
return Err(Error::FileTooSmall {
path: uri.into(),
file_size: size,
});
}

/// The number of bytes read at the end of the parquet file on first read
const DEFAULT_FOOTER_READ_SIZE: usize = 128 * 1024;
Expand Down
14 changes: 14 additions & 0 deletions src/daft-parquet/src/stream_reader.rs
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,20 @@ pub(crate) fn local_parquet_read_into_arrow(
let mut reader = File::open(uri).with_context(|_| super::InternalIOSnafu {
path: uri.to_string(),
})?;
let size = reader
.metadata()
.with_context(|_| super::InternalIOSnafu {
path: uri.to_string(),
})?
.len();

if size < 12 {
return Err(super::Error::FileTooSmall {
path: uri.into(),
file_size: size as usize,
});
}

let metadata = read::read_metadata(&mut reader).with_context(|_| {
super::UnableToParseMetadataFromLocalFileSnafu {
path: uri.to_string(),
Expand Down
12 changes: 12 additions & 0 deletions tests/table/table_io/test_parquet.py
Original file line number Diff line number Diff line change
Expand Up @@ -314,3 +314,15 @@ def test_parquet_read_int96_timestamps_schema_inference(coerce_to, store_schema)
) as f:
schema = Schema.from_parquet(f, coerce_int96_timestamp_unit=coerce_to)
assert schema == expected, f"Expected:\n{expected}\n\nReceived:\n{schema}"


@pytest.mark.parametrize("n_bytes", [0, 1, 2, 7, 8, 11])
def test_read_empty_parquet_file(tmpdir, n_bytes):
    """Reading a file shorter than 12 bytes must raise a clear ``ValueError``.

    The reader reports files below 12 bytes as too small to be parquet.
    The parametrized sizes cover the empty file, a few tiny files, and the
    8..11 byte range just below the limit, so the test pins the documented
    12-byte boundary rather than only sizes far away from it.

    Args:
        tmpdir: pytest fixture providing a per-test temporary directory.
        n_bytes: number of filler bytes written to the bogus parquet file.
    """
    file_path = pathlib.Path(tmpdir) / "file.parquet"
    # The content is irrelevant (ASCII "0" filler); only the file length matters.
    file_path.write_bytes(b"0" * n_bytes)

    with pytest.raises(ValueError, match="smaller than the minimum size of 12 bytes"):
        Table.read_parquet(file_path.as_posix())

0 comments on commit a7bd5ec

Please sign in to comment.