Skip to content

Commit

Permalink
[BUG] Bump up max_header_size (#3068)
Browse files Browse the repository at this point in the history
# Overview
Bump up the `MAX_HEADER_SIZE` constant that's being used to compare
against the page sizes.

**(Naming seems weird. We're using the max "header" size to guard
against "page" sizes.)**
  • Loading branch information
Raunak Bhagat authored Oct 23, 2024
1 parent d1b06fb commit c69ee3f
Show file tree
Hide file tree
Showing 2 changed files with 10 additions and 11 deletions.
13 changes: 6 additions & 7 deletions src/daft-parquet/src/file.rs
Original file line number Diff line number Diff line change
Expand Up @@ -313,10 +313,9 @@ pub struct ParquetFileReader {

impl ParquetFileReader {
const DEFAULT_CHUNK_SIZE: usize = 2048;
// Set to a very high number 256MB to guard against unbounded large
// downloads from remote storage, which likely indicates corrupted Parquet data
// See: https://github.com/Eventual-Inc/Daft/issues/1551
const MAX_HEADER_SIZE: usize = 256 * 1024 * 1024;
// Set to 2GB because that's the maximum size of strings allowable by Parquet (using i32 offsets).
// See issue: https://github.com/Eventual-Inc/Daft/issues/3007
const MAX_PAGE_SIZE: usize = 2 * 1024 * 1024 * 1024;

fn new(
uri: String,
Expand Down Expand Up @@ -473,7 +472,7 @@ impl ParquetFileReader {
range_reader,
vec![],
Arc::new(|_, _| true),
Self::MAX_HEADER_SIZE,
Self::MAX_PAGE_SIZE,
)
.with_context(
|_| UnableToCreateParquetPageStreamSnafu::<String> {
Expand Down Expand Up @@ -638,7 +637,7 @@ impl ParquetFileReader {
range_reader,
vec![],
Arc::new(|_, _| true),
Self::MAX_HEADER_SIZE,
Self::MAX_PAGE_SIZE,
)
.with_context(|_| {
UnableToCreateParquetPageStreamSnafu::<String> {
Expand Down Expand Up @@ -821,7 +820,7 @@ impl ParquetFileReader {
range_reader,
vec![],
Arc::new(|_, _| true),
Self::MAX_HEADER_SIZE,
Self::MAX_PAGE_SIZE,
)
.with_context(|_| {
UnableToCreateParquetPageStreamSnafu::<String> {
Expand Down
8 changes: 4 additions & 4 deletions src/parquet2/src/read/page/stream.rs
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ pub async fn get_page_stream_from_column_start<'a, R: AsyncRead + Unpin + Send>(
reader: &'a mut R,
scratch: Vec<u8>,
pages_filter: PageFilter,
max_header_size: usize,
max_page_size: usize,
) -> Result<impl Stream<Item = Result<CompressedPage>> + 'a> {
let page_metadata: PageMetaData = column_metadata.into();
Ok(_get_page_stream(
Expand All @@ -47,7 +47,7 @@ pub async fn get_page_stream_from_column_start<'a, R: AsyncRead + Unpin + Send>(
page_metadata.descriptor,
scratch,
pages_filter,
max_header_size,
max_page_size,
))
}

Expand All @@ -56,7 +56,7 @@ pub fn get_owned_page_stream_from_column_start<R: AsyncRead + Unpin + Send>(
reader: R,
scratch: Vec<u8>,
pages_filter: PageFilter,
max_header_size: usize,
max_page_size: usize,
) -> Result<impl Stream<Item = Result<CompressedPage>>> {
let page_metadata: PageMetaData = column_metadata.into();
Ok(_get_owned_page_stream(
Expand All @@ -66,7 +66,7 @@ pub fn get_owned_page_stream_from_column_start<R: AsyncRead + Unpin + Send>(
page_metadata.descriptor,
scratch,
pages_filter,
max_header_size,
max_page_size,
))
}

Expand Down

0 comments on commit c69ee3f

Please sign in to comment.