diff --git a/src/daft-parquet/src/file.rs b/src/daft-parquet/src/file.rs index 84dfa5a83e..730d0b3315 100644 --- a/src/daft-parquet/src/file.rs +++ b/src/daft-parquet/src/file.rs @@ -313,10 +313,9 @@ pub struct ParquetFileReader { impl ParquetFileReader { const DEFAULT_CHUNK_SIZE: usize = 2048; - // Set to a very high number 256MB to guard against unbounded large - // downloads from remote storage, which likely indicates corrupted Parquet data - // See: https://github.com/Eventual-Inc/Daft/issues/1551 - const MAX_HEADER_SIZE: usize = 256 * 1024 * 1024; + // Set to 2GB because that's the maximum size of strings allowable by Parquet (using i32 offsets). + // See issue: https://github.com/Eventual-Inc/Daft/issues/3007 + const MAX_PAGE_SIZE: usize = 2 * 1024 * 1024 * 1024; fn new( uri: String, @@ -473,7 +472,7 @@ impl ParquetFileReader { range_reader, vec![], Arc::new(|_, _| true), - Self::MAX_HEADER_SIZE, + Self::MAX_PAGE_SIZE, ) .with_context( |_| UnableToCreateParquetPageStreamSnafu:: { @@ -638,7 +637,7 @@ impl ParquetFileReader { range_reader, vec![], Arc::new(|_, _| true), - Self::MAX_HEADER_SIZE, + Self::MAX_PAGE_SIZE, ) .with_context(|_| { UnableToCreateParquetPageStreamSnafu:: { @@ -821,7 +820,7 @@ impl ParquetFileReader { range_reader, vec![], Arc::new(|_, _| true), - Self::MAX_HEADER_SIZE, + Self::MAX_PAGE_SIZE, ) .with_context(|_| { UnableToCreateParquetPageStreamSnafu:: { diff --git a/src/parquet2/src/read/page/stream.rs b/src/parquet2/src/read/page/stream.rs index 25fb0fe6fc..523fc335b6 100644 --- a/src/parquet2/src/read/page/stream.rs +++ b/src/parquet2/src/read/page/stream.rs @@ -37,7 +37,7 @@ pub async fn get_page_stream_from_column_start<'a, R: AsyncRead + Unpin + Send>( reader: &'a mut R, scratch: Vec, pages_filter: PageFilter, - max_header_size: usize, + max_page_size: usize, ) -> Result> + 'a> { let page_metadata: PageMetaData = column_metadata.into(); Ok(_get_page_stream( @@ -47,7 +47,7 @@ pub async fn get_page_stream_from_column_start<'a, R: AsyncRead + Unpin + Send>( page_metadata.descriptor, scratch, pages_filter, - max_header_size, + max_page_size, )) } @@ -56,7 +56,7 @@ pub fn get_owned_page_stream_from_column_start( reader: R, scratch: Vec, pages_filter: PageFilter, - max_header_size: usize, + max_page_size: usize, ) -> Result>> { let page_metadata: PageMetaData = column_metadata.into(); Ok(_get_owned_page_stream( @@ -66,7 +66,7 @@ pub fn get_owned_page_stream_from_column_start( page_metadata.descriptor, scratch, pages_filter, - max_header_size, + max_page_size, )) }