From 1d21bf9b8f353cf89243d69815fb021c49b70f3e Mon Sep 17 00:00:00 2001 From: Sammy Sidhu Date: Mon, 30 Oct 2023 21:33:26 +0400 Subject: [PATCH] [BUG] add sort after running passes (#1545) * Verified that this fixes Parquet reading from S3 on `19.parquet`. * The bug occurred when none of our passes changed the set of ranges, so the new (sorted) range list never overwrote the current value and `self.ranges` was left unsorted. --- src/daft-parquet/src/read_planner.rs | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/src/daft-parquet/src/read_planner.rs b/src/daft-parquet/src/read_planner.rs index f9615024c7..76e557fef6 100644 --- a/src/daft-parquet/src/read_planner.rs +++ b/src/daft-parquet/src/read_planner.rs @@ -144,16 +144,20 @@ impl ReadPlanner { self.ranges = ranges; } } - Ok(()) } pub fn collect( - self, + mut self, io_client: Arc, io_stats: Option, ) -> DaftResult> { let mut entries = Vec::with_capacity(self.ranges.len()); + + // We have to sort again to maintain the invariant of the list being sorted after running passes + // We also have to do this before the loop so we spawn tokio tasks front to back of the file + self.ranges.sort_by_key(|v| v.start); + for range in self.ranges { let owned_io_client = io_client.clone(); let owned_url = self.source.clone();