-
Notifications
You must be signed in to change notification settings - Fork 174
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
[FEAT] Add smart planning of ScanTasks starting with merging by files…
…izes (#1692) Refactors/changes required on ScanTask itself: 1. Added a `ScanTask::merge` 2. Added a `ScanTask::partition_spec()` 3. Added some validation in `ScanTask::new` to assert that all the underlying sources have the same partition spec. I then added a new module `daft_scan::scan_task_iterators` which contains functions that perform transformations on a `Box<dyn Iterator<Item = DaftResult<ScanTaskRef>>>`. TODO: - [x] Make the file_size configurable (as an environment variable/context flag) so that our unit-tests still run correctly when we do multi-file tests for multi-partition dataframes (see: #1700) --------- Co-authored-by: Jay Chia <[email protected]@users.noreply.github.com>
- Loading branch information
Showing
21 changed files
with
543 additions
and
37 deletions.
There are no files selected for viewing
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,12 @@ | ||
[dependencies] | ||
lazy_static = {workspace = true} | ||
pyo3 = {workspace = true, optional = true} | ||
|
||
[features] | ||
default = ["python"] | ||
python = ["dep:pyo3"] | ||
|
||
[package] | ||
edition = {workspace = true} | ||
name = "common-daft-config" | ||
version = {workspace = true} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,30 @@ | ||
/// Configuration values for Daft.
///
/// Currently holds the two byte-size thresholds named after ScanTask merging.
/// How the thresholds are applied is decided by the code that consumes this
/// struct, which is not part of this module.
#[derive(Clone, Debug, PartialEq, Eq)]
pub struct DaftConfig {
    /// Lower size threshold, in bytes, for merging ScanTasks.
    pub merge_scan_tasks_min_size_bytes: usize,
    /// Upper size threshold, in bytes, for merging ScanTasks.
    pub merge_scan_tasks_max_size_bytes: usize,
}

impl Default for DaftConfig {
    /// Returns the default configuration: a 64 MiB minimum and a 512 MiB
    /// maximum merge size.
    fn default() -> Self {
        DaftConfig {
            merge_scan_tasks_min_size_bytes: 64 * 1024 * 1024, // 64MB
            merge_scan_tasks_max_size_bytes: 512 * 1024 * 1024, // 512MB
        }
    }
}
|
||
#[cfg(feature = "python")] | ||
mod python; | ||
|
||
#[cfg(feature = "python")] | ||
pub use python::PyDaftConfig; | ||
|
||
#[cfg(feature = "python")] | ||
use pyo3::prelude::*; | ||
|
||
/// Registers this crate's Python classes (currently only `PyDaftConfig`) on
/// the given parent module.
///
/// Only compiled when the `python` feature is enabled. Any error reported by
/// `add_class` is returned to the caller unchanged.
#[cfg(feature = "python")]
pub fn register_modules(_py: Python, parent: &PyModule) -> PyResult<()> {
    parent.add_class::<PyDaftConfig>()
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,48 @@ | ||
use std::sync::Arc; | ||
|
||
use pyo3::prelude::*; | ||
|
||
use crate::DaftConfig; | ||
|
||
#[derive(Clone, Default)] | ||
#[pyclass] | ||
pub struct PyDaftConfig { | ||
pub config: Arc<DaftConfig>, | ||
} | ||
|
||
#[pymethods] | ||
impl PyDaftConfig { | ||
#[new] | ||
pub fn new() -> Self { | ||
PyDaftConfig::default() | ||
} | ||
|
||
fn with_config_values( | ||
&mut self, | ||
merge_scan_tasks_min_size_bytes: Option<usize>, | ||
merge_scan_tasks_max_size_bytes: Option<usize>, | ||
) -> PyResult<PyDaftConfig> { | ||
let mut config = self.config.as_ref().clone(); | ||
|
||
if let Some(merge_scan_tasks_max_size_bytes) = merge_scan_tasks_max_size_bytes { | ||
config.merge_scan_tasks_max_size_bytes = merge_scan_tasks_max_size_bytes; | ||
} | ||
if let Some(merge_scan_tasks_min_size_bytes) = merge_scan_tasks_min_size_bytes { | ||
config.merge_scan_tasks_min_size_bytes = merge_scan_tasks_min_size_bytes; | ||
} | ||
|
||
Ok(PyDaftConfig { | ||
config: Arc::new(config), | ||
}) | ||
} | ||
|
||
#[getter(merge_scan_tasks_min_size_bytes)] | ||
fn get_merge_scan_tasks_min_size_bytes(&self) -> PyResult<usize> { | ||
Ok(self.config.merge_scan_tasks_min_size_bytes) | ||
} | ||
|
||
#[getter(merge_scan_tasks_max_size_bytes)] | ||
fn get_merge_scan_tasks_max_size_bytes(&self) -> PyResult<usize> { | ||
Ok(self.config.merge_scan_tasks_max_size_bytes) | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.