Skip to content

Commit

Permalink
[FEAT] Ellipsize glob scan paths (#2809)
Browse files Browse the repository at this point in the history
Resolves #2709
- Previously if there were multiple urls in the logical plan, it
cluttered the output of 'df.explain' with massive text. Now we add
ellipses if there are more than six, improving readability.
- The current test emulates multiple urls from the same test fixture
'mvp.parquet'. Ideally there should be a test fixture with multiple
parts of a parquet file.
  • Loading branch information
anmolsingh20 authored Sep 9, 2024
1 parent 5cf876a commit d30e62a
Show file tree
Hide file tree
Showing 2 changed files with 60 additions and 2 deletions.
21 changes: 20 additions & 1 deletion src/daft-scan/src/glob.rs
Original file line number Diff line number Diff line change
Expand Up @@ -272,9 +272,28 @@ impl ScanOperator for GlobScanOperator {
}

fn multiline_display(&self) -> Vec<String> {
let condensed_glob_paths = if self.glob_paths.len() <= 7 {
self.glob_paths.join(", ")
} else {
let first_three: Vec<String> = self.glob_paths.iter().take(3).cloned().collect();
let last_three: Vec<String> = self
.glob_paths
.iter()
.skip(self.glob_paths.len() - 3)
.cloned()
.collect();

let mut result = first_three.join(", ");
result.push_str(", ...");
result.push_str(", ");
result.push_str(&last_three.join(", "));

result
};

let mut lines = vec![
"GlobScanOperator".to_string(),
format!("Glob paths = [{}]", self.glob_paths.join(", ")),
format!("Glob paths = [{}]", condensed_glob_paths),
];
lines.extend(self.file_format_config.multiline_display());
lines.extend(self.storage_config.multiline_display());
Expand Down
41 changes: 40 additions & 1 deletion src/daft-scan/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -967,8 +967,9 @@ mod test {

use crate::{
file_format::{FileFormatConfig, ParquetSourceConfig},
glob::GlobScanOperator,
storage_config::{NativeStorageConfig, StorageConfig},
DataSource, Pushdowns, ScanTask,
DataSource, Pushdowns, ScanOperator, ScanTask,
};

fn make_scan_task(num_sources: usize) -> ScanTask {
Expand Down Expand Up @@ -1003,6 +1004,44 @@ mod test {
)
}

fn make_glob_scan_operator(num_sources: usize) -> GlobScanOperator {
let file_format_config: FileFormatConfig = FileFormatConfig::Parquet(ParquetSourceConfig {
coerce_int96_timestamp_unit: TimeUnit::Seconds,
field_id_mapping: None,
row_groups: None,
chunk_size: None,
});

let mut sources: Vec<String> = Vec::new();

for _ in 0..num_sources {
sources.push(format!("../../tests/assets/parquet-data/mvp.parquet"));
}

let glob_paths: Vec<&str> = sources.iter().map(|s| s.as_str()).collect();

let glob_scan_operator: GlobScanOperator = GlobScanOperator::try_new(
&glob_paths,
Arc::new(file_format_config),
Arc::new(StorageConfig::Native(Arc::new(
NativeStorageConfig::new_internal(false, None),
))),
false,
Some(Arc::new(Schema::empty())),
)
.unwrap();

glob_scan_operator
}

#[test]
fn test_glob_display_condenses() -> DaftResult<()> {
let glob_scan_operator: GlobScanOperator = make_glob_scan_operator(8);
let condensed_glob_paths: Vec<String> = glob_scan_operator.multiline_display();
assert_eq!(condensed_glob_paths[1], "Glob paths = [../../tests/assets/parquet-data/mvp.parquet, ../../tests/assets/parquet-data/mvp.parquet, ../../tests/assets/parquet-data/mvp.parquet, ..., ../../tests/assets/parquet-data/mvp.parquet, ../../tests/assets/parquet-data/mvp.parquet, ../../tests/assets/parquet-data/mvp.parquet]");
Ok(())
}

#[test]
fn test_display_condenses() -> DaftResult<()> {
let scan_task = make_scan_task(7);
Expand Down

0 comments on commit d30e62a

Please sign in to comment.