fix typo and reduce 1 alloc
Colin Ho authored and Colin Ho committed Sep 26, 2024
1 parent 3321272 commit 355c05d
Showing 6 changed files with 33 additions and 29 deletions.
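
The two changes in this commit are a docstring fix in daft/io/_parquet.py (the _multithreaded_io description appears to have been copied from the file_path_column entry) and a refactor of the file_path_column parameter across the CSV, JSON, and Parquet read paths from Option<String> to Option<&str>, so callers can lend the column name with .as_deref() instead of cloning a String. The snippet below is a minimal, self-contained sketch of that owned-vs-borrowed change; the function names and bodies are invented for illustration and are not Daft's actual signatures.

// Hypothetical stand-ins for the readers touched in this commit; the point is
// only the parameter type, Option<String> (owned) vs Option<&str> (borrowed).

// Before: each call needed an owned String, so callers holding an
// Option<String> had to clone it.
fn read_single_owned(uri: &str, file_path_column: Option<String>) -> String {
    match file_path_column {
        Some(name) => format!("{uri} with column {name}"),
        None => uri.to_string(),
    }
}

// After: the reader only inspects the name, so borrowing is enough.
fn read_single_borrowed(uri: &str, file_path_column: Option<&str>) -> String {
    match file_path_column {
        Some(name) => format!("{uri} with column {name}"),
        None => uri.to_string(),
    }
}

fn main() {
    let file_path_column: Option<String> = Some("path".to_string());
    // Old call style: one extra String allocation per call.
    let a = read_single_owned("file://x.parquet", file_path_column.clone());
    // New call style: Option<String> -> Option<&str> without allocating.
    let b = read_single_borrowed("file://x.parquet", file_path_column.as_deref());
    assert_eq!(a, b);
}

In the diff, the call sites in src/daft-micropartition now pass .as_deref(), and the bulk readers convert back to an owned String only where a spawned task needs it, which is presumably where the "1 alloc" in the commit message goes away.
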
2 changes: 1 addition & 1 deletion daft/io/_parquet.py
@@ -49,7 +49,7 @@ def read_parquet(
file_path_column: Include the source path(s) as a column with this name. Defaults to None.
use_native_downloader: Whether to use the native downloader instead of PyArrow for reading Parquet.
coerce_int96_timestamp_unit: TimeUnit to coerce Int96 TimeStamps to. e.g.: [ns, us, ms], Defaults to None.
_multithreaded_io: Include the source path(s) as a column called
_multithreaded_io: Whether to use multithreading for IO threads. Setting this to False can be helpful in reducing
the amount of system resources (number of connections and thread contention) when running in the Ray runner.
Defaults to None, which will let Daft decide based on the runner it is currently using.
13 changes: 7 additions & 6 deletions src/daft-csv/src/read.rs
@@ -80,7 +80,7 @@ pub fn read_csv_bulk(
multithreaded_io: bool,
max_chunks_in_flight: Option<usize>,
num_parallel_tasks: usize,
file_path_column: Option<String>,
file_path_column: Option<&str>,
) -> DaftResult<Vec<Table>> {
let runtime_handle = get_runtime(multithreaded_io)?;
let tables = runtime_handle.block_on_current_thread(async move {
@@ -101,7 +101,7 @@
read_options.clone(),
io_client.clone(),
io_stats.clone(),
file_path_column.clone(),
file_path_column.map(|s| s.to_string()),
);
tokio::task::spawn(async move {
read_csv_single_into_table(
@@ -112,7 +112,7 @@
io_client,
io_stats,
max_chunks_in_flight,
file_path_column,
file_path_column.as_deref(),
)
.await
})
@@ -220,7 +220,7 @@ async fn read_csv_single_into_table(
io_client: Arc<IOClient>,
io_stats: Option<IOStatsRef>,
max_chunks_in_flight: Option<usize>,
file_path_column: Option<String>,
file_path_column: Option<&str>,
) -> DaftResult<Table> {
let predicate = convert_options
.as_ref()
@@ -340,9 +340,10 @@ async fn read_csv_single_into_table(
}
}?;

if let Some(file_path_col_name) = file_path_column {
let trimmed = uri.trim_start_matches("file://");
let file_paths_column = Utf8Array::from_iter(
file_path_col_name.as_str(),
std::iter::repeat(Some(uri.trim_start_matches("file://"))).take(output_table.len()),
file_path_col_name,
std::iter::repeat(Some(trimmed)).take(output_table.len()),
)
.into_series();
return output_table.union(&Table::from_nonempty_columns(vec![file_paths_column])?);
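
One subtlety visible in the bulk-reader hunks above: the future handed to tokio::task::spawn must be 'static, so a borrowed Option<&str> cannot cross the spawn boundary directly. The diff therefore converts back to an owned Option<String> once per spawned task (file_path_column.map(|s| s.to_string())) and re-borrows it inside the task with .as_deref(). The sketch below reproduces that shape with made-up helper functions, assuming the tokio crate with its default features; it is not the real read_csv_bulk.

// Simplified stand-in for the bulk readers: copy the column name into each
// spawned task, then lend it to the per-file reader as Option<&str>.
async fn read_single(uri: String, file_path_column: Option<&str>) -> String {
    match file_path_column {
        Some(name) => format!("{uri} -> {name}"),
        None => uri,
    }
}

async fn read_bulk(uris: Vec<String>, file_path_column: Option<&str>) -> Vec<String> {
    let mut handles = Vec::new();
    for uri in uris {
        // Owned copy so the spawned future owns everything it captures.
        let owned_file_path_column = file_path_column.map(|s| s.to_string());
        handles.push(tokio::task::spawn(async move {
            read_single(uri, owned_file_path_column.as_deref()).await
        }));
    }
    let mut tables = Vec::new();
    for handle in handles {
        tables.push(handle.await.expect("task panicked"));
    }
    tables
}

#[tokio::main]
async fn main() {
    let out = read_bulk(
        vec!["file://a.csv".to_string(), "file://b.csv".to_string()],
        Some("path"),
    )
    .await;
    println!("{out:?}");
}

Each task still makes its own owned copy, which the 'static bound more or less forces (short of reaching for an Arc<str>); the saving comes from the outer callers such as materialize_scan_task, which now pass .as_deref() instead of .clone().
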
7 changes: 4 additions & 3 deletions src/daft-json/src/local.rs
@@ -28,7 +28,7 @@ pub fn read_json_local(
parse_options: Option<JsonParseOptions>,
read_options: Option<JsonReadOptions>,
max_chunks_in_flight: Option<usize>,
file_path_column: Option<String>,
file_path_column: Option<&str>,
) -> DaftResult<Table> {
let uri = uri.trim_start_matches("file://");
let file = std::fs::File::open(uri)?;
@@ -46,9 +46,10 @@
)?;
let output_table = reader.finish()?;
if let Some(file_path_col_name) = file_path_column {
let trimmed = uri.trim_start_matches("file://");
let file_paths_column = Utf8Array::from_iter(
file_path_col_name.as_str(),
std::iter::repeat(Some(uri.trim_start_matches("file://"))).take(output_table.len()),
file_path_col_name,
std::iter::repeat(Some(trimmed)).take(output_table.len()),
)
.into_series();
return output_table.union(&Table::from_nonempty_columns(vec![file_paths_column])?);
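
The other repeated change in the single-file readers is small: uri.trim_start_matches("file://") is bound to a local trimmed before the column of repeated path values is built, rather than being written inline inside std::iter::repeat(...). Below is a rough, std-only sketch of that repeated-value column, using Vec<Option<String>> in place of Daft's Utf8Array/Series (which the real code constructs and unions into the output Table); the function name is invented.

// Build one value per output row: the source path with its "file://" scheme
// stripped, repeated table.len() times.
fn file_path_column_values(uri: &str, num_rows: usize) -> Vec<Option<String>> {
    // trim_start_matches returns a borrowed slice of `uri`; binding it once
    // mirrors the diff's `trimmed` local and keeps the repeat expression short.
    let trimmed = uri.trim_start_matches("file://");
    std::iter::repeat(Some(trimmed))
        .take(num_rows)
        .map(|opt| opt.map(String::from))
        .collect()
}

fn main() {
    let col = file_path_column_values("file:///data/part-0.json", 3);
    assert_eq!(col.len(), 3);
    assert_eq!(col[0].as_deref(), Some("/data/part-0.json"));
}
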
13 changes: 7 additions & 6 deletions src/daft-json/src/read.rs
@@ -73,7 +73,7 @@ pub fn read_json_bulk(
multithreaded_io: bool,
max_chunks_in_flight: Option<usize>,
num_parallel_tasks: usize,
file_path_column: Option<String>,
file_path_column: Option<&str>,
) -> DaftResult<Vec<Table>> {
let runtime_handle = get_runtime(multithreaded_io)?;
let tables = runtime_handle.block_on_current_thread(async move {
@@ -94,7 +94,7 @@
read_options.clone(),
io_client.clone(),
io_stats.clone(),
file_path_column.clone(),
file_path_column.map(|s| s.to_string()),
);
tokio::task::spawn(async move {
let table = read_json_single_into_table(
@@ -105,7 +105,7 @@
io_client,
io_stats,
max_chunks_in_flight,
file_path_column,
file_path_column.as_deref(),
)
.await?;
DaftResult::Ok(table)
@@ -189,7 +189,7 @@ async fn read_json_single_into_table(
io_client: Arc<IOClient>,
io_stats: Option<IOStatsRef>,
max_chunks_in_flight: Option<usize>,
file_path_column: Option<String>,
file_path_column: Option<&str>,
) -> DaftResult<Table> {
let (source_type, fixed_uri) = parse_url(uri)?;
let is_compressed = CompressionCodec::from_uri(uri).is_some();
@@ -303,9 +303,10 @@ async fn read_json_single_into_table(
}
}?;

if let Some(file_path_col_name) = file_path_column {
let trimmed = uri.trim_start_matches("file://");
let file_paths_column = Utf8Array::from_iter(
file_path_col_name.as_str(),
std::iter::repeat(Some(uri.trim_start_matches("file://"))).take(output_table.len()),
file_path_col_name,
std::iter::repeat(Some(trimmed)).take(output_table.len()),
)
.into_series();
return output_table.union(&Table::from_nonempty_columns(vec![file_paths_column])?);

14 changes: 7 additions & 7 deletions src/daft-micropartition/src/micropartition.rs
@@ -181,7 +181,7 @@ fn materialize_scan_task(
metadatas,
Some(delete_map),
*chunk_size,
scan_task.file_path_column.clone(),
scan_task.file_path_column.as_deref(),
)
.context(DaftCoreComputeSnafu)?
}
@@ -236,7 +236,7 @@ fn materialize_scan_task(
native_storage_config.multithreaded_io,
None,
8,
scan_task.file_path_column.clone(),
scan_task.file_path_column.as_deref(),
)
.context(DaftCoreComputeSnafu)?
}
@@ -267,7 +267,7 @@ fn materialize_scan_task(
native_storage_config.multithreaded_io,
None,
8,
scan_task.file_path_column.clone(),
scan_task.file_path_column.as_deref(),
)
.context(DaftCoreComputeSnafu)?
}
@@ -656,7 +656,7 @@ impl MicroPartition {
field_id_mapping.clone(),
parquet_metadata,
chunk_size,
scan_task.file_path_column.clone(),
scan_task.file_path_column.as_deref(),
)
.context(DaftCoreComputeSnafu)
}
@@ -1039,7 +1039,7 @@ fn _read_parquet_into_loaded_micropartition<T: AsRef<str>>(
catalog_provided_schema: Option<SchemaRef>,
field_id_mapping: Option<Arc<BTreeMap<i32, Field>>>,
chunk_size: Option<usize>,
file_path_column: Option<String>,
file_path_column: Option<&str>,
) -> DaftResult<MicroPartition> {
let delete_map = iceberg_delete_files
.map(|files| {
@@ -1125,7 +1125,7 @@ pub(crate) fn read_parquet_into_micropartition<T: AsRef<str>>(
field_id_mapping: Option<Arc<BTreeMap<i32, Field>>>,
parquet_metadata: Option<Vec<Arc<FileMetaData>>>,
chunk_size: Option<usize>,
file_path_column: Option<String>,
file_path_column: Option<&str>,
) -> DaftResult<MicroPartition> {
if let Some(so) = start_offset
&& so > 0
@@ -1315,7 +1315,7 @@ pub(crate) fn read_parquet_into_micropartition<T: AsRef<str>>(
}),
num_rows,
),
file_path_column,
file_path_column.map(|s| s.to_string()),
);

let fill_map = scan_task.partition_spec().map(|pspec| pspec.to_fill_map());
13 changes: 7 additions & 6 deletions src/daft-parquet/src/read.rs
@@ -156,7 +156,7 @@ async fn read_parquet_single(
metadata: Option<Arc<FileMetaData>>,
delete_rows: Option<Vec<i64>>,
chunk_size: Option<usize>,
file_path_column: Option<String>,
file_path_column: Option<&str>,
) -> DaftResult<Table> {
let field_id_mapping_provided = field_id_mapping.is_some();
let mut columns_to_read = columns.clone();
@@ -358,9 +358,10 @@ async fn read_parquet_single(
}

if let Some(file_path_col_name) = file_path_column {
let trimmed = uri.trim_start_matches("file://");
let file_paths_column = Utf8Array::from_iter(
file_path_col_name.as_str(),
std::iter::repeat(Some(uri.trim_start_matches("file://"))).take(table.len()),
file_path_col_name,
std::iter::repeat(Some(trimmed)).take(table.len()),
)
.into_series();
return table.union(&Table::from_nonempty_columns(vec![file_paths_column])?);
@@ -762,7 +763,7 @@ pub fn read_parquet_bulk<T: AsRef<str>>(
metadata: Option<Vec<Arc<FileMetaData>>>,
delete_map: Option<HashMap<String, Vec<i64>>>,
chunk_size: Option<usize>,
file_path_column: Option<String>,
file_path_column: Option<&str>,
) -> DaftResult<Vec<Table>> {
let runtime_handle = daft_io::get_runtime(multithreaded_io)?;

@@ -790,7 +791,7 @@ pub fn read_parquet_bulk<T: AsRef<str>>(
let schema_infer_options = *schema_infer_options;
let owned_field_id_mapping = field_id_mapping.clone();
let delete_rows = delete_map.as_ref().and_then(|m| m.get(&uri).cloned());
let owned_file_path_column = file_path_column.clone();
let owned_file_path_column = file_path_column.map(|s| s.to_string());
tokio::task::spawn(async move {
read_parquet_single(
&uri,
Expand All @@ -806,7 +807,7 @@ pub fn read_parquet_bulk<T: AsRef<str>>(
metadata,
delete_rows,
chunk_size,
owned_file_path_column,
owned_file_path_column.as_deref(),
)
.await
})
