[BUG] Use schema_hints as hints instead of definitive schema #1636

Merged: 5 commits, Dec 3, 2023
Changes from 1 commit
7 changes: 3 additions & 4 deletions daft/daft.pyi
@@ -443,7 +443,7 @@ class ScanOperatorHandle:
         glob_path: list[str],
         file_format_config: FileFormatConfig,
         storage_config: StorageConfig,
-        schema: PySchema | None = None,
+        schema_hint: PySchema | None = None,
     ) -> ScanOperatorHandle: ...
     @staticmethod
     def from_python_scan_operator(operator: ScanOperator) -> ScanOperatorHandle: ...
@@ -636,6 +636,7 @@ class PySchema:
    def __getitem__(self, name: str) -> PyField: ...
    def names(self) -> list[str]: ...
    def union(self, other: PySchema) -> PySchema: ...
+   def apply_hints(self, other: PySchema) -> PySchema: ...
    def eq(self, other: PySchema) -> bool: ...
    @staticmethod
    def from_field_name_and_types(names_and_types: list[tuple[str, PyDataType]]) -> PySchema: ...
@@ -917,9 +918,7 @@ class LogicalPlanBuilder:
        partition_key: str, cache_entry: PartitionCacheEntry, schema: PySchema, num_partitions: int
    ) -> LogicalPlanBuilder: ...
    @staticmethod
-   def table_scan_with_scan_operator(
-       scan_operator: ScanOperatorHandle, schema_hint: PySchema | None
-   ) -> LogicalPlanBuilder: ...
+   def table_scan_with_scan_operator(scan_operator: ScanOperatorHandle) -> LogicalPlanBuilder: ...
    @staticmethod
    def table_scan(
        file_infos: FileInfos, schema: PySchema, file_format_config: FileFormatConfig, storage_config: StorageConfig
4 changes: 1 addition & 3 deletions daft/io/_iceberg.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,5 @@
iceberg_operator = IcebergScanOperator(pyiceberg_table, storage_config=storage_config)

handle = ScanOperatorHandle.from_python_scan_operator(iceberg_operator)
builder = LogicalPlanBuilder.from_tabular_scan_with_scan_operator(
scan_operator=handle, schema_hint=iceberg_operator.schema()
)
builder = LogicalPlanBuilder.from_tabular_scan_with_scan_operator(scan_operator=handle)

Check warning on line 84 in daft/io/_iceberg.py

View check run for this annotation

Codecov / codecov/patch

daft/io/_iceberg.py#L84

Added line #L84 was not covered by tests
return DataFrame(builder)
28 changes: 18 additions & 10 deletions daft/io/common.py
@@ -5,6 +5,7 @@

 from daft.context import get_context
 from daft.daft import (
+    CsvSourceConfig,
     FileFormatConfig,
     NativeStorageConfig,
     PythonStorageConfig,
@@ -55,38 +56,45 @@ def _get_tabular_files_scan(
                 path,
                 file_format_config,
                 storage_config,
-                schema=schema_hint._schema if schema_hint is not None else None,
+                schema_hint=schema_hint._schema if schema_hint is not None else None,
             )
         elif isinstance(path, str):
             scan_op = ScanOperatorHandle.glob_scan(
                 [path],
                 file_format_config,
                 storage_config,
-                schema=schema_hint._schema if schema_hint is not None else None,
+                schema_hint=schema_hint._schema if schema_hint is not None else None,
             )
         else:
             raise NotImplementedError(f"_get_tabular_files_scan cannot construct ScanOperatorHandle for input: {path}")

         builder = LogicalPlanBuilder.from_tabular_scan_with_scan_operator(
             scan_operator=scan_op,
-            schema_hint=schema_hint,
         )
         return builder

     paths = path if isinstance(path, list) else [str(path)]
     runner_io = get_context().runner().runner_io()
     file_infos = runner_io.glob_paths_details(paths, file_format_config=file_format_config, io_config=io_config)

-    # Infer schema if no hints provided
-    inferred_or_provided_schema = (
-        schema_hint
-        if schema_hint is not None
-        else runner_io.get_schema_from_first_filepath(file_infos, file_format_config, storage_config)
-    )
+    # Infer schema
+    schema = runner_io.get_schema_from_first_filepath(file_infos, file_format_config, storage_config)
+
+    # Apply hints from schema_hints if provided
+    if schema_hint is not None:
+        # If CSV and no headers, then use the schema hint as the schema
+        if isinstance(file_format_config.config, CsvSourceConfig) and file_format_config.config.has_headers == False:
+            if len(schema) != len(schema_hint):
+                raise ValueError(
+                    f"For CSV with no headers, number of columns in schema hint ({len(schema_hint)} columns were provided) must match number of columns in data: {len(schema)}."
+                )
+            schema = schema_hint
+        else:
+            schema = schema.apply_hints(schema_hint)

A contributor commented on the ValueError check above:

Why did you decide to enforce this invariant here?

Wouldn't the code still work naively if we provided partial hints like "column_0": DataType.string(), and those hints were applied as per the rest of the PR?

The PR author replied:

Based on this existing test case:

def test_create_dataframe_csv_specify_schema_no_headers(
    valid_data: list[dict[str, float]], use_native_downloader
) -> None:
    with create_temp_filename() as fname:
        with open(fname, "w") as f:
            header = list(valid_data[0].keys())
            writer = csv.writer(f, delimiter="\t")
            writer.writerows([[item[col] for col in header] for item in valid_data])
            f.flush()
            df = daft.read_csv(
                fname,
                delimiter="\t",
                schema_hints={
                    "sepal_length": DataType.float64(),
                    "sepal_width": DataType.float64(),
                    "petal_length": DataType.float64(),
                    "petal_width": DataType.float64(),
                    "variety": DataType.string(),
                },
                has_headers=False,
                use_native_downloader=use_native_downloader,
            )
            assert df.column_names == COL_NAMES
            pd_df = df.to_pandas()
            assert list(pd_df.columns) == COL_NAMES
            assert len(pd_df) == len(valid_data)

I thought maybe it made sense for schema_hints to be the definitive schema when the CSV has no headers, as a way to provide named columns instead of the defaults like "column_0" and "column_1"; that only works if hints for all columns are provided.

But I also agree that it would be simpler and more consistent to remove this invariant and let the user realize that column names default to "column_0" etc., so they can name their schema hints accordingly. And column names can always be changed with .alias anyway 😅

Removed these checks in the latest commit.
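For illustration only (not part of this diff), here is a minimal sketch of partial hints against the default column names of a headerless CSV; the file path and the second column are hypothetical, and the read_csv arguments mirror the test above:

import daft
from daft import DataType

# Headerless CSV: inferred column names default to "column_0", "column_1", ...
# A partial hint only overrides the dtype of the column it names; all other
# columns keep their inferred types.
df = daft.read_csv(
    "data.csv",  # hypothetical path
    has_headers=False,
    schema_hints={"column_0": DataType.string()},
)

# Friendlier names can still be assigned afterwards with .alias
df = df.select(daft.col("column_0").alias("id"), daft.col("column_1").alias("value"))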

f"For CSV with no headers, number of columns in schema hint ({len(schema_hint)} columns were provided) must match number of columns in data: {len(schema)}."
)
schema = schema_hint
else:
schema = schema.apply_hints(schema_hint)
# Construct plan
builder = LogicalPlanBuilder.from_tabular_scan(
file_infos=file_infos,
schema=inferred_or_provided_schema,
schema=schema,
file_format_config=file_format_config,
storage_config=storage_config,
)
Expand Down
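To make the PR title concrete, a hedged sketch of the behavioral change in the block above (the CSV path and column names are hypothetical): previously a provided schema_hint was used as the definitive schema; with this change it is layered onto the inferred schema, so unmentioned columns are still read with their inferred types.

import daft
from daft import DataType

# data.csv (hypothetical) has headers: id, name, score
df = daft.read_csv(
    "data.csv",
    schema_hints={"score": DataType.float64()},  # hint for a single column
)

# After this change, all three columns survive; only "score" has its dtype overridden.
assert set(df.column_names) == {"id", "name", "score"}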
4 changes: 1 addition & 3 deletions daft/logical/builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,10 +83,8 @@ def from_tabular_scan_with_scan_operator(
cls,
*,
scan_operator: ScanOperatorHandle,
schema_hint: Schema | None,
) -> LogicalPlanBuilder:
pyschema = schema_hint._schema if schema_hint is not None else None
builder = _LogicalPlanBuilder.table_scan_with_scan_operator(scan_operator, pyschema)
builder = _LogicalPlanBuilder.table_scan_with_scan_operator(scan_operator)
return cls(builder)

@classmethod
Expand Down
6 changes: 6 additions & 0 deletions daft/logical/schema.py
@@ -140,6 +140,12 @@

         return Schema._from_pyschema(self._schema.union(other._schema))

+    def apply_hints(self, other: Schema) -> Schema:
+        if not isinstance(other, Schema):
+            raise ValueError(f"Expected Schema, got other: {type(other)}")
+
+        return Schema._from_pyschema(self._schema.apply_hints(other._schema))
+
     def __reduce__(self) -> tuple:
         return Schema._from_pyschema, (self._schema,)

Codecov / codecov/patch: added line daft/logical/schema.py#L145 was not covered by tests.
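As a rough illustration of the semantics this wrapper exposes (a plain-Python sketch of the rule implemented in schema.rs below, not the daft API itself): fields keep the original schema's order, a field whose name appears in the hints takes the hinted definition, and hint entries with no matching field are ignored.

def apply_hints(inferred: dict[str, str], hints: dict[str, str]) -> dict[str, str]:
    # Keys are column names; values stand in for Daft dtypes.
    # Original column order is preserved; hint-only keys are dropped.
    return {name: hints.get(name, dtype) for name, dtype in inferred.items()}

inferred = {"sepal_length": "float64", "variety": "utf8"}
hints = {"variety": "string", "not_a_column": "int64"}
assert apply_hints(inferred, hints) == {"sepal_length": "float64", "variety": "string"}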
5 changes: 5 additions & 0 deletions src/daft-core/src/python/schema.rs
@@ -33,6 +33,11 @@ impl PySchema {
         Ok(new_schema.into())
     }

+    pub fn apply_hints(&self, hints: &PySchema) -> PyResult<PySchema> {
+        let new_schema = Arc::new(self.schema.apply_hints(&hints.schema)?);
+        Ok(new_schema.into())
+    }
+
     pub fn eq(&self, other: &PySchema) -> PyResult<bool> {
         Ok(self.schema.fields.eq(&other.schema.fields))
     }
11 changes: 11 additions & 0 deletions src/daft-core/src/schema.rs
@@ -86,6 +86,17 @@ impl Schema {
         }
     }

+    pub fn apply_hints(&self, hints: &Schema) -> DaftResult<Schema> {
+        let mut fields = IndexMap::new();
+        for (name, field) in self.fields.iter() {
+            match hints.fields.get(name) {
+                None => fields.insert(name.clone(), field.clone()),
+                Some(hint_field) => fields.insert(name.clone(), hint_field.clone()),
+            };
+        }
+        Ok(Schema { fields })
+    }
+
     pub fn to_arrow(&self) -> DaftResult<arrow2::datatypes::Schema> {
         let arrow_fields: DaftResult<Vec<arrow2::datatypes::Field>> =
             self.fields.iter().map(|(_, f)| f.to_arrow()).collect();

A contributor commented on the for-loop above:

Nice! This preserves the ordering of the original schema as well, which is important.

This is completely fine as-is, but if you wanted, you could use Rust iterators instead, which would let you avoid the intermediate mut fields variable.

I think IndexMaps can be "collected" from an iterator, and something like this might work:

let applied_fields = self.fields
    .iter()
    .map(|(name, field)| match hints.fields.get(name) {
        None => (name.clone(), field.clone()),
        Some(hint_field) => (name.clone(), hint_field.clone()),
    })
    .collect::<IndexMap<String, Field>>();

Ok(Schema { fields: applied_fields })

The PR author replied:

Yup! It works, made the changes. I like it a lot better too; it's more concise and expressive (and maybe more performant? Not sure though, I'll need to learn more about Rust).

The contributor replied, regarding "more performant":

Maybe, depending on how the compiler chooses to optimize it! Iterators are pretty idiomatic in Rust :)
16 changes: 14 additions & 2 deletions src/daft-micropartition/src/micropartition.rs
@@ -348,6 +348,7 @@ impl MicroPartition {
             read_parquet_into_micropartition(
                 uris.as_slice(),
                 columns.as_deref(),
+                Some(schema),
                 None,
                 scan_task.pushdowns.limit,
                 row_groups,

@@ -615,6 +616,7 @@ pub(crate) fn read_csv_into_micropartition(
 pub(crate) fn read_parquet_into_micropartition(
     uris: &[&str],
     columns: Option<&[&str]>,
+    schema: Option<SchemaRef>,
     start_offset: Option<usize>,
     num_rows: Option<usize>,
     row_groups: Option<Vec<Option<Vec<i64>>>>,
A contributor commented on the new schema parameter:

Parquet reads differ a little from CSV reads: for Parquet, the file format itself contains a schema and thus no external schema information is required when reading that file.

Therefore for read_parquet_into_micropartition, we will probably not want to pass in the schema (unlike the CSV reads!)

Instead, we can let read_parquet_into_micropartition perform its own schema inference/data parsing, and then later on we can coerce the resultant MicroPartition into the inferred schema. The overall flow would look something like:

// Naively read Parquet file(s) into a MicroPartition, no schema coercion applied
// Note that this all happens lazily because of the nature of MicroPartitions
// being a lazy-loading abstraction
let mp = read_parquet_into_micropartition(...);

let applied_schema = mp.schema().apply(schema_hints);
let mp = mp.cast_to_schema(&applied_schema);

The PR author replied:

Ah got it, made this change in the latest commit.

@@ -659,8 +661,18 @@ pub(crate) fn read_parquet_into_micropartition(
     let schemas = metadata
         .iter()
         .map(|m| {
-            let schema = infer_schema_with_options(m, &Some((*schema_infer_options).into()))?;
-            let daft_schema = daft_core::schema::Schema::try_from(&schema)?;
+            // if schema provided use schema, else use inferred schema
+            let daft_schema = match schema.as_ref() {
+                Some(s) => Schema {
+                    fields: s.fields.clone(),
+                },
+                None => {
+                    let inferred_schema =
+                        infer_schema_with_options(m, &Some((*schema_infer_options).into()))?;
+                    daft_core::schema::Schema::try_from(&inferred_schema)?
+                }
+            };

             DaftResult::Ok(daft_schema)
         })
         .collect::<DaftResult<Vec<_>>>()?;
2 changes: 2 additions & 0 deletions src/daft-micropartition/src/python.rs
@@ -438,6 +438,7 @@ impl PyMicroPartition {
         crate::micropartition::read_parquet_into_micropartition(
             [uri].as_ref(),
             columns.as_deref(),
+            None,
             start_offset,
             num_rows,
             row_groups.map(|rg| vec![Some(rg)]),

@@ -476,6 +477,7 @@
         crate::micropartition::read_parquet_into_micropartition(
             uris.as_ref(),
             columns.as_deref(),
+            None,
             start_offset,
             num_rows,
             row_groups,
18 changes: 4 additions & 14 deletions src/daft-plan/src/builder.rs
@@ -68,11 +68,8 @@ impl LogicalPlanBuilder {
         Ok(logical_plan.into())
     }

-    pub fn table_scan_with_scan_operator(
-        scan_operator: ScanOperatorRef,
-        schema_hint: Option<SchemaRef>,
-    ) -> DaftResult<Self> {
-        let schema = schema_hint.unwrap_or_else(|| scan_operator.0.schema());
+    pub fn table_scan_with_scan_operator(scan_operator: ScanOperatorRef) -> DaftResult<Self> {
+        let schema = scan_operator.0.schema();
         let partitioning_keys = scan_operator.0.partitioning_keys();
         let source_info =
             SourceInfo::ExternalInfo(ExternalSourceInfo::Scan(ScanExternalInfo::new(

@@ -298,15 +295,8 @@ impl PyLogicalPlanBuilder {
     }

     #[staticmethod]
-    pub fn table_scan_with_scan_operator(
-        scan_operator: ScanOperatorHandle,
-        schema_hint: Option<PySchema>,
-    ) -> PyResult<Self> {
-        Ok(LogicalPlanBuilder::table_scan_with_scan_operator(
-            scan_operator.into(),
-            schema_hint.map(|s| s.into()),
-        )?
-        .into())
+    pub fn table_scan_with_scan_operator(scan_operator: ScanOperatorHandle) -> PyResult<Self> {
+        Ok(LogicalPlanBuilder::table_scan_with_scan_operator(scan_operator.into())?.into())
     }

     #[staticmethod]