-
Notifications
You must be signed in to change notification settings - Fork 175
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
[FEAT] Anonymous Scan Operator (#1526)
- Loading branch information
Showing 4 changed files with 180 additions and 8 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,90 @@ | ||
use std::fmt::Display; | ||
|
||
use common_error::DaftResult; | ||
use daft_core::schema::SchemaRef; | ||
|
||
use crate::{DataFileSource, FileType, ScanOperator, ScanOperatorRef, ScanTask}; | ||
#[derive(Debug)] | ||
pub struct AnonymousScanOperator { | ||
schema: SchemaRef, | ||
file_type: FileType, | ||
files: Vec<String>, | ||
columns_to_select: Option<Vec<String>>, | ||
limit: Option<usize>, | ||
} | ||
|
||
impl AnonymousScanOperator { | ||
pub fn new(schema: SchemaRef, file_type: FileType, files: Vec<String>) -> Self { | ||
Self { | ||
schema, | ||
file_type, | ||
files, | ||
columns_to_select: None, | ||
limit: None, | ||
} | ||
} | ||
} | ||
|
||
impl Display for AnonymousScanOperator { | ||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { | ||
write!(f, "{:#?}", self) | ||
} | ||
} | ||
|
||
impl ScanOperator for AnonymousScanOperator { | ||
fn schema(&self) -> SchemaRef { | ||
self.schema.clone() | ||
} | ||
|
||
fn partitioning_keys(&self) -> &[daft_core::datatypes::Field] { | ||
&[] | ||
} | ||
|
||
fn num_partitions(&self) -> common_error::DaftResult<usize> { | ||
Ok(self.files.len()) | ||
} | ||
|
||
fn select(self: Box<Self>, columns: &[&str]) -> common_error::DaftResult<ScanOperatorRef> { | ||
for c in columns { | ||
if self.schema.get_field(c).is_err() { | ||
return Err(common_error::DaftError::FieldNotFound(format!( | ||
"{c} not found in {:?}", | ||
self.columns_to_select | ||
))); | ||
} | ||
} | ||
let mut to_rtn = self; | ||
to_rtn.columns_to_select = Some(columns.iter().map(|s| s.to_string()).collect()); | ||
Ok(to_rtn) | ||
} | ||
|
||
fn limit(self: Box<Self>, num: usize) -> DaftResult<ScanOperatorRef> { | ||
let mut to_rtn = self; | ||
to_rtn.limit = Some(num); | ||
Ok(to_rtn) | ||
} | ||
|
||
fn filter(self: Box<Self>, _predicate: &daft_dsl::Expr) -> DaftResult<(bool, ScanOperatorRef)> { | ||
Ok((false, self)) | ||
} | ||
|
||
fn to_scan_tasks( | ||
self: Box<Self>, | ||
) -> DaftResult<Box<dyn Iterator<Item = DaftResult<crate::ScanTask>>>> { | ||
let iter = self.files.clone().into_iter().map(move |f| { | ||
let source = DataFileSource::AnonymousDataFile { | ||
file_type: self.file_type, | ||
path: f, | ||
metadata: None, | ||
partition_spec: None, | ||
statistics: None, | ||
}; | ||
Ok(ScanTask { | ||
source, | ||
columns: self.columns_to_select.clone(), | ||
limit: self.limit, | ||
}) | ||
}); | ||
Ok(Box::new(iter)) | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,46 @@ | ||
use pyo3::prelude::*; | ||
|
||
pub mod pylib { | ||
use pyo3::prelude::*; | ||
use std::str::FromStr; | ||
|
||
use daft_core::python::schema::PySchema; | ||
|
||
use pyo3::pyclass; | ||
|
||
use crate::anonymous::AnonymousScanOperator; | ||
use crate::FileType; | ||
use crate::ScanOperatorRef; | ||
|
||
#[pyclass(module = "daft.daft", frozen)] | ||
pub(crate) struct ScanOperator { | ||
scan_op: ScanOperatorRef, | ||
} | ||
|
||
#[pymethods] | ||
impl ScanOperator { | ||
pub fn __repr__(&self) -> PyResult<String> { | ||
Ok(format!("{}", self.scan_op)) | ||
} | ||
|
||
#[staticmethod] | ||
pub fn anonymous_scan( | ||
schema: PySchema, | ||
file_type: &str, | ||
files: Vec<String>, | ||
) -> PyResult<Self> { | ||
let schema = schema.schema; | ||
let operator = Box::new(AnonymousScanOperator::new( | ||
schema, | ||
FileType::from_str(file_type)?, | ||
files, | ||
)); | ||
Ok(ScanOperator { scan_op: operator }) | ||
} | ||
} | ||
} | ||
|
||
pub fn register_modules(_py: Python, parent: &PyModule) -> PyResult<()> { | ||
parent.add_class::<pylib::ScanOperator>()?; | ||
Ok(()) | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters