-
Notifications
You must be signed in to change notification settings - Fork 175
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
[FEAT] [Native I/O] Add a native CSV reader. (#1475)
This PR adds a simple native CSV reader. Local reads are fully async, while remote object store reads currently bulk-download each file into a byte buffer; streaming remote read support will be added in a follow-up PR. **NOTE:** This PR required some changes to Arrow2's async CSV reading machinery, namely: 1. Schema inference was broken for headerless CSV files ([commit](Eventual-Inc/arrow2@a14e8c7)). 2. Type inference was broken for CSV columns that contain nulls ([commit](Eventual-Inc/arrow2@065a31d)). The requisite changes are contained on this branch: https://github.com/jorgecarleitao/arrow2/compare/main...Eventual-Inc:arrow2:clark/async-csv-fixes?expand=1 ## TODOs (follow-up PRs) - [ ] Add streaming remote reads - [ ] Parallelize column/chunk deserialization Closes #1462
- Loading branch information
1 parent
278e7cd
commit 553a911
Showing
20 changed files
with
879 additions
and
146 deletions.
There are no files selected for viewing
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,28 @@ | ||
[dependencies] | ||
arrow2 = {workspace = true, features = ["io_csv", "io_csv_async"]} | ||
async-compat = {workspace = true} | ||
async-stream = {workspace = true} | ||
bytes = {workspace = true} | ||
common-error = {path = "../common/error", default-features = false} | ||
csv-async = "1.2.6" | ||
daft-core = {path = "../daft-core", default-features = false} | ||
daft-io = {path = "../daft-io", default-features = false} | ||
daft-table = {path = "../daft-table", default-features = false} | ||
futures = {workspace = true} | ||
log = {workspace = true} | ||
pyo3 = {workspace = true, optional = true} | ||
pyo3-log = {workspace = true, optional = true} | ||
rayon = {workspace = true} | ||
snafu = {workspace = true} | ||
tokio = {workspace = true} | ||
tokio-stream = {workspace = true} | ||
tokio-util = {workspace = true} | ||
|
||
[features] | ||
default = ["python"] | ||
python = ["dep:pyo3", "dep:pyo3-log", "common-error/python", "daft-core/python", "daft-io/python", "daft-table/python"] | ||
|
||
[package] | ||
edition = {workspace = true} | ||
name = "daft-csv" | ||
version = {workspace = true} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,34 @@ | ||
#![feature(async_closure)] | ||
#![feature(let_chains)] | ||
use common_error::DaftError; | ||
use snafu::Snafu; | ||
|
||
pub mod metadata; | ||
#[cfg(feature = "python")] | ||
pub mod python; | ||
pub mod read; | ||
#[cfg(feature = "python")] | ||
pub use python::register_modules; | ||
|
||
#[derive(Debug, Snafu)] | ||
pub enum Error { | ||
#[snafu(display("{source}"))] | ||
IOError { source: daft_io::Error }, | ||
#[snafu(display("{source}"))] | ||
CSVError { source: csv_async::Error }, | ||
} | ||
|
||
impl From<Error> for DaftError { | ||
fn from(err: Error) -> DaftError { | ||
match err { | ||
Error::IOError { source } => source.into(), | ||
_ => DaftError::External(err.into()), | ||
} | ||
} | ||
} | ||
|
||
impl From<daft_io::Error> for Error { | ||
fn from(err: daft_io::Error) -> Self { | ||
Error::IOError { source: err } | ||
} | ||
} |
Oops, something went wrong.