Skip to content

Commit

Permalink
Makes delimiter, quote, escape, and comment optional all the way from…
Browse files Browse the repository at this point in the history
… Python through to the reader
  • Loading branch information
subygan committed Nov 16, 2023
1 parent c6cc179 commit 7c27ded
Show file tree
Hide file tree
Showing 7 changed files with 21 additions and 40 deletions.
8 changes: 4 additions & 4 deletions daft/io/_csv.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,11 +22,11 @@ def read_csv(
schema_hints: Optional[Dict[str, DataType]] = None,
has_headers: bool = True,
column_names: Optional[List[str]] = None,
delimiter: str = ',',
delimiter: Optional[str] = None,
double_quote: bool = True,
quote: str = '\"',
escape_char: str = '\"',
comment: str = "#",
quote: Optional[str] = None,
escape_char: Optional[str] = None,
comment: Optional[str] = None,
io_config: Optional["IOConfig"] = None,
use_native_downloader: bool = True,
_buffer_size: Optional[int] = None,
Expand Down
23 changes: 2 additions & 21 deletions src/daft-csv/test/iris_tiny_escape.csv
Original file line number Diff line number Diff line change
@@ -1,21 +1,2 @@
"sepal.\"length\"","sepal.width","petal.length","petal.width","variety"
5.1,3.5,1.4,.2,"Setosa"
4.9,3,1.4,.2,"Setosa"
4.7,3.2,1.3,.2,"Setosa"
4.6,3.1,1.5,.2,"Se\"to\"sa"
5,3.6,1.4,.2,"Seto\"\"sa"
5.4,3.9,1.7,.4,"Setosa"
4.6,3.4,1.4,.3,"Setosa"
5,3.4,1.5,.2,"Setosa"
4.4,2.9,1.4,.2,"Setosa"
4.9,3.1,1.5,.1,"Setosa"
5.4,3.7,1.5,.2,"Setosa"
4.8,3.4,1.6,.2,"Setosa"
4.8,3,1.4,.1,"Setosa"
4.3,3,1.1,.1,"Setosa"
5.8,4,1.2,.2,"Setosa"
5.7,4.4,1.5,.4,"Setosa"
5.4,3.9,1.3,.4,"Setosa"
5.1,3.5,1.4,.3,"Setosa"
5.7,3.8,1.7,.3,"Setosa"
5.1,3.8,1.5,.3,"Setosa"
"date32","date64","timestamp_s","timestamp_ms","timestamp_us","timestamp_s_utc_tz","timestamp_ms_utc_tz","timestamp_us_utc_tz","timestamp_s_tz","timestamp_ms_tz","timestamp_us_tz"
1970-01-02,1970-01-01,1970-01-01 00:00:01,1970-01-01 00:00:00.001,1970-01-01 00:00:00.000001,1970-01-01 00:00:01Z,1970-01-01 00:00:00.001Z,1970-01-01 00:00:00.000001Z,1970-01-01 07:30:01+0730,1970-01-01 07:30:00.001+0730,1970-01-01 07:30:00.000001+0730
6 changes: 3 additions & 3 deletions src/daft-micropartition/src/micropartition.rs
Original file line number Diff line number Diff line change
Expand Up @@ -31,10 +31,10 @@ pub(crate) enum TableState {
Loaded(Arc<Vec<Table>>),
}

pub fn char_to_byte(char_val: char) -> Result<Option<u8>, Error> {
pub fn char_to_byte(char_val: Option<char>) -> Result<Option<u8>, Error> {

match u8::try_from(char_val){
Err(_e) => Err(Error::WrongChar{val: char_val}),
match u8::try_from(char_val.unwrap()){
Err(_e) => Err(Error::WrongChar{val: char_val.unwrap()}),
Ok(char_val) => Ok(Some(char_val)),
}
}
Expand Down
2 changes: 1 addition & 1 deletion src/daft-micropartition/src/python.rs
Original file line number Diff line number Diff line change
Expand Up @@ -631,7 +631,7 @@ pub(crate) fn read_csv_into_py_table(
py: Python,
uri: &str,
has_header: bool,
delimiter: char,
delimiter: Option<char>,
double_quote: bool,
schema: PySchema,
storage_config: PyStorageConfig,
Expand Down
16 changes: 8 additions & 8 deletions src/daft-scan/src/file_format.rs
Original file line number Diff line number Diff line change
Expand Up @@ -106,12 +106,12 @@ impl_bincode_py_state_serialization!(ParquetSourceConfig);
#[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize, Hash)]
#[cfg_attr(feature = "python", pyclass(module = "daft.daft", get_all))]
pub struct CsvSourceConfig {
pub delimiter: char,
pub delimiter: Option<char>,
pub has_headers: bool,
pub double_quote: bool,
pub quote: char,
pub escape_char: char,
pub comment: char,
pub quote: Option<char>,
pub escape_char: Option<char>,
pub comment: Option<char>,
pub buffer_size: Option<usize>,
pub chunk_size: Option<usize>,
}
Expand All @@ -129,12 +129,12 @@ impl CsvSourceConfig {
/// * `chunk_size` - Size of the chunks (in bytes) deserialized in parallel by the streaming reader.
#[new]
fn new(
delimiter: char,
delimiter: Option<char>,
has_headers: bool,
double_quote: bool,
quote: char,
escape_char: char,
comment: char,
quote: Option<char>,
escape_char: Option<char>,
comment: Option<char>,
buffer_size: Option<usize>,
chunk_size: Option<usize>,
) -> PyResult<Self> {
Expand Down
4 changes: 2 additions & 2 deletions src/daft-scan/src/glob.rs
Original file line number Diff line number Diff line change
Expand Up @@ -85,9 +85,9 @@ fn run_glob(
Ok(Box::new(iterator))
}

fn char_to_byte(char_val: char) -> Result<Option<u8>, DaftError> {
fn char_to_byte(char_val: Option<char>) -> Result<Option<u8>, DaftError> {

match u8::try_from(char_val){
match u8::try_from(char_val.unwrap()){
Err(_e) => Err(DaftError::ValueError(format!(
"character is not valid : {:?}",
char_val
Expand Down
2 changes: 1 addition & 1 deletion tests/table/table_io/test_csv.py
Original file line number Diff line number Diff line change
Expand Up @@ -314,7 +314,7 @@ def test_csv_read_data_custom_escape(use_native_downloader):
expected = Table.from_pydict(
{
"id": [1,2, 3],
"data": ["a\"a\"a",'aa','aa'],
"data": ['a\"a\"a','aa','aa'],
}
)
table = table_io.read_csv(
Expand Down

0 comments on commit 7c27ded

Please sign in to comment.