Skip to content

Commit

Permalink
[FEAT] Support disabling using doubled quotes to escape in CSV (#1544)
Browse files Browse the repository at this point in the history
Closes #1507.

This PR adds the `double_quote_escape: bool` option to the
`daft.read_csv` and `daft.read_csv_schema` functions. This allows the
disabling of using double quotes i.e. `""` to escape quotes within a CSV
file being read by Daft.

## Remaining Tasks

- [x] Write tests
- [x] Review further interfaces (I'm not sure if those two functions
cover all CSV use-cases)
  • Loading branch information
ravern authored Oct 31, 2023
1 parent 9248c5b commit 52b806e
Show file tree
Hide file tree
Showing 15 changed files with 271 additions and 27 deletions.
5 changes: 5 additions & 0 deletions daft/daft.pyi
Original file line number Diff line number Diff line change
Expand Up @@ -190,13 +190,15 @@ class CsvSourceConfig:

delimiter: str
has_headers: bool
double_quote: bool
buffer_size: int | None
chunk_size: int | None

def __init__(
self,
delimiter: str,
has_headers: bool,
double_quote: bool,
buffer_size: int | None = None,
chunk_size: int | None = None,
): ...
Expand Down Expand Up @@ -432,6 +434,7 @@ def read_csv(
num_rows: int | None = None,
has_header: bool | None = None,
delimiter: str | None = None,
double_quote: bool | None = None,
io_config: IOConfig | None = None,
multithreaded_io: bool | None = None,
schema: PySchema | None = None,
Expand All @@ -442,6 +445,7 @@ def read_csv_schema(
uri: str,
has_header: bool | None = None,
delimiter: str | None = None,
double_quote: bool | None = None,
io_config: IOConfig | None = None,
multithreaded_io: bool | None = None,
): ...
Expand Down Expand Up @@ -778,6 +782,7 @@ class PyMicroPartition:
num_rows: int | None = None,
has_header: bool | None = None,
delimiter: str | None = None,
double_quote: bool | None = None,
io_config: IOConfig | None = None,
multithreaded_io: bool | None = None,
schema: PySchema | None = None,
Expand Down
1 change: 1 addition & 0 deletions daft/execution/execution_step.py
Original file line number Diff line number Diff line change
Expand Up @@ -373,6 +373,7 @@ def _handle_tabular_files_scan(
csv_options=TableParseCSVOptions(
delimiter=format_config.delimiter,
header_index=0 if format_config.has_headers else None,
double_quote=format_config.double_quote,
buffer_size=format_config.buffer_size,
chunk_size=format_config.chunk_size,
),
Expand Down
3 changes: 3 additions & 0 deletions daft/io/_csv.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ def read_csv(
has_headers: bool = True,
column_names: Optional[List[str]] = None,
delimiter: str = ",",
double_quote: bool = True,
io_config: Optional["IOConfig"] = None,
use_native_downloader: bool = True,
_buffer_size: Optional[int] = None,
Expand All @@ -42,6 +43,7 @@ def read_csv(
disable all schema inference on data being read, and throw an error if data being read is incompatible.
has_headers (bool): Whether the CSV has a header or not, defaults to True
delimiter (Str): Delimiter used in the CSV, defaults to ","
doubled_quote_escape (bool): Whether to support double quote escapes, defaults to True
io_config (IOConfig): Config to be used with the native downloader
use_native_downloader: Whether to use the native downloader instead of PyArrow for reading Parquet. This
is currently experimental.
Expand All @@ -62,6 +64,7 @@ def read_csv(
csv_config = CsvSourceConfig(
delimiter=delimiter,
has_headers=has_headers,
double_quote=double_quote,
buffer_size=_buffer_size,
chunk_size=_chunk_size,
)
Expand Down
2 changes: 2 additions & 0 deletions daft/logical/schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -166,6 +166,7 @@ def from_csv(
path: str,
has_header: bool | None = None,
delimiter: str | None = None,
double_quote: bool | None = None,
io_config: IOConfig | None = None,
multithreaded_io: bool | None = None,
) -> Schema:
Expand All @@ -174,6 +175,7 @@ def from_csv(
uri=path,
has_header=has_header,
delimiter=delimiter,
double_quote=double_quote,
io_config=io_config,
multithreaded_io=multithreaded_io,
)
Expand Down
2 changes: 2 additions & 0 deletions daft/runners/partitioning.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,12 +44,14 @@ class TableParseCSVOptions:
Args:
delimiter: The delimiter to use when parsing CSVs, defaults to ","
header_index: Index of the header row, or None if no header
double_quote: Whether to support escaping quotes by doubling them, defaults to True
buffer_size: Size of the buffer (in bytes) used by the streaming reader.
chunk_size: Size of the chunks (in bytes) deserialized in parallel by the streaming reader.
"""

delimiter: str = ","
header_index: int | None = 0
double_quote: bool = True
buffer_size: int | None = None
chunk_size: int | None = None

Expand Down
2 changes: 2 additions & 0 deletions daft/table/micropartition.py
Original file line number Diff line number Diff line change
Expand Up @@ -351,6 +351,7 @@ def read_csv(
num_rows: int | None = None,
has_header: bool | None = None,
delimiter: str | None = None,
double_quote: bool | None = None,
io_config: IOConfig | None = None,
multithreaded_io: bool | None = None,
schema: Schema | None = None,
Expand All @@ -365,6 +366,7 @@ def read_csv(
num_rows=num_rows,
has_header=has_header,
delimiter=delimiter,
double_quote=double_quote,
io_config=io_config,
multithreaded_io=multithreaded_io,
schema=schema._schema if schema is not None else None,
Expand Down
2 changes: 2 additions & 0 deletions daft/table/table.py
Original file line number Diff line number Diff line change
Expand Up @@ -448,6 +448,7 @@ def read_csv(
num_rows: int | None = None,
has_header: bool | None = None,
delimiter: str | None = None,
double_quote: bool | None = None,
io_config: IOConfig | None = None,
multithreaded_io: bool | None = None,
schema: Schema | None = None,
Expand All @@ -462,6 +463,7 @@ def read_csv(
num_rows=num_rows,
has_header=has_header,
delimiter=delimiter,
double_quote=double_quote,
io_config=io_config,
multithreaded_io=multithreaded_io,
schema=schema._schema if schema is not None else None,
Expand Down
1 change: 1 addition & 0 deletions daft/table/table_io.py
Original file line number Diff line number Diff line change
Expand Up @@ -222,6 +222,7 @@ def read_csv(
num_rows=read_options.num_rows,
has_header=has_header,
delimiter=csv_options.delimiter,
double_quote=csv_options.double_quote,
io_config=config.io_config,
schema=schema,
buffer_size=csv_options.buffer_size,
Expand Down
Loading

0 comments on commit 52b806e

Please sign in to comment.