[FEAT] Support disabling using doubled quotes to escape in CSV (#1544)

Closes #1507. This PR adds the `double_quote_escape: bool` option to the `daft.read_csv` and `daft.read_csv_schema` functions. This allows the disabling of using double quotes i.e. `""` to escape quotes within a CSV file being read by Daft. ## Remaining Tasks - [x] Write tests - [x] Review further interfaces (I'm not sure if those two functions cover all CSV use-cases)
Eventual-Inc · Oct 31, 2023 · 52b806e · 52b806e
1 parent 9248c5b
commit 52b806e
Show file tree

Hide file tree

Showing 15 changed files with 271 additions and 27 deletions.
diff --git a/daft/daft.pyi b/daft/daft.pyi
@@ -190,13 +190,15 @@ class CsvSourceConfig:
 
     delimiter: str
     has_headers: bool
+    double_quote: bool
     buffer_size: int | None
     chunk_size: int | None
 
     def __init__(
         self,
         delimiter: str,
         has_headers: bool,
+        double_quote: bool,
         buffer_size: int | None = None,
         chunk_size: int | None = None,
     ): ...
@@ -432,6 +434,7 @@ def read_csv(
     num_rows: int | None = None,
     has_header: bool | None = None,
     delimiter: str | None = None,
+    double_quote: bool | None = None,
     io_config: IOConfig | None = None,
     multithreaded_io: bool | None = None,
     schema: PySchema | None = None,
@@ -442,6 +445,7 @@ def read_csv_schema(
     uri: str,
     has_header: bool | None = None,
     delimiter: str | None = None,
+    double_quote: bool | None = None,
     io_config: IOConfig | None = None,
     multithreaded_io: bool | None = None,
 ): ...
@@ -778,6 +782,7 @@ class PyMicroPartition:
         num_rows: int | None = None,
         has_header: bool | None = None,
         delimiter: str | None = None,
+        double_quote: bool | None = None,
         io_config: IOConfig | None = None,
         multithreaded_io: bool | None = None,
         schema: PySchema | None = None,

diff --git a/daft/execution/execution_step.py b/daft/execution/execution_step.py
@@ -373,6 +373,7 @@ def _handle_tabular_files_scan(
                         csv_options=TableParseCSVOptions(
                             delimiter=format_config.delimiter,
                             header_index=0 if format_config.has_headers else None,
+                            double_quote=format_config.double_quote,
                             buffer_size=format_config.buffer_size,
                             chunk_size=format_config.chunk_size,
                         ),

diff --git a/daft/io/_csv.py b/daft/io/_csv.py
@@ -23,6 +23,7 @@ def read_csv(
     has_headers: bool = True,
     column_names: Optional[List[str]] = None,
     delimiter: str = ",",
+    double_quote: bool = True,
     io_config: Optional["IOConfig"] = None,
     use_native_downloader: bool = True,
     _buffer_size: Optional[int] = None,
@@ -42,6 +43,7 @@ def read_csv(
             disable all schema inference on data being read, and throw an error if data being read is incompatible.
         has_headers (bool): Whether the CSV has a header or not, defaults to True
         delimiter (Str): Delimiter used in the CSV, defaults to ","
+        doubled_quote_escape (bool): Whether to support double quote escapes, defaults to True
         io_config (IOConfig): Config to be used with the native downloader
         use_native_downloader: Whether to use the native downloader instead of PyArrow for reading Parquet. This
             is currently experimental.
@@ -62,6 +64,7 @@ def read_csv(
     csv_config = CsvSourceConfig(
         delimiter=delimiter,
         has_headers=has_headers,
+        double_quote=double_quote,
         buffer_size=_buffer_size,
         chunk_size=_chunk_size,
     )

diff --git a/daft/logical/schema.py b/daft/logical/schema.py
@@ -166,6 +166,7 @@ def from_csv(
         path: str,
         has_header: bool | None = None,
         delimiter: str | None = None,
+        double_quote: bool | None = None,
         io_config: IOConfig | None = None,
         multithreaded_io: bool | None = None,
     ) -> Schema:
@@ -174,6 +175,7 @@ def from_csv(
                 uri=path,
                 has_header=has_header,
                 delimiter=delimiter,
+                double_quote=double_quote,
                 io_config=io_config,
                 multithreaded_io=multithreaded_io,
             )

diff --git a/daft/runners/partitioning.py b/daft/runners/partitioning.py
@@ -44,12 +44,14 @@ class TableParseCSVOptions:
     Args:
         delimiter: The delimiter to use when parsing CSVs, defaults to ","
         header_index: Index of the header row, or None if no header
+        double_quote: Whether to support escaping quotes by doubling them, defaults to True
         buffer_size: Size of the buffer (in bytes) used by the streaming reader.
         chunk_size: Size of the chunks (in bytes) deserialized in parallel by the streaming reader.
     """
 
     delimiter: str = ","
     header_index: int | None = 0
+    double_quote: bool = True
     buffer_size: int | None = None
     chunk_size: int | None = None
 

diff --git a/daft/table/micropartition.py b/daft/table/micropartition.py
@@ -351,6 +351,7 @@ def read_csv(
         num_rows: int | None = None,
         has_header: bool | None = None,
         delimiter: str | None = None,
+        double_quote: bool | None = None,
         io_config: IOConfig | None = None,
         multithreaded_io: bool | None = None,
         schema: Schema | None = None,
@@ -365,6 +366,7 @@ def read_csv(
                 num_rows=num_rows,
                 has_header=has_header,
                 delimiter=delimiter,
+                double_quote=double_quote,
                 io_config=io_config,
                 multithreaded_io=multithreaded_io,
                 schema=schema._schema if schema is not None else None,

diff --git a/daft/table/table.py b/daft/table/table.py
@@ -448,6 +448,7 @@ def read_csv(
         num_rows: int | None = None,
         has_header: bool | None = None,
         delimiter: str | None = None,
+        double_quote: bool | None = None,
         io_config: IOConfig | None = None,
         multithreaded_io: bool | None = None,
         schema: Schema | None = None,
@@ -462,6 +463,7 @@ def read_csv(
                 num_rows=num_rows,
                 has_header=has_header,
                 delimiter=delimiter,
+                double_quote=double_quote,
                 io_config=io_config,
                 multithreaded_io=multithreaded_io,
                 schema=schema._schema if schema is not None else None,

diff --git a/daft/table/table_io.py b/daft/table/table_io.py
@@ -222,6 +222,7 @@ def read_csv(
                 num_rows=read_options.num_rows,
                 has_header=has_header,
                 delimiter=csv_options.delimiter,
+                double_quote=csv_options.double_quote,
                 io_config=config.io_config,
                 schema=schema,
                 buffer_size=csv_options.buffer_size,