Skip to content

Commit

Permalink
Update DuckDB to ^v0.8.0 (#92)
Browse files Browse the repository at this point in the history
* update duckdb

* remove experimental setting which is now a default

* update read_csv_auto calls to expect headers and a comma delimiter

DuckDB 0.8.x is more permissive when reading CSVs, so it now requires more specific read options in order to raise exceptions for CSVs that have values but are missing headers.
  • Loading branch information
d33bs authored Aug 24, 2023
1 parent 0563891 commit 29464c1
Show file tree
Hide file tree
Showing 4 changed files with 57 additions and 59 deletions.
4 changes: 2 additions & 2 deletions cytotable/convert.py
Original file line number Diff line number Diff line change
Expand Up @@ -216,7 +216,7 @@ def _get_table_chunk_offsets(
_duckdb_reader()
.execute(
# nosec
f"SELECT COUNT(*) from read_csv_auto('{source_path}')"
f"SELECT COUNT(*) from read_csv_auto('{source_path}', header=TRUE, delim=',')"
if source_type == ".csv"
else f"SELECT COUNT(*) from sqlite_scan('{source_path}', '{table_name}')"
)
Expand Down Expand Up @@ -299,7 +299,7 @@ def _source_chunk_to_parquet(
# build output query and filepath base
# (chunked output will append offset to keep output paths unique)
if str(AnyPath(source["source_path"]).suffix).lower() == ".csv":
base_query = f"SELECT {select_columns} FROM read_csv_auto('{str(source['source_path'])}')"
base_query = f"SELECT {select_columns} FROM read_csv_auto('{str(source['source_path'])}', header=TRUE, delim=',')"
result_filepath_base = f"{source_dest_path}/{str(source['source_path'].stem)}"

elif str(AnyPath(source["source_path"]).suffix).lower() == ".sqlite":
Expand Down
7 changes: 0 additions & 7 deletions cytotable/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -192,13 +192,6 @@ def _duckdb_reader() -> duckdb.DuckDBPyConnection:
https://duckdb.org/docs/sql/configuration#configuration-reference
*/
PRAGMA preserve_insertion_order=FALSE;
/*
Allow parallel csv reads for performance increase possibilities
See the following for more information:
https://duckdb.org/docs/sql/configuration#configuration-reference
*/
PRAGMA experimental_parallel_csv=TRUE;
""",
)

Expand Down
103 changes: 54 additions & 49 deletions poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ packages = [{include = "cytotable"}]
python = ">=3.8,<3.13"
pyarrow = "^12.0.0"
cloudpathlib = {extras = ["all"], version = "^0.13.0"}
duckdb = "^0.7.0"
duckdb = "^0.8.0"
parsl = "^2023.4.24"

[tool.poetry.dev-dependencies]
Expand Down

0 comments on commit 29464c1

Please sign in to comment.