Skip to content

Commit

Permalink
Handle non-utf8 CSVs (#29)
Browse files Browse the repository at this point in the history
* add a failing test of latin-1 CSV loading

* Propose that we just tolerate bad encodings

* and actually use the technique we have chosen

* delete_on_close is a newer python feature; flush instead of closing

* Add an example of ignore_errors
  • Loading branch information
mccalluc authored Oct 2, 2024
1 parent bb4aca4 commit a8fd36c
Show file tree
Hide file tree
Showing 3 changed files with 46 additions and 2 deletions.
2 changes: 1 addition & 1 deletion dp_creator_ii/templates/context.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
context = dp.Context.compositor(
data=pl.scan_csv(CSV_PATH),
data=pl.scan_csv(CSV_PATH, encoding="utf8-lossy"),
privacy_unit=dp.unit_of(contributions=UNIT),
privacy_loss=dp.loss_of(epsilon=LOSS),
split_by_weights=WEIGHTS,
Expand Down
44 changes: 44 additions & 0 deletions dp_creator_ii/tests/test_csv.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
import csv
import polars as pl
import polars.testing
import tempfile
import pytest


@pytest.mark.parametrize("encoding", ["latin1", "utf8"])
def test_csv_loading(encoding):
    """
    Demonstrates the CSV-loading pattern we intend to adopt rather than
    testing our own code. (Should we later require the user to supply an
    encoding, or sniff it with chardet, that behavior belongs in this test.)
    """
    with tempfile.NamedTemporaryFile(mode="w", newline="", encoding=encoding) as fp:
        expected_lf = pl.DataFrame({"NAME": ["André"], "AGE": [42]}).lazy()

        writer = csv.writer(fp)
        writer.writerow(["NAME", "AGE"])
        writer.writerows(expected_lf.collect().rows())
        fp.flush()

        # Without "ignore_errors=True" the scan fails outright on bad bytes.
        # One option is simply ignoring the errors:
        default_lf = pl.scan_csv(fp.name, ignore_errors=True)
        if encoding == "utf8":
            polars.testing.assert_frame_equal(expected_lf, default_lf)
        else:
            polars.testing.assert_frame_not_equal(expected_lf, default_lf)
            assert default_lf.collect().rows()[0] == (None, 42)

        # But utf8-lossy preserves more of the original data:
        lossy_lf = pl.scan_csv(fp.name, encoding="utf8-lossy")
        if encoding == "utf8":
            polars.testing.assert_frame_equal(expected_lf, lossy_lf)
        else:
            polars.testing.assert_frame_not_equal(expected_lf, lossy_lf)
            assert lossy_lf.collect().rows()[0] == ("Andr�", 42)
            # If the file contains non-utf8 characters at all, those are
            # probably not the only thing distinguishing two strings we
            # would group on. Besides grouping, we do little else with
            # strings, so this feels safe.
2 changes: 1 addition & 1 deletion dp_creator_ii/tests/test_template.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ def test_fill_template():
}
)
)
assert f"data=pl.scan_csv('{fake_csv}')" in context_block
assert f"data=pl.scan_csv('{fake_csv}', encoding=\"utf8-lossy\")" in context_block


def test_fill_template_unfilled_slots():
Expand Down

0 comments on commit a8fd36c

Please sign in to comment.