diff --git a/dp_creator_ii/templates/context.py b/dp_creator_ii/templates/context.py index b970a75..dc4e026 100644 --- a/dp_creator_ii/templates/context.py +++ b/dp_creator_ii/templates/context.py @@ -1,5 +1,5 @@ context = dp.Context.compositor( - data=pl.scan_csv(CSV_PATH), + data=pl.scan_csv(CSV_PATH, encoding="utf8-lossy"), privacy_unit=dp.unit_of(contributions=UNIT), privacy_loss=dp.loss_of(epsilon=LOSS), split_by_weights=WEIGHTS, diff --git a/dp_creator_ii/tests/test_csv.py b/dp_creator_ii/tests/test_csv.py new file mode 100644 index 0000000..bf59515 --- /dev/null +++ b/dp_creator_ii/tests/test_csv.py @@ -0,0 +1,44 @@ +import csv +import polars as pl +import polars.testing +import tempfile +import pytest + + +@pytest.mark.parametrize("encoding", ["latin1", "utf8"]) +def test_csv_loading(encoding): + """ + This isn't really a test of our code: rather, it demonstrates the pattern + we plan to follow. (Though if we do decide to require the encoding from + the user, or use chardet to sniff the encoding, that should be tested here.) + """ + with tempfile.NamedTemporaryFile(mode="w", newline="", encoding=encoding) as fp: + old_lf = pl.DataFrame({"NAME": ["André"], "AGE": [42]}).lazy() + + writer = csv.writer(fp) + writer.writerow(["NAME", "AGE"]) + for row in old_lf.collect().rows(): + writer.writerow(row) + fp.flush() + + # w/o "ignore_errors=True" it fails outright. 
+ # We could ignore_errors: + new_default_lf = pl.scan_csv(fp.name, ignore_errors=True) + if encoding == "utf8": + polars.testing.assert_frame_equal(old_lf, new_default_lf) + if encoding != "utf8": + polars.testing.assert_frame_not_equal(old_lf, new_default_lf) + assert new_default_lf.collect().rows()[0] == (None, 42) + + # But we retain more information with utf8-lossy: + new_lossy_lf = pl.scan_csv(fp.name, encoding="utf8-lossy") + if encoding == "utf8": + polars.testing.assert_frame_equal(old_lf, new_lossy_lf) + if encoding != "utf8": + polars.testing.assert_frame_not_equal(old_lf, new_lossy_lf) + assert new_lossy_lf.collect().rows()[0] == ("Andr�", 42) + # Even if the file has non-utf8 characters, + # they are probably not the only thing that distinguishes + # two strings that we want to group on. + # Besides grouping, we don't do much with strings, + # so this feels safe. diff --git a/dp_creator_ii/tests/test_template.py b/dp_creator_ii/tests/test_template.py index dd53b55..073c766 100644 --- a/dp_creator_ii/tests/test_template.py +++ b/dp_creator_ii/tests/test_template.py @@ -21,7 +21,7 @@ def test_fill_template(): } ) ) - assert f"data=pl.scan_csv('{fake_csv}')" in context_block + assert f"data=pl.scan_csv('{fake_csv}', encoding=\"utf8-lossy\")" in context_block def test_fill_template_unfilled_slots():