Skip to content

Commit

Permalink
Handle non-utf8 CSVs (#29)
Browse files Browse the repository at this point in the history
* add a failing test of latin-1 CSV loading

* Propose that we just tolerate bad encodings

* and actually use the technique we have chosen

* delete_on_close is a newer python feature; flush instead of closing

* Add an example of ignore_errors
  • Loading branch information
mccalluc authored Oct 2, 2024
1 parent bb4aca4 commit a8fd36c
Show file tree
Hide file tree
Showing 3 changed files with 46 additions and 2 deletions.
2 changes: 1 addition & 1 deletion dp_creator_ii/templates/context.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
context = dp.Context.compositor(
data=pl.scan_csv(CSV_PATH),
data=pl.scan_csv(CSV_PATH, encoding="utf8-lossy"),
privacy_unit=dp.unit_of(contributions=UNIT),
privacy_loss=dp.loss_of(epsilon=LOSS),
split_by_weights=WEIGHTS,
Expand Down
44 changes: 44 additions & 0 deletions dp_creator_ii/tests/test_csv.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
import csv
import polars as pl
import polars.testing
import tempfile
import pytest


@pytest.mark.parametrize("encoding", ["latin1", "utf8"])
def test_csv_loading(encoding):
    """
    Demonstrates the CSV-loading pattern we intend to adopt rather than
    testing our own code. (Should we later require the user to supply an
    encoding, or sniff it with chardet, that behavior belongs in this test.)
    """
    with tempfile.NamedTemporaryFile(mode="w", newline="", encoding=encoding) as fp:
        expected_lf = pl.DataFrame({"NAME": ["André"], "AGE": [42]}).lazy()

        writer = csv.writer(fp)
        writer.writerow(["NAME", "AGE"])
        writer.writerows(expected_lf.collect().rows())
        fp.flush()

        # Without "ignore_errors=True" the scan fails outright on bad bytes.
        # One option is simply ignoring the errors:
        default_lf = pl.scan_csv(fp.name, ignore_errors=True)
        if encoding == "utf8":
            polars.testing.assert_frame_equal(expected_lf, default_lf)
        else:
            polars.testing.assert_frame_not_equal(expected_lf, default_lf)
            assert default_lf.collect().rows()[0] == (None, 42)

        # But utf8-lossy preserves more of the original data:
        lossy_lf = pl.scan_csv(fp.name, encoding="utf8-lossy")
        if encoding == "utf8":
            polars.testing.assert_frame_equal(expected_lf, lossy_lf)
        else:
            polars.testing.assert_frame_not_equal(expected_lf, lossy_lf)
            assert lossy_lf.collect().rows()[0] == ("Andr�", 42)
            # If the file contains non-utf8 characters at all, those are
            # probably not the only thing distinguishing two strings we
            # would group on. Besides grouping, we do little else with
            # strings, so this feels safe.
2 changes: 1 addition & 1 deletion dp_creator_ii/tests/test_template.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ def test_fill_template():
}
)
)
assert f"data=pl.scan_csv('{fake_csv}')" in context_block
assert f"data=pl.scan_csv('{fake_csv}', encoding=\"utf8-lossy\")" in context_block


def test_fill_template_unfilled_slots():
Expand Down

0 comments on commit a8fd36c

Please sign in to comment.