Propose that we just tolerate bad encodings

opendp · Oct 1, 2024 · d21fd26 · d21fd26
1 parent 7c84659
commit d21fd26
Showing 1 changed file with 13 additions and 8 deletions.
diff --git a/dp_creator_ii/tests/test_csv.py b/dp_creator_ii/tests/test_csv.py
@@ -8,19 +8,24 @@
 @pytest.mark.parametrize("encoding", ["latin1", "utf8"])
 def test_csv_loading(encoding):
     with tempfile.NamedTemporaryFile(
-        delete=False, mode="w", newline="", encoding=encoding
+        delete_on_close=False, mode="w", newline="", encoding=encoding
     ) as fp:
-        # By default, would delete file on "close()";
-        # With "delete=False", clean up when exiting "with" instead.
         old_lf = pl.DataFrame({"NAME": ["André"], "AGE": [42]}).lazy()
 
-        writer = csv.writer(
-            fp,
-        )
+        writer = csv.writer(fp)
         writer.writerow(["NAME", "AGE"])
         for row in old_lf.collect().rows():
             writer.writerow(row)
         fp.close()
 
-        new_lf = pl.scan_csv(fp.name)
-        polars.testing.assert_frame_equal(old_lf, new_lf)
+        new_lf = pl.scan_csv(fp.name, encoding="utf8-lossy")
+        if encoding == "utf8":
+            polars.testing.assert_frame_equal(old_lf, new_lf)
+        if encoding != "utf8":
+            polars.testing.assert_frame_not_equal(old_lf, new_lf)
+            assert new_lf.collect().rows()[0] == ("Andr�", 42)
+            # If the file even has non-utf8 characters,
+            # they are probably not the only thing that distinguishes
+            # two strings that we want to group on.
+            # Besides grouping, we don't do much with strings,
+            # so this feels safe.