Skip to content

Commit

Permalink
Propose that we just tolerate bad encodings
Browse files Browse the repository at this point in the history
  • Loading branch information
mccalluc committed Oct 1, 2024
1 parent 7c84659 commit d21fd26
Showing 1 changed file with 13 additions and 8 deletions.
21 changes: 13 additions & 8 deletions dp_creator_ii/tests/test_csv.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,19 +8,24 @@
@pytest.mark.parametrize("encoding", ["latin1", "utf8"])
def test_csv_loading(encoding):
with tempfile.NamedTemporaryFile(
delete=False, mode="w", newline="", encoding=encoding
delete_on_close=False, mode="w", newline="", encoding=encoding
) as fp:
# By default, would delete file on "close()";
# With "delete_on_close=False", clean up when exiting "with" instead.
old_lf = pl.DataFrame({"NAME": ["André"], "AGE": [42]}).lazy()

writer = csv.writer(
fp,
)
writer = csv.writer(fp)
writer.writerow(["NAME", "AGE"])
for row in old_lf.collect().rows():
writer.writerow(row)
fp.close()

new_lf = pl.scan_csv(fp.name)
polars.testing.assert_frame_equal(old_lf, new_lf)
new_lf = pl.scan_csv(fp.name, encoding="utf8-lossy")
if encoding == "utf8":
polars.testing.assert_frame_equal(old_lf, new_lf)
if encoding != "utf8":
polars.testing.assert_frame_not_equal(old_lf, new_lf)
assert new_lf.collect().rows()[0] == ("Andr�", 42)
# If the file even has non-utf8 characters,
# they are probably not the only thing that distinguishes
# two strings that we want to group on.
# Besides grouping, we don't do much with strings,
# so this feels safe.

0 comments on commit d21fd26

Please sign in to comment.