From 09e0f5631ae9dd0265bd0b449a52727c7b96f3e9 Mon Sep 17 00:00:00 2001
From: Artjoms Iskovs
Date: Wed, 7 Apr 2021 16:04:03 +0100
Subject: [PATCH] Make sure we can handle universal newlines at inference time
 and add a test for Mac-style newlines.

---
 splitgraph/ingestion/csv/common.py            | 10 ++++++-
 test/resources/ingestion/csv/mac_newlines.csv |  1 +
 test/splitgraph/ingestion/test_csv.py         | 29 +++++++++++++++++++
 3 files changed, 39 insertions(+), 1 deletion(-)
 create mode 100644 test/resources/ingestion/csv/mac_newlines.csv

diff --git a/splitgraph/ingestion/csv/common.py b/splitgraph/ingestion/csv/common.py
index e82f341a..ae871f65 100644
--- a/splitgraph/ingestion/csv/common.py
+++ b/splitgraph/ingestion/csv/common.py
@@ -54,9 +54,17 @@ def autodetect_csv(stream: io.RawIOBase, csv_options: CSVOptions) -> CSVOptions:
     assert data
 
     if csv_options.autodetect_encoding:
-        csv_options = csv_options._replace(encoding=chardet.detect(data)["encoding"])
+        encoding = chardet.detect(data)["encoding"]
+        if encoding == "ascii":
+            # ASCII is a subset of UTF-8. For safety, if chardet detected
+            # the encoding as ASCII, use UTF-8 (a valid ASCII file is a valid UTF-8 file,
+            # but not vice versa)
+            encoding = "utf-8"
+        csv_options = csv_options._replace(encoding=encoding)
 
     sample = data.decode(csv_options.encoding)
+    # Emulate universal newlines mode (convert \r, \r\n, \n into \n)
+    sample = "\n".join(sample.splitlines())
 
     if csv_options.autodetect_dialect:
         dialect = csv.Sniffer().sniff(sample)
diff --git a/test/resources/ingestion/csv/mac_newlines.csv b/test/resources/ingestion/csv/mac_newlines.csv
new file mode 100644
index 00000000..784a1d3f
--- /dev/null
+++ b/test/resources/ingestion/csv/mac_newlines.csv
@@ -0,0 +1 @@
+fruit_id,timestamp,name 1,2018-01-01 00:11:11,apple 2,2018-01-02 00:22:22,orange 3,2018-01-03 00:33:33,mayonnaise 4,2018-01-04 00:44:44,mustard
\ No newline at end of file
diff --git a/test/splitgraph/ingestion/test_csv.py b/test/splitgraph/ingestion/test_csv.py
index 2420900a..dfb43d0d 100644
--- a/test/splitgraph/ingestion/test_csv.py
+++ b/test/splitgraph/ingestion/test_csv.py
@@ -190,6 +190,9 @@ def test_csv_dialect_encoding_inference():
     # TODO: we keep these in the dialect struct rather than extract back out into the
     # CSVOptions. Might need to do the latter if we want to return the proposed FDW table
     # params to the user.
+
+    # Note this line terminator is always "\r\n" since CSV assumes we use the
+    # universal newlines mode.
     assert options.dialect.lineterminator == "\r\n"
     assert options.dialect.delimiter == ";"
 
@@ -212,3 +215,29 @@
             ordinal=3, name="TEXT", pg_type="character varying", is_pk=False, comment=None
         ),
     ]
+
+
+def test_csv_mac_newlines():
+    # Test a CSV file with old Mac-style newlines (\r)
+
+    with open(os.path.join(INGESTION_RESOURCES_CSV, "mac_newlines.csv"), "rb") as f:
+        options = CSVOptions()
+        options, reader = make_csv_reader(f, options)
+
+        assert options.encoding == "utf-8"
+        assert options.header is True
+
+        data = list(reader)
+        assert len(data) == 5
+        assert data[0] == ["fruit_id", "timestamp", "name"]
+
+        schema = generate_column_names(infer_sg_schema(data))
+        assert schema == [
+            TableColumn(ordinal=1, name="fruit_id", pg_type="integer", is_pk=False, comment=None),
+            TableColumn(
+                ordinal=2, name="timestamp", pg_type="timestamp", is_pk=False, comment=None
+            ),
+            TableColumn(
+                ordinal=3, name="name", pg_type="character varying", is_pk=False, comment=None
+            ),
+        ]
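
Illustration (not part of the commit above): a minimal, standalone sketch of the two behaviours the common.py hunk relies on. The normalize_sample helper and the sample bytes are made up for this example; the real code path lives in autodetect_csv. str.splitlines() treats \r, \r\n and \n alike as line boundaries, so re-joining its output with "\n" emulates universal newlines, and widening a chardet "ascii" result to "utf-8" is safe because every valid ASCII file is also a valid UTF-8 file.

    import chardet

    def normalize_sample(data: bytes) -> str:
        # Detect the encoding of the raw sample, widening "ascii" to "utf-8"
        # (ASCII is a strict subset of UTF-8, so this can never break decoding).
        encoding = chardet.detect(data)["encoding"]
        if encoding == "ascii":
            encoding = "utf-8"
        sample = data.decode(encoding)
        # splitlines() recognises \r, \r\n and \n, so re-joining with "\n"
        # folds all three newline styles into plain \n.
        return "\n".join(sample.splitlines())

    # Mac-style (\r-separated) input comes out \n-separated:
    print(normalize_sample(b"fruit_id,name\r1,apple\r2,orange"))
    # fruit_id,name
    # 1,apple
    # 2,orange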