From 09e0f5631ae9dd0265bd0b449a52727c7b96f3e9 Mon Sep 17 00:00:00 2001
From: Artjoms Iskovs
Date: Wed, 7 Apr 2021 16:04:03 +0100
Subject: [PATCH] Make sure we can handle universal newlines at inference time
 and add a test for Mac-style newlines.

---
 splitgraph/ingestion/csv/common.py            | 10 ++++++-
 test/resources/ingestion/csv/mac_newlines.csv |  1 +
 test/splitgraph/ingestion/test_csv.py         | 29 +++++++++++++++++++
 3 files changed, 39 insertions(+), 1 deletion(-)
 create mode 100644 test/resources/ingestion/csv/mac_newlines.csv

diff --git a/splitgraph/ingestion/csv/common.py b/splitgraph/ingestion/csv/common.py
index e82f341a..ae871f65 100644
--- a/splitgraph/ingestion/csv/common.py
+++ b/splitgraph/ingestion/csv/common.py
@@ -54,9 +54,17 @@ def autodetect_csv(stream: io.RawIOBase, csv_options: CSVOptions) -> CSVOptions:
     assert data
 
     if csv_options.autodetect_encoding:
-        csv_options = csv_options._replace(encoding=chardet.detect(data)["encoding"])
+        encoding = chardet.detect(data)["encoding"]
+        if encoding == "ascii":
+            # ASCII is a subset of UTF-8. For safety, if chardet detected
+            # the encoding as ASCII, use UTF-8 (a valid ASCII file is a valid UTF-8 file,
+            # but not vice versa)
+            encoding = "utf-8"
+        csv_options = csv_options._replace(encoding=encoding)
 
     sample = data.decode(csv_options.encoding)
+    # Emulate universal newlines mode (convert \r, \r\n, \n into \n)
+    sample = "\n".join(sample.splitlines())
 
     if csv_options.autodetect_dialect:
         dialect = csv.Sniffer().sniff(sample)
diff --git a/test/resources/ingestion/csv/mac_newlines.csv b/test/resources/ingestion/csv/mac_newlines.csv
new file mode 100644
index 00000000..784a1d3f
--- /dev/null
+++ b/test/resources/ingestion/csv/mac_newlines.csv
@@ -0,0 +1 @@
+fruit_id,timestamp,name 1,2018-01-01 00:11:11,apple 2,2018-01-02 00:22:22,orange 3,2018-01-03 00:33:33,mayonnaise 4,2018-01-04 00:44:44,mustard
\ No newline at end of file
diff --git a/test/splitgraph/ingestion/test_csv.py b/test/splitgraph/ingestion/test_csv.py
index 2420900a..dfb43d0d 100644
--- a/test/splitgraph/ingestion/test_csv.py
+++ b/test/splitgraph/ingestion/test_csv.py
@@ -190,6 +190,9 @@ def test_csv_dialect_encoding_inference():
     # TODO: we keep these in the dialect struct rather than extract back out into the
     # CSVOptions. Might need to do the latter if we want to return the proposed FDW table
     # params to the user.
+
+    # Note this line terminator is always "\r\n" since CSV assumes we use the
+    # universal newlines mode.
     assert options.dialect.lineterminator == "\r\n"
     assert options.dialect.delimiter == ";"
 
@@ -212,3 +215,29 @@
             ordinal=3, name="TEXT", pg_type="character varying", is_pk=False, comment=None
         ),
     ]
+
+
+def test_csv_mac_newlines():
+    # Test a CSV file with old Mac-style newlines (\r)
+
+    with open(os.path.join(INGESTION_RESOURCES_CSV, "mac_newlines.csv"), "rb") as f:
+        options = CSVOptions()
+        options, reader = make_csv_reader(f, options)
+
+        assert options.encoding == "utf-8"
+        assert options.header is True
+
+        data = list(reader)
+        assert len(data) == 5
+        assert data[0] == ["fruit_id", "timestamp", "name"]
+
+        schema = generate_column_names(infer_sg_schema(data))
+        assert schema == [
+            TableColumn(ordinal=1, name="fruit_id", pg_type="integer", is_pk=False, comment=None),
+            TableColumn(
+                ordinal=2, name="timestamp", pg_type="timestamp", is_pk=False, comment=None
+            ),
+            TableColumn(
+                ordinal=3, name="name", pg_type="character varying", is_pk=False, comment=None
+            ),
+        ]
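
Illustration (not part of the commit above): a minimal, standalone sketch of the two behaviours the common.py hunk relies on. The normalize_sample helper and the sample bytes are made up for this example; the real code path lives in autodetect_csv. str.splitlines() treats \r, \r\n and \n alike as line boundaries, so re-joining its output with "\n" emulates universal newlines, and widening a chardet "ascii" result to "utf-8" is safe because every valid ASCII file is also a valid UTF-8 file.

    import chardet

    def normalize_sample(data: bytes) -> str:
        # Detect the encoding of the raw sample, widening "ascii" to "utf-8"
        # (ASCII is a strict subset of UTF-8, so this can never break decoding).
        encoding = chardet.detect(data)["encoding"]
        if encoding == "ascii":
            encoding = "utf-8"
        sample = data.decode(encoding)
        # splitlines() recognises \r, \r\n and \n, so re-joining with "\n"
        # folds all three newline styles into plain \n.
        return "\n".join(sample.splitlines())

    # Mac-style (\r-separated) input comes out \n-separated:
    print(normalize_sample(b"fruit_id,name\r1,apple\r2,orange"))
    # fruit_id,name
    # 1,apple
    # 2,orange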