Make sure we can handle universal newlines at inference time and add a test for Mac-style newlines.
splitgraph · Apr 7, 2021 · 09e0f56 · 09e0f56
1 parent d3e4bc0
commit 09e0f56
Show file tree

Hide file tree

Showing 3 changed files with 39 additions and 1 deletion.
diff --git a/splitgraph/ingestion/csv/common.py b/splitgraph/ingestion/csv/common.py
@@ -54,9 +54,17 @@ def autodetect_csv(stream: io.RawIOBase, csv_options: CSVOptions) -> CSVOptions:
     assert data
 
     if csv_options.autodetect_encoding:
-        csv_options = csv_options._replace(encoding=chardet.detect(data)["encoding"])
+        encoding = chardet.detect(data)["encoding"]
+        if encoding == "ascii":
+            # ASCII is a subset of UTF-8. For safety, if chardet detected
+            # the encoding as ASCII, use UTF-8 (a valid ASCII file is a valid UTF-8 file,
+            # but not vice versa)
+            encoding = "utf-8"
+        csv_options = csv_options._replace(encoding=encoding)
 
     sample = data.decode(csv_options.encoding)
+    # Emulate universal newlines mode (convert \r, \r\n, \n into \n)
+    sample = "\n".join(sample.splitlines())
 
     if csv_options.autodetect_dialect:
         dialect = csv.Sniffer().sniff(sample)

diff --git a/test/resources/ingestion/csv/mac_newlines.csv b/test/resources/ingestion/csv/mac_newlines.csv
@@ -0,0 +1 @@
+fruit_id,timestamp,name1,2018-01-01 00:11:11,apple2,2018-01-02 00:22:22,orange3,2018-01-03 00:33:33,mayonnaise4,2018-01-04 00:44:44,mustard

diff --git a/test/splitgraph/ingestion/test_csv.py b/test/splitgraph/ingestion/test_csv.py
@@ -190,6 +190,9 @@ def test_csv_dialect_encoding_inference():
         # TODO: we keep these in the dialect struct rather than extract back out into the
         #  CSVOptions. Might need to do the latter if we want to return the proposed FDW table
         #  params to the user.
+
+        # Note that the sniffed line terminator is always "\r\n", whatever the file itself
+        # uses, since Python's csv module assumes we use the universal newlines mode.
         assert options.dialect.lineterminator == "\r\n"
         assert options.dialect.delimiter == ";"
 
@@ -212,3 +215,29 @@ def test_csv_dialect_encoding_inference():
                 ordinal=3, name="TEXT", pg_type="character varying", is_pk=False, comment=None
             ),
         ]
+
+
+def test_csv_mac_newlines():
+    # Test a CSV file with old Mac-style newlines (\r)
+
+    with open(os.path.join(INGESTION_RESOURCES_CSV, "mac_newlines.csv"), "rb") as f:
+        options = CSVOptions()
+        options, reader = make_csv_reader(f, options)
+
+        assert options.encoding == "utf-8"
+        assert options.header is True
+
+        data = list(reader)
+        assert len(data) == 5
+        assert data[0] == ["fruit_id", "timestamp", "name"]
+
+        schema = generate_column_names(infer_sg_schema(data))
+        assert schema == [
+            TableColumn(ordinal=1, name="fruit_id", pg_type="integer", is_pk=False, comment=None),
+            TableColumn(
+                ordinal=2, name="timestamp", pg_type="timestamp", is_pk=False, comment=None
+            ),
+            TableColumn(
+                ordinal=3, name="name", pg_type="character varying", is_pk=False, comment=None
+            ),
+        ]
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1 @@
		fruit_id,timestamp,name1,2018-01-01 00:11:11,apple2,2018-01-02 00:22:22,orange3,2018-01-03 00:33:33,mayonnaise4,2018-01-04 00:44:44,mustard
Expand Down