Skip to content

Commit

Permalink
Make sure we can handle universal newlines at inference time and add …
Browse files Browse the repository at this point in the history
…a test for Mac-style newlines.
  • Loading branch information
mildbyte committed Apr 7, 2021
1 parent d3e4bc0 commit 09e0f56
Show file tree
Hide file tree
Showing 3 changed files with 39 additions and 1 deletion.
10 changes: 9 additions & 1 deletion splitgraph/ingestion/csv/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,9 +54,17 @@ def autodetect_csv(stream: io.RawIOBase, csv_options: CSVOptions) -> CSVOptions:
assert data

if csv_options.autodetect_encoding:
csv_options = csv_options._replace(encoding=chardet.detect(data)["encoding"])
encoding = chardet.detect(data)["encoding"]
if encoding == "ascii":
# ASCII is a subset of UTF-8. For safety, if chardet detected
# the encoding as ASCII, use UTF-8 (a valid ASCII file is a valid UTF-8 file,
# but not vice versa)
encoding = "utf-8"
csv_options = csv_options._replace(encoding=encoding)

sample = data.decode(csv_options.encoding)
# Emulate universal newlines mode (convert \r, \r\n, \n into \n)
sample = "\n".join(sample.splitlines())

if csv_options.autodetect_dialect:
dialect = csv.Sniffer().sniff(sample)
Expand Down
1 change: 1 addition & 0 deletions test/resources/ingestion/csv/mac_newlines.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
fruit_id,timestamp,name
1,2018-01-01 00:11:11,apple
2,2018-01-02 00:22:22,orange
3,2018-01-03 00:33:33,mayonnaise
4,2018-01-04 00:44:44,mustard
Expand Down
29 changes: 29 additions & 0 deletions test/splitgraph/ingestion/test_csv.py
Original file line number Diff line number Diff line change
Expand Up @@ -190,6 +190,9 @@ def test_csv_dialect_encoding_inference():
# TODO: we keep these in the dialect struct rather than extract back out into the
# CSVOptions. Might need to do the latter if we want to return the proposed FDW table
# params to the user.

# Note this line terminator is always "\r\n" since CSV assumes we use the
# universal newlines mode.
assert options.dialect.lineterminator == "\r\n"
assert options.dialect.delimiter == ";"

Expand All @@ -212,3 +215,29 @@ def test_csv_dialect_encoding_inference():
ordinal=3, name="TEXT", pg_type="character varying", is_pk=False, comment=None
),
]


def test_csv_mac_newlines():
    """Check that a CSV file using old Mac-style (CR-only) newlines is ingested.

    Opens the ``mac_newlines.csv`` fixture in binary mode, builds a CSV reader
    with default options and verifies the inferred encoding, the header flag,
    the parsed rows and the schema inferred from those rows.
    """
    fixture_path = os.path.join(INGESTION_RESOURCES_CSV, "mac_newlines.csv")

    # Expected columns as (name, PostgreSQL type) pairs, in ordinal order.
    expected_columns = [
        ("fruit_id", "integer"),
        ("timestamp", "timestamp"),
        ("name", "character varying"),
    ]

    with open(fixture_path, "rb") as csv_file:
        options, reader = make_csv_reader(csv_file, CSVOptions())

        assert options.encoding == "utf-8"
        assert options.header is True

        # Materialize the rows while the file is still open.
        rows = list(reader)
        # Header line plus four data records.
        assert len(rows) == 5
        assert rows[0] == [column_name for column_name, _ in expected_columns]

        inferred_schema = generate_column_names(infer_sg_schema(rows))
        expected_schema = [
            TableColumn(
                ordinal=position,
                name=column_name,
                pg_type=column_type,
                is_pk=False,
                comment=None,
            )
            for position, (column_name, column_type) in enumerate(expected_columns, start=1)
        ]
        assert inferred_schema == expected_schema

0 comments on commit 09e0f56

Please sign in to comment.