From 670d56f3207d2f8a3750d939e3fd6a7c5f9b3108 Mon Sep 17 00:00:00 2001 From: Devin Petersohn Date: Sun, 26 Apr 2020 11:44:03 -0700 Subject: [PATCH 1/2] Fix cases where `read_json` should fall back to pandas * Resolves #1379 * Adds cases for when we are passed JSON strings directly or if we are passed some IO object. For these cases we default to pandas. * Add tests to verify behavior Signed-off-by: Devin Petersohn --- modin/engines/base/io/text/csv_reader.py | 20 +--- modin/engines/base/io/text/json_reader.py | 7 +- .../engines/base/io/text/text_file_reader.py | 18 ++++ modin/pandas/test/test_io.py | 99 +++++++++++++++++++ 4 files changed, 124 insertions(+), 20 deletions(-) diff --git a/modin/engines/base/io/text/csv_reader.py b/modin/engines/base/io/text/csv_reader.py index ceb9d2883fc..5a4c1df4ca8 100644 --- a/modin/engines/base/io/text/csv_reader.py +++ b/modin/engines/base/io/text/csv_reader.py @@ -18,24 +18,6 @@ import sys -def pathlib_or_pypath(filepath_or_buffer): - try: - import py - - if isinstance(filepath_or_buffer, py.path.local): - return True - except ImportError: # pragma: no cover - pass - try: - import pathlib - - if isinstance(filepath_or_buffer, pathlib.Path): - return True - except ImportError: # pragma: no cover - pass - return False - - class CSVReader(TextFileReader): @classmethod def read(cls, filepath_or_buffer, **kwargs): @@ -43,7 +25,7 @@ def read(cls, filepath_or_buffer, **kwargs): if not cls.file_exists(filepath_or_buffer): return cls.single_worker_read(filepath_or_buffer, **kwargs) filepath_or_buffer = cls.get_path(filepath_or_buffer) - elif not pathlib_or_pypath(filepath_or_buffer): + elif not cls.pathlib_or_pypath(filepath_or_buffer): return cls.single_worker_read(filepath_or_buffer, **kwargs) compression_type = cls.infer_compression( filepath_or_buffer, kwargs.get("compression") diff --git a/modin/engines/base/io/text/json_reader.py b/modin/engines/base/io/text/json_reader.py index 226ac801a32..da3087bebd9 100644 --- 
a/modin/engines/base/io/text/json_reader.py +++ b/modin/engines/base/io/text/json_reader.py @@ -21,7 +21,12 @@ class JSONReader(TextFileReader): @classmethod def read(cls, path_or_buf, **kwargs): - path_or_buf = cls.get_path(path_or_buf) + if isinstance(path_or_buf, str): + if not cls.file_exists(path_or_buf): + return cls.single_worker_read(path_or_buf, **kwargs) + path_or_buf = cls.get_path(path_or_buf) + elif not cls.pathlib_or_pypath(path_or_buf): + return cls.single_worker_read(path_or_buf, **kwargs) if not kwargs.get("lines", False): return cls.single_worker_read(path_or_buf, **kwargs) columns = pandas.read_json( diff --git a/modin/engines/base/io/text/text_file_reader.py b/modin/engines/base/io/text/text_file_reader.py index bd86388dcde..e971a40954c 100644 --- a/modin/engines/base/io/text/text_file_reader.py +++ b/modin/engines/base/io/text/text_file_reader.py @@ -52,3 +52,21 @@ def build_partition(cls, partition_ids, row_lengths, column_widths): for i in range(len(partition_ids)) ] ) + + @classmethod + def pathlib_or_pypath(cls, filepath_or_buffer): + try: + import py + + if isinstance(filepath_or_buffer, py.path.local): + return True + except ImportError: # pragma: no cover + pass + try: + import pathlib + + if isinstance(filepath_or_buffer, pathlib.Path): + return True + except ImportError: # pragma: no cover + pass + return False diff --git a/modin/pandas/test/test_io.py b/modin/pandas/test/test_io.py index c1c21944216..73488aa8474 100644 --- a/modin/pandas/test/test_io.py +++ b/modin/pandas/test/test_io.py @@ -22,6 +22,7 @@ import os import shutil import sqlalchemy as sa +from io import BytesIO from .utils import df_equals @@ -456,6 +457,104 @@ def test_from_json_lines(): teardown_json_file() +def test_from_json_string(): + json_string = """[{"project": "modin"}]""" + with pytest.warns(UserWarning): + modin_df = pd.read_json(json_string) + df_equals(modin_df, pandas.read_json(json_string)) + + json_string = """{ + "quiz": { + "sport": { + "q1": { + 
"question": "Which one is correct team name in NBA?", + "options": [ + "New York Bulls", + "Los Angeles Kings", + "Golden State Warriros", + "Huston Rocket" + ], + "answer": "Huston Rocket" + } + }, + "maths": { + "q1": { + "question": "5 + 7 = ?", + "options": [ + "10", + "11", + "12", + "13" + ], + "answer": "12" + }, + "q2": { + "question": "12 - 8 = ?", + "options": [ + "1", + "2", + "3", + "4" + ], + "answer": "4" + } + } + } + }""" + with pytest.warns(UserWarning): + modin_df = pd.read_json(json_string) + df_equals(modin_df, pandas.read_json(json_string)) + + +def test_from_json_bytesio(): + json_bytes = b"""[{"project": "modin"}]""" + with pytest.warns(UserWarning): + modin_df = pd.read_json(BytesIO(json_bytes)) + df_equals(modin_df, pandas.read_json(BytesIO(json_bytes))) + + json_bytes = b"""{ + "quiz": { + "sport": { + "q1": { + "question": "Which one is correct team name in NBA?", + "options": [ + "New York Bulls", + "Los Angeles Kings", + "Golden State Warriros", + "Huston Rocket" + ], + "answer": "Huston Rocket" + } + }, + "maths": { + "q1": { + "question": "5 + 7 = ?", + "options": [ + "10", + "11", + "12", + "13" + ], + "answer": "12" + }, + "q2": { + "question": "12 - 8 = ?", + "options": [ + "1", + "2", + "3", + "4" + ], + "answer": "4" + } + } + } + }""" + with pytest.warns(UserWarning): + modin_df = pd.read_json(BytesIO(json_bytes)) + df_equals(modin_df, pandas.read_json(BytesIO(json_bytes))) + + def test_from_html(): setup_html_file(SMALL_ROW_SIZE) From f0dc4213b3be544b10169da2d9b39174456792da Mon Sep 17 00:00:00 2001 From: Devin Petersohn Date: Mon, 27 Apr 2020 08:51:01 -0700 Subject: [PATCH 2/2] Parametrize test Co-Authored-By: anmyachev <45976948+anmyachev@users.noreply.github.com> Add string and byte inputs to utils Signed-off-by: Devin Petersohn Update modin/pandas/test/utils.py Co-Authored-By: anmyachev <45976948+anmyachev@users.noreply.github.com> Lint Signed-off-by: Devin Petersohn Rewind file handle before reusing object Signed-off-by: 
Devin Petersohn --- modin/pandas/test/test_io.py | 113 +++++------------------------------ modin/pandas/test/utils.py | 43 +++++++++++++ 2 files changed, 59 insertions(+), 97 deletions(-) diff --git a/modin/pandas/test/test_io.py b/modin/pandas/test/test_io.py index 73488aa8474..15add96f344 100644 --- a/modin/pandas/test/test_io.py +++ b/modin/pandas/test/test_io.py @@ -22,9 +22,14 @@ import os import shutil import sqlalchemy as sa -from io import BytesIO -from .utils import df_equals +from .utils import ( + df_equals, + json_short_string, + json_short_bytes, + json_long_string, + json_long_bytes, +) from modin import __execution_engine__ @@ -457,102 +462,16 @@ def test_from_json_lines(): teardown_json_file() -def test_from_json_string(): - json_string = """[{"project": "modin"}]""" - with pytest.warns(UserWarning): - modin_df = pd.read_json(json_string) - df_equals(modin_df, pandas.read_json(json_string)) - - json_string = """{ - "quiz": { - "sport": { - "q1": { - "question": "Which one is correct team name in NBA?", - "options": [ - "New York Bulls", - "Los Angeles Kings", - "Golden State Warriros", - "Huston Rocket" - ], - "answer": "Huston Rocket" - } - }, - "maths": { - "q1": { - "question": "5 + 7 = ?", - "options": [ - "10", - "11", - "12", - "13" - ], - "answer": "12" - }, - "q2": { - "question": "12 - 8 = ?", - "options": [ - "1", - "2", - "3", - "4" - ], - "answer": "4" - } - } - } - }""" - with pytest.warns(UserWarning): - modin_df = pd.read_json(json_string) - df_equals(modin_df, pandas.read_json(json_string)) - - -def test_from_json_bytesio(): - json_bytes = b"""[{"project": "modin"}]""" - with pytest.warns(UserWarning): - modin_df = pd.read_json(BytesIO(json_bytes)) - df_equals(modin_df, pandas.read_json(BytesIO(json_bytes))) - - json_bytes = b"""{ - "quiz": { - "sport": { - "q1": { - "question": "Which one is correct team name in NBA?", - "options": [ - "New York Bulls", - "Los Angeles Kings", - "Golden State Warriros", - "Huston Rocket" - ], - 
"answer": "Huston Rocket" - } - }, - "maths": { - "q1": { - "question": "5 + 7 = ?", - "options": [ - "10", - "11", - "12", - "13" - ], - "answer": "12" - }, - "q2": { - "question": "12 - 8 = ?", - "options": [ - "1", - "2", - "3", - "4" - ], - "answer": "4" - } - } - } - }""" +@pytest.mark.parametrize( + "data", [json_short_string, json_short_bytes, json_long_string, json_long_bytes], +) +def test_read_json_string_bytes(data): with pytest.warns(UserWarning): - modin_df = pd.read_json(BytesIO(json_bytes)) - df_equals(modin_df, pandas.read_json(BytesIO(json_bytes))) + modin_df = pd.read_json(data) + # For I/O objects we need to rewind to reuse the same object. + if hasattr(data, "seek"): + data.seek(0) + df_equals(modin_df, pandas.read_json(data)) def test_from_html(): diff --git a/modin/pandas/test/utils.py b/modin/pandas/test/utils.py index 1c517a6fb18..1faaf386ac3 100644 --- a/modin/pandas/test/utils.py +++ b/modin/pandas/test/utils.py @@ -16,6 +16,7 @@ from pandas.util.testing import assert_almost_equal, assert_frame_equal import modin.pandas as pd from modin.pandas.utils import to_pandas +from io import BytesIO random_state = np.random.RandomState(seed=42) @@ -300,6 +301,48 @@ # END parametrizations of common kwargs +json_short_string = """[{"project": "modin"}]""" +json_long_string = """{ + "quiz": { + "sport": { + "q1": { + "question": "Which one is correct team name in NBA?", + "options": [ + "New York Bulls", + "Los Angeles Kings", + "Golden State Warriros", + "Huston Rocket" + ], + "answer": "Huston Rocket" + } + }, + "maths": { + "q1": { + "question": "5 + 7 = ?", + "options": [ + "10", + "11", + "12", + "13" + ], + "answer": "12" + }, + "q2": { + "question": "12 - 8 = ?", + "options": [ + "1", + "2", + "3", + "4" + ], + "answer": "4" + } + } + } + }""" +json_long_bytes = BytesIO(json_long_string.encode(encoding="UTF-8")) +json_short_bytes = BytesIO(json_short_string.encode(encoding="UTF-8")) + def df_equals(df1, df2): """Tests if df1 and df2 are equal.