From 670d56f3207d2f8a3750d939e3fd6a7c5f9b3108 Mon Sep 17 00:00:00 2001
From: Devin Petersohn <devin.petersohn@gmail.com>
Date: Sun, 26 Apr 2020 11:44:03 -0700
Subject: [PATCH 1/2] Fix cases where `read_json` should fall back to pandas

* Resolves #1379
* Adds cases for when we are passed a JSON string directly or some IO
  object. In these cases we default to pandas.
* Adds tests to verify behavior

Signed-off-by: Devin Petersohn <devin.petersohn@gmail.com>
---
 modin/engines/base/io/text/csv_reader.py      | 20 +---
 modin/engines/base/io/text/json_reader.py     |  7 +-
 .../engines/base/io/text/text_file_reader.py  | 18 ++++
 modin/pandas/test/test_io.py                  | 99 +++++++++++++++++++
 4 files changed, 124 insertions(+), 20 deletions(-)

diff --git a/modin/engines/base/io/text/csv_reader.py b/modin/engines/base/io/text/csv_reader.py
index ceb9d2883fc..5a4c1df4ca8 100644
--- a/modin/engines/base/io/text/csv_reader.py
+++ b/modin/engines/base/io/text/csv_reader.py
@@ -18,24 +18,6 @@
 import sys
 
 
-def pathlib_or_pypath(filepath_or_buffer):
-    try:
-        import py
-
-        if isinstance(filepath_or_buffer, py.path.local):
-            return True
-    except ImportError:  # pragma: no cover
-        pass
-    try:
-        import pathlib
-
-        if isinstance(filepath_or_buffer, pathlib.Path):
-            return True
-    except ImportError:  # pragma: no cover
-        pass
-    return False
-
-
 class CSVReader(TextFileReader):
     @classmethod
     def read(cls, filepath_or_buffer, **kwargs):
@@ -43,7 +25,7 @@ def read(cls, filepath_or_buffer, **kwargs):
             if not cls.file_exists(filepath_or_buffer):
                 return cls.single_worker_read(filepath_or_buffer, **kwargs)
             filepath_or_buffer = cls.get_path(filepath_or_buffer)
-        elif not pathlib_or_pypath(filepath_or_buffer):
+        elif not cls.pathlib_or_pypath(filepath_or_buffer):
             return cls.single_worker_read(filepath_or_buffer, **kwargs)
         compression_type = cls.infer_compression(
             filepath_or_buffer, kwargs.get("compression")
diff --git a/modin/engines/base/io/text/json_reader.py b/modin/engines/base/io/text/json_reader.py
index 226ac801a32..da3087bebd9 100644
--- a/modin/engines/base/io/text/json_reader.py
+++ b/modin/engines/base/io/text/json_reader.py
@@ -21,7 +21,12 @@
 class JSONReader(TextFileReader):
     @classmethod
     def read(cls, path_or_buf, **kwargs):
-        path_or_buf = cls.get_path(path_or_buf)
+        if isinstance(path_or_buf, str):
+            if not cls.file_exists(path_or_buf):
+                return cls.single_worker_read(path_or_buf, **kwargs)
+            path_or_buf = cls.get_path(path_or_buf)
+        elif not cls.pathlib_or_pypath(path_or_buf):
+            return cls.single_worker_read(path_or_buf, **kwargs)
         if not kwargs.get("lines", False):
             return cls.single_worker_read(path_or_buf, **kwargs)
         columns = pandas.read_json(
diff --git a/modin/engines/base/io/text/text_file_reader.py b/modin/engines/base/io/text/text_file_reader.py
index bd86388dcde..e971a40954c 100644
--- a/modin/engines/base/io/text/text_file_reader.py
+++ b/modin/engines/base/io/text/text_file_reader.py
@@ -52,3 +52,21 @@ def build_partition(cls, partition_ids, row_lengths, column_widths):
                 for i in range(len(partition_ids))
             ]
         )
+
+    @classmethod
+    def pathlib_or_pypath(cls, filepath_or_buffer):
+        try:
+            import py
+
+            if isinstance(filepath_or_buffer, py.path.local):
+                return True
+        except ImportError:  # pragma: no cover
+            pass
+        try:
+            import pathlib
+
+            if isinstance(filepath_or_buffer, pathlib.Path):
+                return True
+        except ImportError:  # pragma: no cover
+            pass
+        return False
diff --git a/modin/pandas/test/test_io.py b/modin/pandas/test/test_io.py
index c1c21944216..73488aa8474 100644
--- a/modin/pandas/test/test_io.py
+++ b/modin/pandas/test/test_io.py
@@ -22,6 +22,7 @@
 import os
 import shutil
 import sqlalchemy as sa
+from io import BytesIO
 
 from .utils import df_equals
 
@@ -456,6 +457,104 @@ def test_from_json_lines():
     teardown_json_file()
 
 
+def test_from_json_string():
+    json_string = """[{"project": "modin"}]"""
+    with pytest.warns(UserWarning):
+        modin_df = pd.read_json(json_string)
+    df_equals(modin_df, pandas.read_json(json_string))
+
+    json_string = """{
+        "quiz": {
+            "sport": {
+                "q1": {
+                    "question": "Which one is correct team name in NBA?",
+                    "options": [
+                        "New York Bulls",
+                        "Los Angeles Kings",
+                        "Golden State Warriros",
+                        "Huston Rocket"
+                    ],
+                    "answer": "Huston Rocket"
+                }
+            },
+            "maths": {
+                "q1": {
+                    "question": "5 + 7 = ?",
+                    "options": [
+                        "10",
+                        "11",
+                        "12",
+                        "13"
+                    ],
+                    "answer": "12"
+                },
+                "q2": {
+                    "question": "12 - 8 = ?",
+                    "options": [
+                        "1",
+                        "2",
+                        "3",
+                        "4"
+                    ],
+                    "answer": "4"
+                }
+            }
+        }
+    }"""
+    with pytest.warns(UserWarning):
+        modin_df = pd.read_json(json_string)
+    df_equals(modin_df, pandas.read_json(json_string))
+
+
+def test_from_json_bytesio():
+    json_bytes = b"""[{"project": "modin"}]"""
+    with pytest.warns(UserWarning):
+        modin_df = pd.read_json(BytesIO(json_bytes))
+    df_equals(modin_df, pandas.read_json(BytesIO(json_bytes)))
+
+    json_bytes = b"""{
+            "quiz": {
+                "sport": {
+                    "q1": {
+                        "question": "Which one is correct team name in NBA?",
+                        "options": [
+                            "New York Bulls",
+                            "Los Angeles Kings",
+                            "Golden State Warriros",
+                            "Huston Rocket"
+                        ],
+                        "answer": "Huston Rocket"
+                    }
+                },
+                "maths": {
+                    "q1": {
+                        "question": "5 + 7 = ?",
+                        "options": [
+                            "10",
+                            "11",
+                            "12",
+                            "13"
+                        ],
+                        "answer": "12"
+                    },
+                    "q2": {
+                        "question": "12 - 8 = ?",
+                        "options": [
+                            "1",
+                            "2",
+                            "3",
+                            "4"
+                        ],
+                        "answer": "4"
+                    }
+                }
+            }
+        }"""
+    with pytest.warns(UserWarning):
+        modin_df = pd.read_json(BytesIO(json_bytes))
+    df_equals(modin_df, pandas.read_json(BytesIO(json_bytes)))
+
+
 def test_from_html():
     setup_html_file(SMALL_ROW_SIZE)
 

From f0dc4213b3be544b10169da2d9b39174456792da Mon Sep 17 00:00:00 2001
From: Devin Petersohn <devin-petersohn@users.noreply.github.com>
Date: Mon, 27 Apr 2020 08:51:01 -0700
Subject: [PATCH 2/2] Parametrize test

Co-Authored-By: anmyachev <45976948+anmyachev@users.noreply.github.com>

Add string and byte inputs to utils

Signed-off-by: Devin Petersohn <devin.petersohn@gmail.com>

Update modin/pandas/test/utils.py

Co-Authored-By: anmyachev <45976948+anmyachev@users.noreply.github.com>

Lint

Signed-off-by: Devin Petersohn <devin.petersohn@gmail.com>

Rewind file handle before reusing object

Signed-off-by: Devin Petersohn <devin.petersohn@gmail.com>
---
 modin/pandas/test/test_io.py | 113 +++++------------------------------
 modin/pandas/test/utils.py   |  43 +++++++++++++
 2 files changed, 59 insertions(+), 97 deletions(-)

diff --git a/modin/pandas/test/test_io.py b/modin/pandas/test/test_io.py
index 73488aa8474..15add96f344 100644
--- a/modin/pandas/test/test_io.py
+++ b/modin/pandas/test/test_io.py
@@ -22,9 +22,14 @@
 import os
 import shutil
 import sqlalchemy as sa
-from io import BytesIO
 
-from .utils import df_equals
+from .utils import (
+    df_equals,
+    json_short_string,
+    json_short_bytes,
+    json_long_string,
+    json_long_bytes,
+)
 
 from modin import __execution_engine__
 
@@ -457,102 +462,16 @@ def test_from_json_lines():
     teardown_json_file()
 
 
-def test_from_json_string():
-    json_string = """[{"project": "modin"}]"""
-    with pytest.warns(UserWarning):
-        modin_df = pd.read_json(json_string)
-    df_equals(modin_df, pandas.read_json(json_string))
-
-    json_string = """{
-        "quiz": {
-            "sport": {
-                "q1": {
-                    "question": "Which one is correct team name in NBA?",
-                    "options": [
-                        "New York Bulls",
-                        "Los Angeles Kings",
-                        "Golden State Warriros",
-                        "Huston Rocket"
-                    ],
-                    "answer": "Huston Rocket"
-                }
-            },
-            "maths": {
-                "q1": {
-                    "question": "5 + 7 = ?",
-                    "options": [
-                        "10",
-                        "11",
-                        "12",
-                        "13"
-                    ],
-                    "answer": "12"
-                },
-                "q2": {
-                    "question": "12 - 8 = ?",
-                    "options": [
-                        "1",
-                        "2",
-                        "3",
-                        "4"
-                    ],
-                    "answer": "4"
-                }
-            }
-        }
-    }"""
-    with pytest.warns(UserWarning):
-        modin_df = pd.read_json(json_string)
-    df_equals(modin_df, pandas.read_json(json_string))
-
-
-def test_from_json_bytesio():
-    json_bytes = b"""[{"project": "modin"}]"""
-    with pytest.warns(UserWarning):
-        modin_df = pd.read_json(BytesIO(json_bytes))
-    df_equals(modin_df, pandas.read_json(BytesIO(json_bytes)))
-
-    json_bytes = b"""{
-            "quiz": {
-                "sport": {
-                    "q1": {
-                        "question": "Which one is correct team name in NBA?",
-                        "options": [
-                            "New York Bulls",
-                            "Los Angeles Kings",
-                            "Golden State Warriros",
-                            "Huston Rocket"
-                        ],
-                        "answer": "Huston Rocket"
-                    }
-                },
-                "maths": {
-                    "q1": {
-                        "question": "5 + 7 = ?",
-                        "options": [
-                            "10",
-                            "11",
-                            "12",
-                            "13"
-                        ],
-                        "answer": "12"
-                    },
-                    "q2": {
-                        "question": "12 - 8 = ?",
-                        "options": [
-                            "1",
-                            "2",
-                            "3",
-                            "4"
-                        ],
-                        "answer": "4"
-                    }
-                }
-            }
-        }"""
+@pytest.mark.parametrize(
+    "data", [json_short_string, json_short_bytes, json_long_string, json_long_bytes],
+)
+def test_read_json_string_bytes(data):
     with pytest.warns(UserWarning):
-        modin_df = pd.read_json(BytesIO(json_bytes))
-    df_equals(modin_df, pandas.read_json(BytesIO(json_bytes)))
+        modin_df = pd.read_json(data)
+    # I/O objects were consumed by the read above; rewind before reusing them.
+    if hasattr(data, "seek"):
+        data.seek(0)
+    df_equals(modin_df, pandas.read_json(data))
 
 
 def test_from_html():
diff --git a/modin/pandas/test/utils.py b/modin/pandas/test/utils.py
index 1c517a6fb18..1faaf386ac3 100644
--- a/modin/pandas/test/utils.py
+++ b/modin/pandas/test/utils.py
@@ -16,6 +16,7 @@
 from pandas.util.testing import assert_almost_equal, assert_frame_equal
 import modin.pandas as pd
 from modin.pandas.utils import to_pandas
+from io import BytesIO
 
 random_state = np.random.RandomState(seed=42)
 
@@ -300,6 +301,48 @@
 
 # END parametrizations of common kwargs
 
+json_short_string = """[{"project": "modin"}]"""
+json_long_string = """{
+        "quiz": {
+            "sport": {
+                "q1": {
+                    "question": "Which one is correct team name in NBA?",
+                    "options": [
+                        "New York Bulls",
+                        "Los Angeles Kings",
+                        "Golden State Warriros",
+                        "Huston Rocket"
+                    ],
+                    "answer": "Huston Rocket"
+                }
+            },
+            "maths": {
+                "q1": {
+                    "question": "5 + 7 = ?",
+                    "options": [
+                        "10",
+                        "11",
+                        "12",
+                        "13"
+                    ],
+                    "answer": "12"
+                },
+                "q2": {
+                    "question": "12 - 8 = ?",
+                    "options": [
+                        "1",
+                        "2",
+                        "3",
+                        "4"
+                    ],
+                    "answer": "4"
+                }
+            }
+        }
+    }"""
+json_long_bytes = BytesIO(json_long_string.encode(encoding="UTF-8"))
+json_short_bytes = BytesIO(json_short_string.encode(encoding="UTF-8"))
+
 
 def df_equals(df1, df2):
     """Tests if df1 and df2 are equal.