fix(python): Ensure read_excel and read_ods support reading from raw `bytes` for all engines (#20636)
pola-rs · Jan 9, 2025 · f3da50f · f3da50f
1 parent 09687e4
commit f3da50f
Show file tree

Hide file tree

Showing 2 changed files with 34 additions and 17 deletions.
diff --git a/py-polars/polars/io/spreadsheet/functions.py b/py-polars/polars/io/spreadsheet/functions.py
@@ -52,7 +52,9 @@ def _sources(source: FileSource) -> tuple[Any, bool]:
     read_multiple_workbooks = True
     sources: list[Any] = []
 
-    if not isinstance(source, Sequence) or isinstance(source, str):
+    if isinstance(source, memoryview):
+        source = source.tobytes()
+    if not isinstance(source, Sequence) or isinstance(source, (bytes, str)):
         read_multiple_workbooks = False
         source = [source]  # type: ignore[assignment]
 
@@ -74,7 +76,7 @@ def _standardize_duplicates(s: str) -> str:
     return re.sub(r"_duplicated_(\d+)", repl=r"\1", string=s)
 
 
-def _unpack_sheet_results(
+def _unpack_read_results(
     frames: list[pl.DataFrame] | list[dict[str, pl.DataFrame]],
     *,
     read_multiple_workbooks: bool,
@@ -394,7 +396,7 @@ def read_excel(
         )
         for src in sources
     ]
-    return _unpack_sheet_results(
+    return _unpack_read_results(
         frames=frames,
         read_multiple_workbooks=read_multiple_workbooks,
     )
@@ -606,7 +608,7 @@ def read_ods(
         )
         for src in sources
     ]
-    return _unpack_sheet_results(
+    return _unpack_read_results(
         frames=frames,
         read_multiple_workbooks=read_multiple_workbooks,
     )
@@ -807,12 +809,18 @@ def _initialise_spreadsheet_parser(
         }.items():
             engine_options.setdefault(option, value)
 
+        if isinstance(source, bytes):
+            source = BytesIO(source)
+
         parser = xlsx2csv.Xlsx2csv(source, **engine_options)
         sheets = parser.workbook.sheets
         return _read_spreadsheet_xlsx2csv, parser, sheets
 
     elif engine == "openpyxl":
         openpyxl = import_optional("openpyxl")
+        if isinstance(source, bytes):
+            source = BytesIO(source)
+
         parser = openpyxl.load_workbook(source, data_only=True, **engine_options)
         sheets = [{"index": i + 1, "name": ws.title} for i, ws in enumerate(parser)]
         return _read_spreadsheet_openpyxl, parser, sheets
@@ -830,7 +838,7 @@ def _initialise_spreadsheet_parser(
             raise ModuleUpgradeRequiredError(msg)
 
         if reading_bytesio:
-            source = source.getbuffer().tobytes()  # type: ignore[union-attr]
+            source = source.getvalue()  # type: ignore[union-attr]
         elif isinstance(source, (BufferedReader, TextIOWrapper)):
             if "b" not in source.mode:
                 msg = f"file {source.name!r} must be opened in binary mode"

diff --git a/py-polars/tests/unit/io/test_spreadsheet.py b/py-polars/tests/unit/io/test_spreadsheet.py
@@ -823,6 +823,7 @@ def test_excel_write_column_and_row_totals(engine: ExcelSpreadsheetEngine) -> No
         # note that the totals are written as formulae, so we
         # won't have the calculated values in the dataframe
         xldf = pl.read_excel(xls, sheet_name="misc", engine=engine)
+
         assert xldf.columns == ["id", "q1", "q2", "q3", "q4", "trend", "h1", "h2"]
         assert xldf.row(-1) == (None, 0.0, 0.0, 0, 0, None, 0.0, 0)
 
@@ -836,18 +837,26 @@ def test_excel_write_compound_types(engine: ExcelSpreadsheetEngine) -> None:
     xls = BytesIO()
     df.write_excel(xls, worksheet="data")
 
-    # expect string conversion (only scalar values are supported)
-    xldf = pl.read_excel(
+    # also test reading from the various flavours of supported binary data
+    # across all backend engines (check bytesio, bytes, and memoryview)
+    for binary_data in (
         xls,
-        sheet_name="data",
-        engine=engine,
-        include_file_paths="wbook",
-    )
-    assert xldf.rows() == [
-        ("[1, 2]", "{'y': 'a', 'z': 9}", "in-mem"),
-        ("[3, 4]", "{'y': 'b', 'z': 8}", "in-mem"),
-        ("[5, 6]", "{'y': 'c', 'z': 7}", "in-mem"),
-    ]
+        xls.getvalue(),
+        xls.getbuffer(),
+    ):
+        xldf = pl.read_excel(
+            binary_data,
+            sheet_name="data",
+            engine=engine,
+            include_file_paths="wbook",
+        )
+
+        # expect string conversion (only scalar values are supported)
+        assert xldf.rows() == [
+            ("[1, 2]", "{'y': 'a', 'z': 9}", "in-mem"),
+            ("[3, 4]", "{'y': 'b', 'z': 8}", "in-mem"),
+            ("[5, 6]", "{'y': 'c', 'z': 7}", "in-mem"),
+        ]
 
 
 @pytest.mark.parametrize("engine", ["xlsx2csv", "openpyxl", "calamine"])
@@ -868,7 +877,7 @@ def test_excel_write_sparklines(engine: ExcelSpreadsheetEngine) -> None:
     from xlsxwriter import Workbook
 
     # note that we don't (quite) expect sparkline export to round-trip as we
-    # inject additional empty columns to hold them (which will read as nulls).
+    # inject additional empty columns to hold them (which will read as nulls)
     df = pl.DataFrame(
         {
             "id": ["aaa", "bbb", "ccc", "ddd", "eee"],