diff --git a/py-polars/polars/io/spreadsheet/functions.py b/py-polars/polars/io/spreadsheet/functions.py index 7d47cad80a69..1f79c824e46e 100644 --- a/py-polars/polars/io/spreadsheet/functions.py +++ b/py-polars/polars/io/spreadsheet/functions.py @@ -52,7 +52,9 @@ def _sources(source: FileSource) -> tuple[Any, bool]: read_multiple_workbooks = True sources: list[Any] = [] - if not isinstance(source, Sequence) or isinstance(source, str): + if isinstance(source, memoryview): + source = source.tobytes() + if not isinstance(source, Sequence) or isinstance(source, (bytes, str)): read_multiple_workbooks = False source = [source] # type: ignore[assignment] @@ -74,7 +76,7 @@ def _standardize_duplicates(s: str) -> str: return re.sub(r"_duplicated_(\d+)", repl=r"\1", string=s) -def _unpack_sheet_results( +def _unpack_read_results( frames: list[pl.DataFrame] | list[dict[str, pl.DataFrame]], *, read_multiple_workbooks: bool, @@ -394,7 +396,7 @@ def read_excel( ) for src in sources ] - return _unpack_sheet_results( + return _unpack_read_results( frames=frames, read_multiple_workbooks=read_multiple_workbooks, ) @@ -606,7 +608,7 @@ def read_ods( ) for src in sources ] - return _unpack_sheet_results( + return _unpack_read_results( frames=frames, read_multiple_workbooks=read_multiple_workbooks, ) @@ -807,12 +809,18 @@ def _initialise_spreadsheet_parser( }.items(): engine_options.setdefault(option, value) + if isinstance(source, bytes): + source = BytesIO(source) + parser = xlsx2csv.Xlsx2csv(source, **engine_options) sheets = parser.workbook.sheets return _read_spreadsheet_xlsx2csv, parser, sheets elif engine == "openpyxl": openpyxl = import_optional("openpyxl") + if isinstance(source, bytes): + source = BytesIO(source) + parser = openpyxl.load_workbook(source, data_only=True, **engine_options) sheets = [{"index": i + 1, "name": ws.title} for i, ws in enumerate(parser)] return _read_spreadsheet_openpyxl, parser, sheets @@ -830,7 +838,7 @@ def _initialise_spreadsheet_parser( raise ModuleUpgradeRequiredError(msg) if reading_bytesio: - source = source.getbuffer().tobytes() # type: ignore[union-attr] + source = source.getvalue() # type: ignore[union-attr] elif isinstance(source, (BufferedReader, TextIOWrapper)): if "b" not in source.mode: msg = f"file {source.name!r} must be opened in binary mode" diff --git a/py-polars/tests/unit/io/test_spreadsheet.py b/py-polars/tests/unit/io/test_spreadsheet.py index a217f5b9dee5..c1838351d86d 100644 --- a/py-polars/tests/unit/io/test_spreadsheet.py +++ b/py-polars/tests/unit/io/test_spreadsheet.py @@ -823,6 +823,7 @@ def test_excel_write_column_and_row_totals(engine: ExcelSpreadsheetEngine) -> No # note that the totals are written as formulae, so we # won't have the calculated values in the dataframe xldf = pl.read_excel(xls, sheet_name="misc", engine=engine) + assert xldf.columns == ["id", "q1", "q2", "q3", "q4", "trend", "h1", "h2"] assert xldf.row(-1) == (None, 0.0, 0.0, 0, 0, None, 0.0, 0) @@ -836,18 +837,26 @@ def test_excel_write_compound_types(engine: ExcelSpreadsheetEngine) -> None: xls = BytesIO() df.write_excel(xls, worksheet="data") - # expect string conversion (only scalar values are supported) - xldf = pl.read_excel( + # also test reading from the various flavours of supported binary data + # across all backend engines (check bytesio, bytes, and memoryview) + for binary_data in ( xls, - sheet_name="data", - engine=engine, - include_file_paths="wbook", - ) - assert xldf.rows() == [ - ("[1, 2]", "{'y': 'a', 'z': 9}", "in-mem"), - ("[3, 4]", "{'y': 'b', 'z': 8}", "in-mem"), - ("[5, 6]", "{'y': 'c', 'z': 7}", "in-mem"), - ] + xls.getvalue(), + xls.getbuffer(), + ): + xldf = pl.read_excel( + binary_data, + sheet_name="data", + engine=engine, + include_file_paths="wbook", + ) + + # expect string conversion (only scalar values are supported) + assert xldf.rows() == [ + ("[1, 2]", "{'y': 'a', 'z': 9}", "in-mem"), + ("[3, 4]", "{'y': 'b', 'z': 8}", "in-mem"), + ("[5, 6]", "{'y': 'c', 'z': 7}", "in-mem"), + ] @pytest.mark.parametrize("engine", ["xlsx2csv", "openpyxl", "calamine"]) @@ -868,7 +877,7 @@ def test_excel_write_sparklines(engine: ExcelSpreadsheetEngine) -> None: from xlsxwriter import Workbook # note that we don't (quite) expect sparkline export to round-trip as we - # inject additional empty columns to hold them (which will read as nulls). + # inject additional empty columns to hold them (which will read as nulls) df = pl.DataFrame( { "id": ["aaa", "bbb", "ccc", "ddd", "eee"],