Skip to content

Commit

Permalink
fix(python): Ensure read_excel and read_ods support reading from raw `bytes` for all engines (#20636)
Browse files Browse the repository at this point in the history
  • Loading branch information
alexander-beedie authored Jan 9, 2025
1 parent 09687e4 commit f3da50f
Show file tree
Hide file tree
Showing 2 changed files with 34 additions and 17 deletions.
18 changes: 13 additions & 5 deletions py-polars/polars/io/spreadsheet/functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,9 @@ def _sources(source: FileSource) -> tuple[Any, bool]:
read_multiple_workbooks = True
sources: list[Any] = []

if not isinstance(source, Sequence) or isinstance(source, str):
if isinstance(source, memoryview):
source = source.tobytes()
if not isinstance(source, Sequence) or isinstance(source, (bytes, str)):
read_multiple_workbooks = False
source = [source] # type: ignore[assignment]

Expand All @@ -74,7 +76,7 @@ def _standardize_duplicates(s: str) -> str:
return re.sub(r"_duplicated_(\d+)", repl=r"\1", string=s)


def _unpack_sheet_results(
def _unpack_read_results(
frames: list[pl.DataFrame] | list[dict[str, pl.DataFrame]],
*,
read_multiple_workbooks: bool,
Expand Down Expand Up @@ -394,7 +396,7 @@ def read_excel(
)
for src in sources
]
return _unpack_sheet_results(
return _unpack_read_results(
frames=frames,
read_multiple_workbooks=read_multiple_workbooks,
)
Expand Down Expand Up @@ -606,7 +608,7 @@ def read_ods(
)
for src in sources
]
return _unpack_sheet_results(
return _unpack_read_results(
frames=frames,
read_multiple_workbooks=read_multiple_workbooks,
)
Expand Down Expand Up @@ -807,12 +809,18 @@ def _initialise_spreadsheet_parser(
}.items():
engine_options.setdefault(option, value)

if isinstance(source, bytes):
source = BytesIO(source)

parser = xlsx2csv.Xlsx2csv(source, **engine_options)
sheets = parser.workbook.sheets
return _read_spreadsheet_xlsx2csv, parser, sheets

elif engine == "openpyxl":
openpyxl = import_optional("openpyxl")
if isinstance(source, bytes):
source = BytesIO(source)

parser = openpyxl.load_workbook(source, data_only=True, **engine_options)
sheets = [{"index": i + 1, "name": ws.title} for i, ws in enumerate(parser)]
return _read_spreadsheet_openpyxl, parser, sheets
Expand All @@ -830,7 +838,7 @@ def _initialise_spreadsheet_parser(
raise ModuleUpgradeRequiredError(msg)

if reading_bytesio:
source = source.getbuffer().tobytes() # type: ignore[union-attr]
source = source.getvalue() # type: ignore[union-attr]
elif isinstance(source, (BufferedReader, TextIOWrapper)):
if "b" not in source.mode:
msg = f"file {source.name!r} must be opened in binary mode"
Expand Down
33 changes: 21 additions & 12 deletions py-polars/tests/unit/io/test_spreadsheet.py
Original file line number Diff line number Diff line change
Expand Up @@ -823,6 +823,7 @@ def test_excel_write_column_and_row_totals(engine: ExcelSpreadsheetEngine) -> No
# note that the totals are written as formulae, so we
# won't have the calculated values in the dataframe
xldf = pl.read_excel(xls, sheet_name="misc", engine=engine)

assert xldf.columns == ["id", "q1", "q2", "q3", "q4", "trend", "h1", "h2"]
assert xldf.row(-1) == (None, 0.0, 0.0, 0, 0, None, 0.0, 0)

Expand All @@ -836,18 +837,26 @@ def test_excel_write_compound_types(engine: ExcelSpreadsheetEngine) -> None:
xls = BytesIO()
df.write_excel(xls, worksheet="data")

# expect string conversion (only scalar values are supported)
xldf = pl.read_excel(
# also test reading from the various flavours of supported binary data
# across all backend engines (check bytesio, bytes, and memoryview)
for binary_data in (
xls,
sheet_name="data",
engine=engine,
include_file_paths="wbook",
)
assert xldf.rows() == [
("[1, 2]", "{'y': 'a', 'z': 9}", "in-mem"),
("[3, 4]", "{'y': 'b', 'z': 8}", "in-mem"),
("[5, 6]", "{'y': 'c', 'z': 7}", "in-mem"),
]
xls.getvalue(),
xls.getbuffer(),
):
xldf = pl.read_excel(
binary_data,
sheet_name="data",
engine=engine,
include_file_paths="wbook",
)

# expect string conversion (only scalar values are supported)
assert xldf.rows() == [
("[1, 2]", "{'y': 'a', 'z': 9}", "in-mem"),
("[3, 4]", "{'y': 'b', 'z': 8}", "in-mem"),
("[5, 6]", "{'y': 'c', 'z': 7}", "in-mem"),
]


@pytest.mark.parametrize("engine", ["xlsx2csv", "openpyxl", "calamine"])
Expand All @@ -868,7 +877,7 @@ def test_excel_write_sparklines(engine: ExcelSpreadsheetEngine) -> None:
from xlsxwriter import Workbook

# note that we don't (quite) expect sparkline export to round-trip as we
# inject additional empty columns to hold them (which will read as nulls).
# inject additional empty columns to hold them (which will read as nulls)
df = pl.DataFrame(
{
"id": ["aaa", "bbb", "ccc", "ddd", "eee"],
Expand Down

0 comments on commit f3da50f

Please sign in to comment.