From ca99d5adeb794b35bca91fe76587b44c6d13db12 Mon Sep 17 00:00:00 2001 From: Tom Nicholas Date: Wed, 15 May 2024 21:18:15 -0600 Subject: [PATCH] Test fsspec roundtrip (#42) * move kerchunk backend imports to be specific to each backend filetype * test roundtrip to json file then reading using fsspec * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * add test env dependencies * more test env deps * more * add pip install of xarray PR * correct pip url * roundtrip test involving concatenation * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * remove duplication of pooch * correct formatting * try removing netcdf4-python from the environment --------- Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- pyproject.toml | 6 +-- virtualizarr/tests/test_integration.py | 60 ++++++++++++++++++++++++++ 2 files changed, 63 insertions(+), 3 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 16e8486a..8338279c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -28,8 +28,7 @@ dependencies = [ "numpy", "ujson", "packaging", - "universal-pathlib" - + "universal-pathlib", ] [project.optional-dependencies] @@ -39,8 +38,9 @@ test = [ "pytest-mypy", "pytest-cov", "pytest", - "scipy", + "fsspec", "pooch", + "scipy", "ruff", "fastparquet", "s3fs" diff --git a/virtualizarr/tests/test_integration.py b/virtualizarr/tests/test_integration.py index 578bfab1..3d199b73 100644 --- a/virtualizarr/tests/test_integration.py +++ b/virtualizarr/tests/test_integration.py @@ -1,3 +1,4 @@ +import fsspec import pytest import xarray as xr import xarray.testing as xrt @@ -5,6 +6,65 @@ from virtualizarr import open_virtual_dataset +def test_kerchunk_roundtrip_no_concat(tmpdir): + # set up example xarray dataset + ds = xr.tutorial.open_dataset("air_temperature", decode_times=False) + + # save it to disk as netCDF (in temporary directory) + 
ds.to_netcdf(f"{tmpdir}/air.nc") + + # use open_virtual_dataset to read it as references + vds = open_virtual_dataset(f"{tmpdir}/air.nc", indexes={}) + + # write those references to disk as kerchunk json + vds.virtualize.to_kerchunk(f"{tmpdir}/refs.json", format="json") + + # use fsspec to read the dataset from disk via the zarr store + fs = fsspec.filesystem("reference", fo=f"{tmpdir}/refs.json") + m = fs.get_mapper("") + + roundtrip = xr.open_dataset(m, engine="kerchunk") + + # assert equal to original dataset + xrt.assert_equal(roundtrip, ds) + + +def test_kerchunk_roundtrip_concat(tmpdir): + # set up example xarray dataset + ds = xr.tutorial.open_dataset("air_temperature", decode_times=False).isel( + time=slice(None, 2000) + ) + + # split into two datasets + ds1, ds2 = ds.isel(time=slice(None, 1000)), ds.isel(time=slice(1000, None)) + + # save them to disk as netCDF (in temporary directory) + ds1.to_netcdf(f"{tmpdir}/air1.nc") + ds2.to_netcdf(f"{tmpdir}/air2.nc") + + # use open_virtual_dataset to read them as references + vds1 = open_virtual_dataset(f"{tmpdir}/air1.nc", indexes={}) + vds2 = open_virtual_dataset(f"{tmpdir}/air2.nc", indexes={}) + + # concatenate virtually along time + vds = xr.concat([vds1, vds2], dim="time", coords="minimal", compat="override") + print(vds["air"].variable._data) + + # write those references to disk as kerchunk json + vds.virtualize.to_kerchunk(f"{tmpdir}/refs.json", format="json") + + # use fsspec to read the dataset from disk via the zarr store + fs = fsspec.filesystem("reference", fo=f"{tmpdir}/refs.json") + m = fs.get_mapper("") + + roundtrip = xr.open_dataset(m, engine="kerchunk") + + # user does analysis here + + # assert equal to original dataset + xrt.assert_equal(roundtrip, ds) + + def test_open_scalar_variable(tmpdir): # regression test for GH issue #100