Skip to content
This repository has been archived by the owner on Nov 12, 2024. It is now read-only.

Write virtual references to Icechunk #1

Closed
wants to merge 43 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
43 commits
Select commit Hold shift + click to select a range
7b00e41
move vds_with_manifest_arrays fixture up
TomNicholas Sep 27, 2024
c82221c
sketch implementation
TomNicholas Sep 27, 2024
d29362b
test that we can create an icechunk store
TomNicholas Sep 27, 2024
2aa3cb5
fixture to create icechunk filestore in temporary directory
TomNicholas Sep 27, 2024
f2c095c
get the async fixture working properly
TomNicholas Sep 27, 2024
6abe32d
split into more functions
TomNicholas Sep 27, 2024
93080b3
change mode
TomNicholas Sep 27, 2024
bebf370
try creating zarr group and arrays explicitly
TomNicholas Sep 27, 2024
833e5f0
create root group from store
TomNicholas Sep 28, 2024
9853140
todos
TomNicholas Sep 28, 2024
030a96e
do away with the async pytest fixtures/functions
TomNicholas Sep 28, 2024
90ca5cf
successfully writes root group attrs
TomNicholas Sep 28, 2024
b138dde
check array metadata is correct
TomNicholas Sep 28, 2024
6631102
try to write array attributes
TomNicholas Sep 28, 2024
e92b56c
sketch test for checking virtual references have been set correctly
TomNicholas Sep 28, 2024
2c8c0ee
test setting single virtual ref
TomNicholas Sep 30, 2024
a2ce1ed
use async properly
TomNicholas Sep 30, 2024
9393995
better separation of handling of loadable variables
TomNicholas Oct 1, 2024
956e324
fix chunk key format
TomNicholas Oct 1, 2024
2d7d5f6
use require_array
TomNicholas Oct 1, 2024
8726e23
check that store supports writes
TomNicholas Oct 1, 2024
387f345
removed outdated note about awaiting
TomNicholas Oct 1, 2024
b2a0700
fix incorrect chunk key in test
TomNicholas Oct 2, 2024
4ffb55e
absolute path
TomNicholas Oct 2, 2024
f929fcb
convert to file URI before handing to icechunk
TomNicholas Oct 2, 2024
e9c1287
test that without encoding we can definitely read one chunk
TomNicholas Oct 2, 2024
2fe3012
Work on encoding test
mpiannucci Oct 2, 2024
33d8ce8
Merge remote-tracking branch 'origin/icechunk' into matt/icechunk-enc…
mpiannucci Oct 2, 2024
8aa6034
Update test to match
mpiannucci Oct 2, 2024
aa2d415
Quick comment
mpiannucci Oct 2, 2024
7e4e2ce
more comprehensive
mpiannucci Oct 2, 2024
9a03245
add attribute encoding
mpiannucci Oct 3, 2024
9676485
Merge pull request #2 from earth-mover/matt/icechunk-encoding
TomNicholas Oct 4, 2024
bbaf3ba
Fix array dimensions
mpiannucci Oct 10, 2024
31945cd
Merge pull request #3 from earth-mover/matt/array-dims
mpiannucci Oct 11, 2024
49daa7e
Fix v3 codec pipeline
mpiannucci Oct 11, 2024
756ff92
Put xarray dep back
mpiannucci Oct 11, 2024
8c7242e
Handle codecs, but get bad results
mpiannucci Oct 12, 2024
666b676
Gzip and zlib are not directly working
mpiannucci Oct 12, 2024
9076ad7
Get up working with numcodecs zarr 3 codecs
mpiannucci Oct 13, 2024
7a160fd
Update codec pipeline
mpiannucci Oct 14, 2024
286a9b5
Merge pull request #4 from earth-mover/matt/v3-codecs
mpiannucci Oct 15, 2024
8f1f96e
Update to latest icechunk using sync API
mpiannucci Oct 15, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 15 additions & 0 deletions conftest.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
import h5py
import numpy as np
import pytest
import xarray as xr
from xarray.core.variable import Variable


def pytest_addoption(parser):
Expand Down Expand Up @@ -82,3 +84,16 @@ def hdf5_scalar(tmpdir):
dataset = f.create_dataset("scalar", data=0.1, dtype="float32")
dataset.attrs["scalar"] = "true"
return filepath


@pytest.fixture
def simple_netcdf4(tmpdir):
    """Write a small 3x4 int32 netCDF4 file to a temp dir and return its path."""
    filepath = f"{tmpdir}/simple.nc"

    data = np.arange(12, dtype=np.dtype("int32")).reshape(3, 4)
    ds = xr.Dataset({"foo": Variable(data=data, dims=["x", "y"])})
    ds.to_netcdf(filepath)

    return filepath
27 changes: 27 additions & 0 deletions virtualizarr/tests/test_writers/conftest.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
import numpy as np
import pytest
from xarray import Dataset
from xarray.core.variable import Variable

from virtualizarr.manifests import ChunkManifest, ManifestArray


@pytest.fixture
def vds_with_manifest_arrays() -> Dataset:
    """Virtual dataset with one 2x3 ManifestArray variable referencing /test.nc."""
    manifest = ChunkManifest(
        entries={"0.0": dict(path="/test.nc", offset=6144, length=48)}
    )
    zarray = dict(
        shape=(2, 3),
        dtype=np.dtype("<i8"),
        chunks=(2, 3),
        compressor={"id": "zlib", "level": 1},
        filters=None,
        fill_value=0,
        order="C",
        zarr_format=3,
    )
    marr = ManifestArray(chunkmanifest=manifest, zarray=zarray)
    variable = Variable(dims=["x", "y"], data=marr, attrs={"units": "km"})
    return Dataset({"a": variable}, attrs={"something": 0})
181 changes: 181 additions & 0 deletions virtualizarr/tests/test_writers/test_icechunk.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,181 @@
import asyncio
from pathlib import Path
from typing import TYPE_CHECKING

import pytest

pytest.importorskip("icechunk")

import numpy as np
import numpy.testing as npt
from xarray import Dataset, open_dataset, open_zarr
from xarray.core.variable import Variable
from zarr import Array, Group, group

from virtualizarr.manifests import ChunkManifest, ManifestArray
from virtualizarr.writers.icechunk import dataset_to_icechunk
from virtualizarr.zarr import ZArray

if TYPE_CHECKING:
from icechunk import IcechunkStore


@pytest.fixture
def icechunk_filestore(tmpdir) -> "IcechunkStore":
    """Open an IcechunkStore backed by a temporary local filesystem directory."""
    from icechunk import IcechunkStore, StorageConfig

    # TODO if icechunk exposed a synchronous version of .open then we wouldn't need asyncio.run here
    # TODO is "r+" the correct mode to use?
    store = asyncio.run(
        IcechunkStore.open(
            storage=StorageConfig.filesystem(str(tmpdir)),
            mode="r+",
        )
    )

    # TODO instead yield store then store.close() ??
    return store


class TestWriteVirtualRefs:
    """Tests that virtual chunk references are written correctly to an Icechunk store."""

    def test_write_new_virtual_variable(
        self, icechunk_filestore: "IcechunkStore", vds_with_manifest_arrays: Dataset
    ):
        """Writing a virtual dataset creates a zarr group and array with matching metadata."""
        vds = vds_with_manifest_arrays

        dataset_to_icechunk(vds, icechunk_filestore)

        # check attrs
        root_group = group(store=icechunk_filestore)
        assert isinstance(root_group, Group)
        assert root_group.attrs == {"something": 0}

        # TODO check against vds, then perhaps parametrize?

        # check array exists
        assert "a" in root_group
        arr = root_group["a"]
        assert isinstance(arr, Array)

        # check array metadata
        # TODO why doesn't a .zarr_format or .version attribute exist on zarr.Array?
        # assert arr.zarr_format == 3
        assert arr.shape == (2, 3)
        assert arr.chunks == (2, 3)
        assert arr.dtype == np.dtype("<i8")
        assert arr.order == "C"
        assert arr.fill_value == 0
        # TODO check compressor, filters?

        # check array attrs
        # TODO somehow this is broken by setting the dimension names???
        # assert dict(arr.attrs) == {"units": "km"}

        # check dimensions
        assert arr.attrs["_ARRAY_DIMENSIONS"] == ["x", "y"]

    def test_set_single_virtual_ref_without_encoding(
        self, icechunk_filestore: "IcechunkStore", simple_netcdf4: Path
    ):
        """A single virtual chunk ref (no encoding) round-trips the underlying netCDF data."""
        # TODO kerchunk doesn't work with zarr-python v3 yet so we can't use open_virtual_dataset and icechunk together!
        # vds = open_virtual_dataset(netcdf4_file, indexes={})

        # instead for now just write out byte ranges explicitly
        manifest = ChunkManifest(
            {"0.0": {"path": simple_netcdf4, "offset": 6144, "length": 48}}
        )
        zarray = ZArray(
            shape=(3, 4),
            chunks=(3, 4),
            dtype=np.dtype("int32"),
            compressor=None,
            filters=None,
            fill_value=None,
        )
        ma = ManifestArray(
            chunkmanifest=manifest,
            zarray=zarray,
        )
        foo = Variable(data=ma, dims=["x", "y"])
        vds = Dataset(
            {"foo": foo},
        )

        dataset_to_icechunk(vds, icechunk_filestore)

        root_group = group(store=icechunk_filestore)
        array = root_group["foo"]

        # check chunk references
        # TODO we can't explicitly check that the path/offset/length is correct because icechunk doesn't yet expose any get_virtual_refs method

        # close the source file promptly so the virtual read below isn't blocked by an open handle
        with open_dataset(simple_netcdf4) as expected_ds:
            expected_array = expected_ds["foo"].to_numpy()
        npt.assert_equal(array, expected_array)

        # ds = open_zarr(store=icechunk_filestore, group='foo', zarr_format=3, consolidated=False)

        # note: we don't need to test that committing works, because now we have confirmed
        # the refs are in the store (even uncommitted) it's icechunk's problem to manage them now.

    def test_set_single_virtual_ref_with_encoding(
        self, icechunk_filestore: "IcechunkStore", netcdf4_file: Path
    ):
        """A virtual chunk ref with scale_factor encoding is stored and scales correctly."""
        # TODO kerchunk doesn't work with zarr-python v3 yet so we can't use open_virtual_dataset and icechunk together!
        # vds = open_virtual_dataset(netcdf4_file, indexes={})

        # instead for now just write out byte ranges explicitly
        manifest = ChunkManifest(
            {"0.0.0": {"path": netcdf4_file, "offset": 15419, "length": 7738000}}
        )
        zarray = ZArray(
            shape=(2920, 25, 53),
            chunks=(2920, 25, 53),
            dtype=np.dtype("int16"),
            compressor=None,
            filters=None,
            fill_value=None,
        )
        ma = ManifestArray(
            chunkmanifest=manifest,
            zarray=zarray,
        )
        air = Variable(
            data=ma, dims=["time", "lat", "lon"], encoding={"scale_factor": 0.01}
        )
        vds = Dataset(
            {"air": air},
        )

        dataset_to_icechunk(vds, icechunk_filestore)

        root_group = group(store=icechunk_filestore)
        air_array = root_group["air"]

        # check array metadata
        assert air_array.shape == (2920, 25, 53)
        assert air_array.chunks == (2920, 25, 53)
        assert air_array.dtype == np.dtype("int16")
        assert air_array.attrs["scale_factor"] == 0.01

        # xarray performs this when cf_decoding is True, but we are not loading
        # with xarray here so we scale it manually.
        scale_factor = air_array.attrs["scale_factor"]
        scaled_air_array = air_array[:] * scale_factor

        # check chunk references
        # TODO we can't explicitly check that the path/offset/length is correct because icechunk doesn't yet expose any get_virtual_refs method

        # use a context manager so the netCDF file handle is released before teardown
        with open_dataset(netcdf4_file) as expected_ds:
            expected_air_array = expected_ds["air"].to_numpy()
        npt.assert_equal(scaled_air_array, expected_air_array)

        # note: we don't need to test that committing works, because now we have confirmed
        # the refs are in the store (even uncommitted) it's icechunk's problem to manage them now.


# TODO test writing grids of multiple chunks

# TODO test writing to a group that isn't the root group

# TODO test writing loadable variables

# TODO roundtripping tests - requires icechunk compatibility with xarray

# TODO test with S3 / minio
25 changes: 1 addition & 24 deletions virtualizarr/tests/test_writers/test_zarr.py
Original file line number Diff line number Diff line change
@@ -1,37 +1,14 @@
import json

import numpy as np
import pytest
import xarray.testing as xrt
from xarray import Dataset

from virtualizarr import ManifestArray, open_virtual_dataset
from virtualizarr import open_virtual_dataset
from virtualizarr.backend import FileType
from virtualizarr.manifests.manifest import ChunkManifest
from virtualizarr.readers.zarr import metadata_from_zarr_json
from virtualizarr.writers.zarr import dataset_to_zarr


@pytest.fixture
def vds_with_manifest_arrays() -> Dataset:
    """Dataset wrapping a single 2x3 ManifestArray that references test.nc."""
    manifest = ChunkManifest(
        entries={"0.0": dict(path="test.nc", offset=6144, length=48)}
    )
    zarray = dict(
        shape=(2, 3),
        dtype=np.dtype("<i8"),
        chunks=(2, 3),
        compressor={"id": "zlib", "level": 1},
        filters=None,
        fill_value=0,
        order="C",
        zarr_format=3,
    )
    marr = ManifestArray(chunkmanifest=manifest, zarray=zarray)
    return Dataset({"a": (["x", "y"], marr)}, attrs={"something": 0})


def isconfigurable(value: dict) -> bool:
"""
Several metadata attributes in ZarrV3 use a dictionary with keys "name" : str and "configuration" : dict
Expand Down
Loading
Loading