diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 7daafb43..0550236f 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -12,6 +12,10 @@ on: schedule: - cron: "0 0 * * *" +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true + jobs: test: diff --git a/.github/workflows/min-deps.yml b/.github/workflows/min-deps.yml new file mode 100644 index 00000000..066e1ba3 --- /dev/null +++ b/.github/workflows/min-deps.yml @@ -0,0 +1,60 @@ +name: min-deps + +on: + push: + branches: [ "main" ] + paths-ignore: + - 'docs/**' + pull_request: + branches: [ "main" ] + paths-ignore: + - 'docs/**' + schedule: + - cron: "0 0 * * *" + +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true + +jobs: + + test: + name: ${{ matrix.python-version }}-build + runs-on: ubuntu-latest + defaults: + run: + shell: bash -l {0} + strategy: + matrix: + python-version: ["3.12"] + steps: + - uses: actions/checkout@v4 + + - name: Setup micromamba + uses: mamba-org/setup-micromamba@v1 + with: + environment-file: ci/min-deps.yml + cache-environment: true + create-args: >- + python=${{matrix.python-version}} + + - name: Install virtualizarr + run: | + python -m pip install -e . --no-deps + - name: Conda list information + run: | + conda env list + conda list + + - name: Running Tests + run: | + python -m pytest ./virtualizarr --cov=./ --cov-report=xml --verbose + + - name: Upload code coverage to Codecov + uses: codecov/codecov-action@v3.1.4 + with: + file: ./coverage.xml + flags: unittests + env_vars: OS,PYTHON + name: codecov-umbrella + fail_ci_if_error: false diff --git a/.github/workflows/typing.yml b/.github/workflows/typing.yml new file mode 100644 index 00000000..0540801b --- /dev/null +++ b/.github/workflows/typing.yml @@ -0,0 +1,38 @@ +name: Typing + +on: + push: + branches: [ "main" ] + paths-ignore: + - 'docs/**' + pull_request: + branches: [ "main" ] + paths-ignore: + - 'docs/**' + schedule: + - cron: "0 0 * * *" + +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true + +jobs: + mypy: + name: mypy + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + + - name: Set up Python + uses: actions/setup-python@v4 + with: + python-version: '3.12' + + - name: Install deps + run: | + # We need to test optional dep to add all the library stubs + pip install -e '.[test]' + + - name: Type check + run: | + mypy virtualizarr diff --git a/.gitignore b/.gitignore index d360cfa4..d6720a7a 100644 --- a/.gitignore +++ b/.gitignore @@ -160,3 +160,4 @@ cython_debug/ #.idea/ virtualizarr/_version.py docs/generated/ +examples/ diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index a5670c75..3bae6a6c 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -3,7 +3,7 @@ ci: autoupdate_schedule: monthly repos: - repo: https://github.com/pre-commit/pre-commit-hooks - rev: v4.6.0 + rev: v5.0.0 hooks: - id: trailing-whitespace - id: end-of-file-fixer @@ -11,34 +11,10 @@ repos: - repo: https://github.com/astral-sh/ruff-pre-commit # Ruff version. - rev: "v0.4.7" + rev: "v0.6.9" hooks: # Run the linter. - id: ruff args: [ --fix ] # Run the formatter. 
- id: ruff-format - - - repo: https://github.com/pre-commit/mirrors-mypy - rev: v1.10.0 - hooks: - - id: mypy - # Copied from setup.cfg - exclude: "properties|asv_bench|docs" - additional_dependencies: [ - # Type stubs - types-python-dateutil, - types-pkg_resources, - types-PyYAML, - types-pytz, - # Dependencies that are typed - numpy, - typing-extensions>=4.1.0, - ] - # run this occasionally, ref discussion https://github.com/pydata/xarray/pull/3194 - # - repo: https://github.com/asottile/pyupgrade - # rev: v3.15.2 - # hooks: - # - id: pyupgrade - # args: - # - "--py310-plus" diff --git a/README.md b/README.md index c481d542..f415b356 100644 --- a/README.md +++ b/README.md @@ -4,6 +4,8 @@ VirtualiZarr (pronounced like "virtualize" but more piratey) grew out of [discussions](https://github.com/fsspec/kerchunk/issues/377) on the [kerchunk repository](https://github.com/fsspec/kerchunk), and is an attempt to provide the game-changing power of kerchunk in a zarr-native way, and with a familiar array-like API. +You now have a choice between using VirtualiZarr and Kerchunk: VirtualiZarr provides [almost all the same features](https://virtualizarr.readthedocs.io/en/latest/faq.html#how-do-virtualizarr-and-kerchunk-compare) as Kerchunk. + _Please see the [documentation](https://virtualizarr.readthedocs.io/en/latest/)_ ### Development Status and Roadmap diff --git a/ci/doc.yml b/ci/doc.yml index 7d7e9224..ccc3ded6 100644 --- a/ci/doc.yml +++ b/ci/doc.yml @@ -13,4 +13,3 @@ dependencies: - "sphinx_design" - "sphinx_togglebutton" - "sphinx-autodoc-typehints" - - -e "..[test]" diff --git a/ci/environment.yml b/ci/environment.yml index a41a99d4..883463a2 100644 --- a/ci/environment.yml +++ b/ci/environment.yml @@ -9,7 +9,6 @@ dependencies: - netcdf4 - xarray>=2024.6.0 - kerchunk>=0.2.5 - - pydantic - numpy>=2.0.0 - ujson - packaging @@ -17,7 +16,9 @@ dependencies: # Testing - codecov - pre-commit + - mypy - ruff + - pandas-stubs - pytest-mypy - pytest-cov - pytest diff --git a/ci/min-deps.yml b/ci/min-deps.yml new file mode 100644 index 00000000..7ca8c0b3 --- /dev/null +++ b/ci/min-deps.yml @@ -0,0 +1,26 @@ +name: virtualizarr-min-deps +channels: + - conda-forge + - nodefaults +dependencies: + - h5netcdf + - h5py + - hdf5 + - netcdf4 + - xarray>=2024.6.0 + - numpy>=2.0.0 + - numcodecs + - packaging + - ujson + - universal_pathlib + # Testing + - codecov + - pre-commit + - mypy + - ruff + - pandas-stubs + - pytest-mypy + - pytest-cov + - pytest + - pooch + - fsspec diff --git a/conftest.py b/conftest.py index 8d5e351e..3af4bf06 100644 --- a/conftest.py +++ b/conftest.py @@ -1,3 +1,4 @@ +import h5py import pytest import xarray as xr @@ -32,6 +33,33 @@ def netcdf4_file(tmpdir): return filepath +@pytest.fixture +def netcdf4_virtual_dataset(netcdf4_file): + from virtualizarr import open_virtual_dataset + + return open_virtual_dataset(netcdf4_file, indexes={}) + + +@pytest.fixture +def netcdf4_inlined_ref(netcdf4_file): + from kerchunk.hdf import SingleHdf5ToZarr + + return SingleHdf5ToZarr(netcdf4_file, inline_threshold=1000).translate() + + +@pytest.fixture +def hdf5_groups_file(tmpdir): + # Set up example xarray dataset + ds = xr.tutorial.open_dataset("air_temperature") + + # Save it to disk as netCDF (in temporary directory) + filepath = f"{tmpdir}/air.nc" + ds.to_netcdf(filepath, format="NETCDF4", group="test/group") + ds.close() + + return filepath + + @pytest.fixture def netcdf4_files(tmpdir): # Set up example xarray dataset @@ -50,3 +78,21 @@ def netcdf4_files(tmpdir): ds2.close() return filepath1, 
filepath2 + + +@pytest.fixture +def hdf5_empty(tmpdir): + filepath = f"{tmpdir}/empty.nc" + f = h5py.File(filepath, "w") + dataset = f.create_dataset("empty", shape=(), dtype="float32") + dataset.attrs["empty"] = "true" + return filepath + + +@pytest.fixture +def hdf5_scalar(tmpdir): + filepath = f"{tmpdir}/scalar.nc" + f = h5py.File(filepath, "w") + dataset = f.create_dataset("scalar", data=0.1, dtype="float32") + dataset.attrs["scalar"] = "true" + return filepath diff --git a/docs/api.rst b/docs/api.rst index 3dc1d146..81d08a77 100644 --- a/docs/api.rst +++ b/docs/api.rst @@ -21,7 +21,7 @@ Manifests Reading ======= -.. currentmodule:: virtualizarr.xarray +.. currentmodule:: virtualizarr.backend .. autosummary:: :nosignatures: :toctree: generated/ @@ -32,7 +32,7 @@ Reading Serialization ============= -.. currentmodule:: virtualizarr.xarray +.. currentmodule:: virtualizarr.accessor .. autosummary:: :nosignatures: :toctree: generated/ @@ -44,7 +44,7 @@ Serialization Rewriting ============= -.. currentmodule:: virtualizarr.xarray +.. currentmodule:: virtualizarr.accessor .. autosummary:: :nosignatures: :toctree: generated/ diff --git a/docs/conf.py b/docs/conf.py index 5ec5ff9d..d5312069 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -55,11 +55,23 @@ html_theme = "pydata_sphinx_theme" html_theme_options = { - "repository_url": "https://github.com/TomNicholas/VirtualiZarr", - "repository_branch": "main", - "path_to_docs": "docs", + "use_edit_page_button": True, + "icon_links": [ + { + "name": "GitHub", + "url": "https://github.com/zarr-developers/VirtualiZarr", + "icon": "fa-brands fa-github", + "type": "fontawesome", + }, + ] } html_title = "VirtualiZarr" +html_context = { + "github_user": "zarr-developers", + "github_repo": "VirtualiZarr", + "github_version": "main", + "doc_path": "docs", +} # remove sidebar, see GH issue #82 html_css_files = [ diff --git a/docs/faq.md b/docs/faq.md index df4af749..d273a529 100644 --- a/docs/faq.md +++ b/docs/faq.md @@ -16,6 +16,8 @@ The above steps would also be performed using the `kerchunk` library alone, but ## How do VirtualiZarr and Kerchunk compare? +You now have a choice between using VirtualiZarr and Kerchunk: VirtualiZarr provides [almost all the same features](https://virtualizarr.readthedocs.io/en/latest/faq.html#how-do-virtualizarr-and-kerchunk-compare) as Kerchunk. + Users of kerchunk may find the following comparison table useful, which shows which features of kerchunk map on to which features of VirtualiZarr. | Component / Feature | Kerchunk | VirtualiZarr | | ------------------------------------------------------------------------ | ----------------------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------ | diff --git a/docs/index.md b/docs/index.md index d1beb291..0e79418f 100644 --- a/docs/index.md +++ b/docs/index.md @@ -4,6 +4,8 @@ VirtualiZarr grew out of [discussions](https://github.com/fsspec/kerchunk/issues/377) on the [kerchunk repository](https://github.com/fsspec/kerchunk), and is an attempt to provide the game-changing power of kerchunk in a zarr-native way, and with a familiar array-like API. +You now have a choice between using VirtualiZarr and Kerchunk: VirtualiZarr provides [almost all the same features](https://virtualizarr.readthedocs.io/en/latest/faq.html#how-do-virtualizarr-and-kerchunk-compare) as Kerchunk. 
+ ## Motivation The Kerchunk idea solves an incredibly important problem: accessing big archival datasets via a cloud-optimized pattern, but without copying or modifying the original data in any way. This is a win-win-win for users, data engineers, and data providers. Users see fast-opening zarr-compliant stores that work performantly with libraries like xarray and dask, data engineers can provide this speed by adding a lightweight virtualization layer on top of existing data (without having to ask anyone's permission), and data providers don't have to change anything about their legacy files for them to be used in a cloud-optimized way. diff --git a/docs/installation.md b/docs/installation.md index b4c01d57..03272a2f 100644 --- a/docs/installation.md +++ b/docs/installation.md @@ -1,11 +1,15 @@ # Installation -Currently you need to clone VirtualiZarr and install it locally: +VirtualiZarr is available on PyPI via pip: ```shell -git clone https://github.com/zarr-developers/VirtualiZarr -cd VirtualiZarr -pip install -e . +pip install virtualizarr +``` + +and on conda-forge: + +```shell +conda install -c conda-forge virtualizarr ``` diff --git a/docs/releases.rst b/docs/releases.rst index 8bbd1089..8e29154e 100644 --- a/docs/releases.rst +++ b/docs/releases.rst @@ -9,24 +9,70 @@ v1.0.1 (unreleased) New Features ~~~~~~~~~~~~ + +- Can open `kerchunk` reference files with ``open_virtual_dataset``. + (:pull:`251`, :pull:`186`) By `Raphael Hagen `_ & `Kristen Thyng `_. + +- Adds defaults for `open_virtual_dataset_from_v3_store` in (:pull:`234`) + By `Raphael Hagen `_. + +- New ``group`` option on ``open_virtual_dataset`` enables extracting specific HDF Groups. + (:pull:`165`) By `Scott Henderson `_. + +- Adds `decode_times` to open_virtual_dataset (:pull:`232`) + By `Raphael Hagen `_. + +- Add parser for the OPeNDAP DMR++ XML format and integration with open_virtual_dataset (:pull:`113`) + By `Ayush Nag `_. + +- Load scalar variables by default. (:pull:`205`) + By `Gustavo Hidalgo `_. + +- Support empty files (:pull:`260`) + By `Justus Magin `_. + Breaking changes ~~~~~~~~~~~~~~~~ - Requires Zarr 3.0 (for :pull:`182`). By `Gustavo Hidalgo `_. +- Serialize valid ZarrV3 metadata and require full compressor numcodec config (for :pull:`193`) + By `Gustavo Hidalgo `_. +- VirtualiZarr's `ZArray`, `ChunkEntry`, and `Codec` no longer subclass + `pydantic.BaseModel` (:pull:`210`) +- `ZArray`'s `__init__` signature has changed to match `zarr.Array`'s (:pull:`210`) + Deprecations ~~~~~~~~~~~~ +- Depreciates cftime_variables in open_virtual_dataset in favor of decode_times. (:pull:`232`) + By `Raphael Hagen `_. + Bug fixes ~~~~~~~~~ +- Exclude empty chunks during `ChunkDict` construction. (:pull:`198`) + By `Gustavo Hidalgo `_. +- Fixed regression in `fill_value` handling for datetime dtypes making virtual + Zarr stores unreadable (:pull:`206`) + By `Timothy Hodson `_ + Documentation ~~~~~~~~~~~~~ +- Adds virtualizarr + coiled serverless example notebook (:pull:`223`) + By `Raphael Hagen `_. + + Internal Changes ~~~~~~~~~~~~~~~~ +- Refactored internal structure significantly to split up everything to do with reading references from that to do with writing references. + (:issue:`229`) (:pull:`231`) By `Tom Nicholas `_. +- Refactored readers to consider every filetype as a separate reader, all standardized to present the same `open_virtual_dataset` interface internally. + (:pull:`261`) By `Tom Nicholas `_. + .. 
_v1.0.0: v1.0.0 (9th July 2024) diff --git a/docs/usage.md b/docs/usage.md index b0935286..a0f9d058 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -306,8 +306,8 @@ Dimensions: (time: 2920, lat: 25, lon: 53) Coordinates: lat (lat) float32 100B ManifestArray=2024.06.0", - "kerchunk>=0.2.5", - "h5netcdf", - "pydantic", "numpy>=2.0.0", - "ujson", "packaging", "universal-pathlib", "zarr>=3.0.0a0" + "numcodecs", + "ujson", ] [project.optional-dependencies] test = [ "codecov", + "fastparquet", + "fsspec", + "h5netcdf", + "h5py", + "kerchunk>=0.2.5", + "mypy", + "netcdf4", + "pandas-stubs", + "pooch", "pre-commit", - "ruff", - "pytest-mypy", "pytest-cov", + "pytest-mypy", "pytest", - "pooch", - "scipy", - "netcdf4", - "fsspec", + "ruff", "s3fs", - "fastparquet", + "scipy", ] @@ -70,12 +73,26 @@ exclude = ["docs", "tests", "tests.*", "docs.*"] [tool.setuptools.package-data] datatree = ["py.typed"] - - -[mypy] +[tool.mypy] files = "virtualizarr/**/*.py" show_error_codes = true +[[tool.mypy.overrides]] +module = "fsspec.*" +ignore_missing_imports = true + +[[tool.mypy.overrides]] +module = "numcodecs.*" +ignore_missing_imports = true + +[[tool.mypy.overrides]] +module = "kerchunk.*" +ignore_missing_imports = true + +[[tool.mypy.overrides]] +module = "ujson.*" +ignore_missing_imports = true + [tool.ruff] # Same as Black. line-length = 88 diff --git a/virtualizarr/__init__.py b/virtualizarr/__init__.py index 11bdae6e..bd70f834 100644 --- a/virtualizarr/__init__.py +++ b/virtualizarr/__init__.py @@ -1,6 +1,6 @@ -from .manifests import ChunkManifest, ManifestArray # type: ignore # noqa -from .xarray import VirtualiZarrDatasetAccessor # type: ignore # noqa -from .xarray import open_virtual_dataset # noqa: F401 +from virtualizarr.manifests import ChunkManifest, ManifestArray # type: ignore # noqa +from virtualizarr.accessor import VirtualiZarrDatasetAccessor # type: ignore # noqa +from virtualizarr.backend import open_virtual_dataset # noqa: F401 from importlib.metadata import version as _version diff --git a/virtualizarr/accessor.py b/virtualizarr/accessor.py new file mode 100644 index 00000000..cc251e63 --- /dev/null +++ b/virtualizarr/accessor.py @@ -0,0 +1,167 @@ +from pathlib import Path +from typing import ( + Callable, + Literal, + overload, +) + +from xarray import Dataset, register_dataset_accessor + +from virtualizarr.manifests import ManifestArray +from virtualizarr.types.kerchunk import KerchunkStoreRefs +from virtualizarr.writers.kerchunk import dataset_to_kerchunk_refs +from virtualizarr.writers.zarr import dataset_to_zarr + + +@register_dataset_accessor("virtualize") +class VirtualiZarrDatasetAccessor: + """ + Xarray accessor for writing out virtual datasets to disk. + + Methods on this object are called via `ds.virtualize.{method}`. + """ + + def __init__(self, ds: Dataset): + self.ds: Dataset = ds + + def to_zarr(self, storepath: str) -> None: + """ + Serialize all virtualized arrays in this xarray dataset as a Zarr store. + + Currently requires all variables to be backed by ManifestArray objects. + + Not very useful until some implementation of a Zarr reader can actually read these manifest.json files. + See https://github.com/zarr-developers/zarr-specs/issues/287 + + Parameters + ---------- + storepath : str + """ + dataset_to_zarr(self.ds, storepath) + + @overload + def to_kerchunk( + self, filepath: None, format: Literal["dict"] + ) -> KerchunkStoreRefs: ... + + @overload + def to_kerchunk(self, filepath: str | Path, format: Literal["json"]) -> None: ... 
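A minimal usage sketch of the `ds.virtualize.to_kerchunk` accessor method being added in this new `accessor.py`, assuming a hypothetical local `air.nc` file; the three output formats follow the overloads and docstring in this hunk.

```python
from virtualizarr import open_virtual_dataset

vds = open_virtual_dataset("air.nc", indexes={})  # hypothetical input file

# In-memory dict of kerchunk references (format defaults to "dict")
refs = vds.virtualize.to_kerchunk(format="dict")

# "json" and "parquet" require a filepath; the paths here are placeholders
vds.virtualize.to_kerchunk("air_refs.json", format="json")
vds.virtualize.to_kerchunk("air_refs", format="parquet", record_size=100_000)
```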
+ + @overload + def to_kerchunk( + self, + filepath: str | Path, + format: Literal["parquet"], + record_size: int = 100_000, + categorical_threshold: int = 10, + ) -> None: ... + + def to_kerchunk( + self, + filepath: str | Path | None = None, + format: Literal["dict", "json", "parquet"] = "dict", + record_size: int = 100_000, + categorical_threshold: int = 10, + ) -> KerchunkStoreRefs | None: + """ + Serialize all virtualized arrays in this xarray dataset into the kerchunk references format. + + Parameters + ---------- + filepath : str, default: None + File path to write kerchunk references into. Not required if format is 'dict'. + format : 'dict', 'json', or 'parquet' + Format to serialize the kerchunk references as. + If 'json' or 'parquet' then the 'filepath' argument is required. + record_size (parquet only): int + Number of references to store in each reference file (default 100,000). Bigger values + mean fewer read requests but larger memory footprint. + categorical_threshold (parquet only) : int + Encode urls as pandas.Categorical to reduce memory footprint if the ratio + of the number of unique urls to total number of refs for each variable + is greater than or equal to this number. (default 10) + + References + ---------- + https://fsspec.github.io/kerchunk/spec.html + """ + refs = dataset_to_kerchunk_refs(self.ds) + + if format == "dict": + return refs + elif format == "json": + import ujson + + if filepath is None: + raise ValueError("Filepath must be provided when format is 'json'") + + with open(filepath, "w") as json_file: + ujson.dump(refs, json_file) + + return None + elif format == "parquet": + from kerchunk.df import refs_to_dataframe + + if isinstance(filepath, Path): + url = str(filepath) + elif isinstance(filepath, str): + url = filepath + + # refs_to_dataframe is responsible for writing to parquet. + # at no point does it create a full in-memory dataframe. + refs_to_dataframe( + refs, + url=url, + record_size=record_size, + categorical_threshold=categorical_threshold, + ) + return None + else: + raise ValueError(f"Unrecognized output format: {format}") + + def rename_paths( + self, + new: str | Callable[[str], str], + ) -> Dataset: + """ + Rename paths to chunks in every ManifestArray in this dataset. + + Accepts either a string, in which case this new path will be used for all chunks, or + a function which accepts the old path and returns the new path. + + Parameters + ---------- + new + New path to use for all chunks, either as a string, or as a function which accepts and returns strings. + + Returns + ------- + Dataset + + Examples + -------- + Rename paths to reflect moving the referenced files from local storage to an S3 bucket. + + >>> def local_to_s3_url(old_local_path: str) -> str: + ... from pathlib import Path + ... + ... new_s3_bucket_url = "http://s3.amazonaws.com/my_bucket/" + ... + ... filename = Path(old_local_path).name + ... 
return str(new_s3_bucket_url / filename) + + >>> ds.virtualize.rename_paths(local_to_s3_url) + + See Also + -------- + ManifestArray.rename_paths + ChunkManifest.rename_paths + """ + + new_ds = self.ds.copy() + for var_name in new_ds.variables: + data = new_ds[var_name].data + if isinstance(data, ManifestArray): + new_ds[var_name].data = data.rename_paths(new=new) + + return new_ds diff --git a/virtualizarr/backend.py b/virtualizarr/backend.py new file mode 100644 index 00000000..0322f604 --- /dev/null +++ b/virtualizarr/backend.py @@ -0,0 +1,201 @@ +import warnings +from collections.abc import Iterable, Mapping +from enum import Enum, auto +from pathlib import Path +from typing import ( + Any, + Optional, +) + +from xarray import Dataset +from xarray.core.indexes import Index + +from virtualizarr.manifests import ManifestArray +from virtualizarr.readers import ( + DMRPPVirtualBackend, + FITSVirtualBackend, + HDF5VirtualBackend, + KerchunkVirtualBackend, + NetCDF3VirtualBackend, + TIFFVirtualBackend, + ZarrV3VirtualBackend, +) +from virtualizarr.utils import _FsspecFSFromFilepath, check_for_collisions + +# TODO add entrypoint to allow external libraries to add to this mapping +VIRTUAL_BACKENDS = { + "kerchunk": KerchunkVirtualBackend, + "zarr_v3": ZarrV3VirtualBackend, + "dmrpp": DMRPPVirtualBackend, + # all the below call one of the kerchunk backends internally (https://fsspec.github.io/kerchunk/reference.html#file-format-backends) + "netcdf3": NetCDF3VirtualBackend, + "hdf5": HDF5VirtualBackend, + "netcdf4": HDF5VirtualBackend, # note this is the same as for hdf5 + "tiff": TIFFVirtualBackend, + "fits": FITSVirtualBackend, +} + + +class AutoName(Enum): + # Recommended by official Python docs for auto naming: + # https://docs.python.org/3/library/enum.html#using-automatic-values + def _generate_next_value_(name, start, count, last_values): + return name + + +class FileType(AutoName): + netcdf3 = auto() + netcdf4 = auto() # NOTE: netCDF4 is a subset of hdf5 + hdf4 = auto() + hdf5 = auto() + grib = auto() + tiff = auto() + fits = auto() + zarr = auto() + dmrpp = auto() + zarr_v3 = auto() + kerchunk = auto() + + +def automatically_determine_filetype( + *, + filepath: str, + reader_options: Optional[dict[str, Any]] = {}, +) -> FileType: + """ + Attempt to automatically infer the correct reader for this filetype. + + Uses magic bytes and file / directory suffixes. + """ + + # TODO this should ideally handle every filetype that we have a reader for, not just kerchunk + + # TODO how do we handle kerchunk json / parquet here? + if Path(filepath).suffix == ".zarr": + # TODO we could imagine opening an existing zarr store, concatenating it, and writing a new virtual one... 
+ raise NotImplementedError() + + # Read magic bytes from local or remote file + fpath = _FsspecFSFromFilepath( + filepath=filepath, reader_options=reader_options + ).open_file() + magic_bytes = fpath.read(8) + fpath.close() + + if magic_bytes.startswith(b"CDF"): + filetype = FileType.netcdf3 + elif magic_bytes.startswith(b"\x0e\x03\x13\x01"): + raise NotImplementedError("HDF4 formatted files not supported") + elif magic_bytes.startswith(b"\x89HDF"): + filetype = FileType.hdf5 + elif magic_bytes.startswith(b"GRIB"): + filetype = FileType.grib + elif magic_bytes.startswith(b"II*"): + filetype = FileType.tiff + elif magic_bytes.startswith(b"SIMPLE"): + filetype = FileType.fits + else: + raise NotImplementedError( + f"Unrecognised file based on header bytes: {magic_bytes}" + ) + + return filetype + + +def open_virtual_dataset( + filepath: str, + *, + filetype: FileType | None = None, + group: str | None = None, + drop_variables: Iterable[str] | None = None, + loadable_variables: Iterable[str] | None = None, + decode_times: bool | None = None, + cftime_variables: Iterable[str] | None = None, + indexes: Mapping[str, Index] | None = None, + virtual_array_class=ManifestArray, + reader_options: Optional[dict] = None, +) -> Dataset: + """ + Open a file or store as an xarray Dataset wrapping virtualized zarr arrays. + + No data variables will be loaded unless specified in the ``loadable_variables`` kwarg (in which case they will be xarray lazily indexed arrays). + + Xarray indexes can optionally be created (the default behaviour). To avoid creating any xarray indexes pass ``indexes={}``. + + Parameters + ---------- + filepath : str, default None + File path to open as a set of virtualized zarr arrays. + filetype : FileType, default None + Type of file to be opened. Used to determine which kerchunk file format backend to use. + Can be one of {'netCDF3', 'netCDF4', 'HDF', 'TIFF', 'GRIB', 'FITS', 'zarr_v3', 'kerchunk'}. + If not provided will attempt to automatically infer the correct filetype from header bytes. + group : str, default is None + Path to the HDF5/netCDF4 group in the given file to open. Given as a str, supported by filetypes “netcdf4” and “hdf5”. + drop_variables: list[str], default is None + Variables in the file to drop before returning. + loadable_variables: list[str], default is None + Variables in the file to open as lazy numpy/dask arrays instead of instances of virtual_array_class. + Default is to open all variables as virtual arrays (i.e. ManifestArray). + decode_times: bool | None, default is None + Bool that is passed into Xarray's open_dataset. Allows time to be decoded into a datetime object. + indexes : Mapping[str, Index], default is None + Indexes to use on the returned xarray Dataset. + Default is None, which will read any 1D coordinate data to create in-memory Pandas indexes. + To avoid creating any indexes, pass indexes={}. + virtual_array_class + Virtual array class to use to represent the references to the chunks in each on-disk array. + Currently can only be ManifestArray, but once VirtualZarrArray is implemented the default should be changed to that. + reader_options: dict, default {} + Dict passed into Kerchunk file readers, to allow reading from remote filesystems. + Note: Each Kerchunk file reader has distinct arguments, so ensure reader_options match selected Kerchunk reader arguments. 
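A short sketch of the call pattern documented above, assuming a hypothetical netCDF4 file `data.nc` that contains a `time` coordinate and an HDF group `test/group`.

```python
from virtualizarr import open_virtual_dataset

vds = open_virtual_dataset(
    "data.nc",                    # hypothetical local netCDF4 file
    group="test/group",           # only supported for "netcdf4" / "hdf5" filetypes
    loadable_variables=["time"],  # open these as lazily indexed numpy-backed arrays
    decode_times=True,            # passed through to xarray's open_dataset
    indexes={},                   # avoid building in-memory pandas indexes
)
```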
+ + Returns + ------- + vds + An xarray Dataset containing instances of virtual_array_cls for each variable, or normal lazily indexed arrays for each variable in loadable_variables. + """ + + if cftime_variables is not None: + # It seems like stacklevel=2 is req to surface this warning. + warnings.warn( + "cftime_variables is deprecated and will be ignored. Pass decode_times=True and loadable_variables=['time'] to decode time values to datetime objects.", + DeprecationWarning, + stacklevel=2, + ) + + drop_variables, loadable_variables = check_for_collisions( + drop_variables, + loadable_variables, + ) + + if virtual_array_class is not ManifestArray: + raise NotImplementedError() + + if reader_options is None: + reader_options = {} + + if filetype is not None: + # if filetype is user defined, convert to FileType + filetype = FileType(filetype) + else: + filetype = automatically_determine_filetype( + filepath=filepath, reader_options=reader_options + ) + + backend_cls = VIRTUAL_BACKENDS.get(filetype.name.lower()) + + if backend_cls is None: + raise NotImplementedError(f"Unsupported file type: {filetype.name}") + + vds = backend_cls.open_virtual_dataset( + filepath, + group=group, + drop_variables=drop_variables, + loadable_variables=loadable_variables, + decode_times=decode_times, + indexes=indexes, + reader_options=reader_options, + ) + + return vds diff --git a/virtualizarr/kerchunk.py b/virtualizarr/kerchunk.py deleted file mode 100644 index 6e82067d..00000000 --- a/virtualizarr/kerchunk.py +++ /dev/null @@ -1,314 +0,0 @@ -import base64 -import json -import warnings -from enum import Enum, auto -from pathlib import Path -from typing import Any, NewType, Optional, cast - -import numpy as np -import ujson # type: ignore -import xarray as xr -from xarray.coding.times import CFDatetimeCoder - -from virtualizarr.manifests.manifest import join -from virtualizarr.utils import _fsspec_openfile_from_filepath -from virtualizarr.zarr import ZArray, ZAttrs - -# Distinguishing these via type hints makes it a lot easier to mentally keep track of what the opaque kerchunk "reference dicts" actually mean -# (idea from https://kobzol.github.io/rust/python/2023/05/20/writing-python-like-its-rust.html) -# TODO I would prefer to be more specific about these types -KerchunkStoreRefs = NewType( - "KerchunkStoreRefs", dict -) # top-level dict with keys for 'version', 'refs' -KerchunkArrRefs = NewType( - "KerchunkArrRefs", - dict, -) # lower-level dict containing just the information for one zarr array - - -class AutoName(Enum): - # Recommended by official Python docs for auto naming: - # https://docs.python.org/3/library/enum.html#using-automatic-values - def _generate_next_value_(name, start, count, last_values): - return name - - -class FileType(AutoName): - netcdf3 = auto() - netcdf4 = auto() # NOTE: netCDF4 is a subset of hdf5 - hdf4 = auto() - hdf5 = auto() - grib = auto() - tiff = auto() - fits = auto() - zarr = auto() - - -class NumpyEncoder(json.JSONEncoder): - # TODO I don't understand how kerchunk gets around this problem of encoding numpy types (in the zattrs) whilst only using ujson - def default(self, obj): - if isinstance(obj, np.ndarray): - return obj.tolist() # Convert NumPy array to Python list - elif isinstance(obj, np.generic): - return obj.item() # Convert NumPy scalar to Python scalar - elif isinstance(obj, np.dtype): - return str(obj) - return json.JSONEncoder.default(self, obj) - - -def read_kerchunk_references_from_file( - filepath: str, - filetype: FileType | None, - reader_options: 
Optional[dict[str, Any]] = None, -) -> KerchunkStoreRefs: - """ - Read a single legacy file and return kerchunk references to its contents. - - Parameters - ---------- - filepath : str, default: None - File path to open as a set of virtualized zarr arrays. - filetype : FileType, default: None - Type of file to be opened. Used to determine which kerchunk file format backend to use. - If not provided will attempt to automatically infer the correct filetype from the the filepath's extension. - reader_options: dict, default {'storage_options':{'key':'', 'secret':'', 'anon':True}} - Dict passed into Kerchunk file readers. Note: Each Kerchunk file reader has distinct arguments, - so ensure reader_options match selected Kerchunk reader arguments. - """ - - if filetype is None: - filetype = _automatically_determine_filetype( - filepath=filepath, reader_options=reader_options - ) - - if reader_options is None: - reader_options = {} - - # if filetype is user defined, convert to FileType - filetype = FileType(filetype) - - if filetype.name.lower() == "netcdf3": - from kerchunk.netCDF3 import NetCDF3ToZarr - - refs = NetCDF3ToZarr(filepath, inline_threshold=0, **reader_options).translate() - - elif filetype.name.lower() == "hdf5" or filetype.name.lower() == "netcdf4": - from kerchunk.hdf import SingleHdf5ToZarr - - refs = SingleHdf5ToZarr( - filepath, inline_threshold=0, **reader_options - ).translate() - elif filetype.name.lower() == "grib": - # TODO Grib files should be handled as a DataTree object - # see https://github.com/TomNicholas/VirtualiZarr/issues/11 - raise NotImplementedError(f"Unsupported file type: {filetype}") - elif filetype.name.lower() == "tiff": - from kerchunk.tiff import tiff_to_zarr - - reader_options.pop("storage_options", {}) - warnings.warn( - "storage_options have been dropped from reader_options as they are not supported by kerchunk.tiff.tiff_to_zarr", - UserWarning, - ) - - # handle inconsistency in kerchunk, see GH issue https://github.com/zarr-developers/VirtualiZarr/issues/160 - refs = {"refs": tiff_to_zarr(filepath, **reader_options)} - elif filetype.name.lower() == "fits": - from kerchunk.fits import process_file - - # handle inconsistency in kerchunk, see GH issue https://github.com/zarr-developers/VirtualiZarr/issues/160 - refs = {"refs": process_file(filepath, **reader_options)} - else: - raise NotImplementedError(f"Unsupported file type: {filetype.name}") - - # TODO validate the references that were read before returning? - return refs - - -def _automatically_determine_filetype( - *, - filepath: str, - reader_options: Optional[dict[str, Any]] = None, -) -> FileType: - if Path(filepath).suffix == ".zarr": - # TODO we could imagine opening an existing zarr store, concatenating it, and writing a new virtual one... 
- raise NotImplementedError() - - # Read magic bytes from local or remote file - fpath = _fsspec_openfile_from_filepath( - filepath=filepath, reader_options=reader_options - ) - magic_bytes = fpath.read(8) - fpath.close() - - if magic_bytes.startswith(b"CDF"): - filetype = FileType.netcdf3 - elif magic_bytes.startswith(b"\x0e\x03\x13\x01"): - raise NotImplementedError("HDF4 formatted files not supported") - elif magic_bytes.startswith(b"\x89HDF"): - filetype = FileType.hdf5 - elif magic_bytes.startswith(b"GRIB"): - filetype = FileType.grib - elif magic_bytes.startswith(b"II*"): - filetype = FileType.tiff - elif magic_bytes.startswith(b"SIMPLE"): - filetype = FileType.fits - else: - raise NotImplementedError( - f"Unrecognised file based on header bytes: {magic_bytes}" - ) - - return filetype - - -def find_var_names(ds_reference_dict: KerchunkStoreRefs) -> list[str]: - """Find the names of zarr variables in this store/group.""" - - refs = ds_reference_dict["refs"] - found_var_names = {key.split("/")[0] for key in refs.keys() if "/" in key} - return list(found_var_names) - - -def extract_array_refs( - ds_reference_dict: KerchunkStoreRefs, var_name: str -) -> KerchunkArrRefs: - """Extract only the part of the kerchunk reference dict that is relevant to this one zarr array""" - - found_var_names = find_var_names(ds_reference_dict) - - refs = ds_reference_dict["refs"] - if var_name in found_var_names: - # TODO these function probably have more loops in them than they need to... - - arr_refs = { - key.split("/")[1]: refs[key] - for key in refs.keys() - if var_name == key.split("/")[0] - } - - return fully_decode_arr_refs(arr_refs) - else: - raise KeyError( - f"Could not find zarr array variable name {var_name}, only {found_var_names}" - ) - - -def parse_array_refs( - arr_refs: KerchunkArrRefs, -) -> tuple[dict, ZArray, ZAttrs]: - zarray = ZArray.from_kerchunk_refs(arr_refs.pop(".zarray")) - zattrs = arr_refs.pop(".zattrs", {}) - chunk_dict = arr_refs - - return chunk_dict, zarray, zattrs - - -def fully_decode_arr_refs(d: dict) -> KerchunkArrRefs: - """ - Only have to do this because kerchunk.SingleHdf5ToZarr apparently doesn't bother converting .zarray and .zattrs contents to dicts, see https://github.com/fsspec/kerchunk/issues/415 . - """ - sanitized = d.copy() - for k, v in d.items(): - if k.startswith("."): - # ensure contents of .zattrs and .zarray are python dictionaries - sanitized[k] = ujson.loads(v) - - return cast(KerchunkArrRefs, sanitized) - - -def dataset_to_kerchunk_refs(ds: xr.Dataset) -> KerchunkStoreRefs: - """ - Create a dictionary containing kerchunk-style store references from a single xarray.Dataset (which wraps ManifestArray objects). 
- """ - - all_arr_refs = {} - for var_name, var in ds.variables.items(): - arr_refs = variable_to_kerchunk_arr_refs(var, var_name) - - prepended_with_var_name = { - f"{var_name}/{key}": val for key, val in arr_refs.items() - } - - all_arr_refs.update(prepended_with_var_name) - - zattrs = ds.attrs - if ds.coords: - coord_names = list(ds.coords) - # this weird concatenated string instead of a list of strings is inconsistent with how other features in the kerchunk references format are stored - # see https://github.com/zarr-developers/VirtualiZarr/issues/105#issuecomment-2187266739 - zattrs["coordinates"] = " ".join(coord_names) - - ds_refs = { - "version": 1, - "refs": { - ".zgroup": '{"zarr_format":2}', - ".zattrs": ujson.dumps(zattrs), - **all_arr_refs, - }, - } - - return cast(KerchunkStoreRefs, ds_refs) - - -def variable_to_kerchunk_arr_refs(var: xr.Variable, var_name: str) -> KerchunkArrRefs: - """ - Create a dictionary containing kerchunk-style array references from a single xarray.Variable (which wraps either a ManifestArray or a numpy array). - - Partially encodes the inner dicts to json to match kerchunk behaviour (see https://github.com/fsspec/kerchunk/issues/415). - """ - from virtualizarr.manifests import ManifestArray - - if isinstance(var.data, ManifestArray): - marr = var.data - - arr_refs: dict[str, str | list[str | int]] = { - str(chunk_key): [entry["path"], entry["offset"], entry["length"]] - for chunk_key, entry in marr.manifest.dict().items() - } - - zarray = marr.zarray - - else: - try: - np_arr = var.to_numpy() - except AttributeError as e: - raise TypeError( - f"Can only serialize wrapped arrays of type ManifestArray or numpy.ndarray, but got type {type(var.data)}" - ) from e - - if var.encoding: - if "scale_factor" in var.encoding: - raise NotImplementedError( - f"Cannot serialize loaded variable {var_name}, as it is encoded with a scale_factor" - ) - if "offset" in var.encoding: - raise NotImplementedError( - f"Cannot serialize loaded variable {var_name}, as it is encoded with an offset" - ) - if "calendar" in var.encoding: - np_arr = CFDatetimeCoder().encode(var.copy(), name=var_name).values - - # This encoding is what kerchunk does when it "inlines" data, see https://github.com/fsspec/kerchunk/blob/a0c4f3b828d37f6d07995925b324595af68c4a19/kerchunk/hdf.py#L472 - byte_data = np_arr.tobytes() - # TODO do I really need to encode then decode like this? - inlined_data = (b"base64:" + base64.b64encode(byte_data)).decode("utf-8") - - # TODO can this be generalized to save individual chunks of a dask array? - # TODO will this fail for a scalar? 
- arr_refs = {join(0 for _ in np_arr.shape): inlined_data} - - zarray = ZArray( - chunks=np_arr.shape, - shape=np_arr.shape, - dtype=np_arr.dtype, - order="C", - ) - - zarray_dict = zarray.to_kerchunk_json() - arr_refs[".zarray"] = zarray_dict - - zattrs = {**var.attrs, **var.encoding} - zattrs["_ARRAY_DIMENSIONS"] = list(var.dims) - arr_refs[".zattrs"] = json.dumps(zattrs, separators=(",", ":"), cls=NumpyEncoder) - - return cast(KerchunkArrRefs, arr_refs) diff --git a/virtualizarr/manifests/array.py b/virtualizarr/manifests/array.py index a0983dec..179bcf1c 100644 --- a/virtualizarr/manifests/array.py +++ b/virtualizarr/manifests/array.py @@ -3,10 +3,13 @@ import numpy as np -from ..kerchunk import KerchunkArrRefs -from ..zarr import ZArray -from .array_api import MANIFESTARRAY_HANDLED_ARRAY_FUNCTIONS -from .manifest import ChunkManifest +from virtualizarr.manifests.array_api import ( + MANIFESTARRAY_HANDLED_ARRAY_FUNCTIONS, + _isnan, +) +from virtualizarr.manifests.manifest import ChunkManifest +from virtualizarr.types.kerchunk import KerchunkArrRefs +from virtualizarr.zarr import ZArray class ManifestArray: @@ -61,7 +64,10 @@ def __init__( @classmethod def _from_kerchunk_refs(cls, arr_refs: KerchunkArrRefs) -> "ManifestArray": - from virtualizarr.kerchunk import fully_decode_arr_refs, parse_array_refs + from virtualizarr.translators.kerchunk import ( + fully_decode_arr_refs, + parse_array_refs, + ) decoded_arr_refs = fully_decode_arr_refs(arr_refs) @@ -127,9 +133,11 @@ def __array_function__(self, func, types, args, kwargs) -> Any: def __array_ufunc__(self, ufunc, method, *inputs, **kwargs) -> Any: """We have to define this in order to convince xarray that this class is a duckarray, even though we will never support ufuncs.""" + if ufunc == np.isnan: + return _isnan(self.shape) return NotImplemented - def __array__(self) -> np.ndarray: + def __array__(self, dtype: np.typing.DTypeLike = None) -> np.ndarray: raise NotImplementedError( "ManifestArrays can't be converted into numpy arrays or pandas Index objects" ) diff --git a/virtualizarr/manifests/array_api.py b/virtualizarr/manifests/array_api.py index 0ecdc023..f5cf220b 100644 --- a/virtualizarr/manifests/array_api.py +++ b/virtualizarr/manifests/array_api.py @@ -1,8 +1,8 @@ -from typing import TYPE_CHECKING, Callable, Iterable +from typing import TYPE_CHECKING, Any, Callable, Iterable, cast import numpy as np -from virtualizarr.zarr import Codec, ceildiv +from virtualizarr.zarr import Codec, determine_chunk_grid_shape from .manifest import ChunkManifest @@ -217,9 +217,12 @@ def stack( new_shape.insert(axis, length_along_new_stacked_axis) # do stacking of entries in manifest - stacked_paths = np.stack( - [arr.manifest._paths for arr in arrays], - axis=axis, + stacked_paths = cast( # `np.stack` apparently is type hinted as if the output could have Any dtype + np.ndarray[Any, np.dtypes.StringDType], + np.stack( + [arr.manifest._paths for arr in arrays], + axis=axis, + ), ) stacked_offsets = np.stack( [arr.manifest._offsets for arr in arrays], @@ -290,16 +293,17 @@ def broadcast_to(x: "ManifestArray", /, shape: tuple[int, ...]) -> "ManifestArra ) # find new chunk grid shape by dividing new array shape by new chunk shape - new_chunk_grid_shape = tuple( - ceildiv(axis_length, chunk_length) - for axis_length, chunk_length in zip(new_shape, new_chunk_shape) - ) + new_chunk_grid_shape = determine_chunk_grid_shape(new_shape, new_chunk_shape) # do broadcasting of entries in manifest - broadcasted_paths = np.broadcast_to( - x.manifest._paths, - 
shape=new_chunk_grid_shape, + broadcasted_paths = cast( # `np.broadcast_to` apparently is type hinted as if the output could have Any dtype + np.ndarray[Any, np.dtypes.StringDType], + np.broadcast_to( + x.manifest._paths, + shape=new_chunk_grid_shape, + ), ) + broadcasted_offsets = np.broadcast_to( x.manifest._offsets, shape=new_chunk_grid_shape, @@ -356,8 +360,8 @@ def isnan(x: "ManifestArray", /) -> np.ndarray: Only implemented to get past some checks deep inside xarray, see https://github.com/TomNicholas/VirtualiZarr/issues/29. """ - return np.full( - shape=x.shape, - fill_value=False, - dtype=np.dtype(bool), - ) + return _isnan(x.shape) + + +def _isnan(shape: tuple): + return np.full(shape=shape, fill_value=False, dtype=np.dtype(bool)) diff --git a/virtualizarr/manifests/manifest.py b/virtualizarr/manifests/manifest.py index cc196e6d..1933844a 100644 --- a/virtualizarr/manifests/manifest.py +++ b/virtualizarr/manifests/manifest.py @@ -1,10 +1,10 @@ +import dataclasses import json import re from collections.abc import Iterable, Iterator -from typing import Any, Callable, NewType, Tuple, Union, cast +from typing import Any, Callable, Dict, NewType, Tuple, TypedDict, cast import numpy as np -from pydantic import BaseModel, ConfigDict from virtualizarr.types import ChunkKey @@ -15,36 +15,51 @@ _CHUNK_KEY = rf"^{_INTEGER}+({_SEPARATOR}{_INTEGER})*$" # matches 1 integer, optionally followed by more integers each separated by a separator (i.e. a period) -ChunkDict = NewType("ChunkDict", dict[ChunkKey, dict[str, Union[str, int]]]) +class ChunkDictEntry(TypedDict): + path: str + offset: int + length: int + + +ChunkDict = NewType("ChunkDict", dict[ChunkKey, ChunkDictEntry]) -class ChunkEntry(BaseModel): +@dataclasses.dataclass(frozen=True) +class ChunkEntry: """ Information for a single chunk in the manifest. Stored in the form `{"path": "s3://bucket/foo.nc", "offset": 100, "length": 100}`. """ - model_config = ConfigDict(frozen=True) - path: str # TODO stricter typing/validation of possible local / remote paths? offset: int length: int - def __repr__(self) -> str: - return f"ChunkEntry(path='{self.path}', offset={self.offset}, length={self.length})" - @classmethod - def from_kerchunk(cls, path_and_byte_range_info: list[str | int]) -> "ChunkEntry": - path, offset, length = path_and_byte_range_info + def from_kerchunk( + cls, path_and_byte_range_info: tuple[str] | tuple[str, int, int] + ) -> "ChunkEntry": + from upath import UPath + + if len(path_and_byte_range_info) == 1: + path = path_and_byte_range_info[0] + offset = 0 + length = UPath(path).stat().st_size + else: + path, offset, length = path_and_byte_range_info return ChunkEntry(path=path, offset=offset, length=length) - def to_kerchunk(self) -> list[str | int]: + def to_kerchunk(self) -> tuple[str, int, int]: """Write out in the format that kerchunk uses for chunk entries.""" - return [self.path, self.offset, self.length] + return (self.path, self.offset, self.length) - def dict(self) -> dict[str, Union[str, int]]: - return dict(path=self.path, offset=self.offset, length=self.length) + def dict(self) -> ChunkDictEntry: + return ChunkDictEntry( + path=self.path, + offset=self.offset, + length=self.length, + ) class ChunkManifest: @@ -70,11 +85,11 @@ class ChunkManifest: so it's not possible to have a ChunkManifest object that does not represent a valid grid of chunks. 
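A minimal sketch of building the manifest described in this docstring by hand, using the entries format shown below in `__init__` (the paths, offsets and lengths are made up), plus the new optional `shape` argument for a manifest with no chunk entries.

```python
from virtualizarr.manifests import ChunkManifest

manifest = ChunkManifest(
    entries={
        "0.0": {"path": "s3://bucket/foo.nc", "offset": 100, "length": 100},
        "0.1": {"path": "s3://bucket/foo.nc", "offset": 200, "length": 100},
    }
)

# With the new `shape` keyword an all-missing manifest can be created even
# when no chunk entries are given.
empty = ChunkManifest(entries={}, shape=(2, 2))
```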
""" - _paths: np.ndarray[Any, np.dtypes.StringDType] # type: ignore[name-defined] + _paths: np.ndarray[Any, np.dtypes.StringDType] _offsets: np.ndarray[Any, np.dtype[np.uint64]] _lengths: np.ndarray[Any, np.dtype[np.uint64]] - def __init__(self, entries: dict) -> None: + def __init__(self, entries: dict, shape: tuple[int, ...] | None = None) -> None: """ Create a ChunkManifest from a dictionary mapping zarr chunk keys to byte ranges. @@ -90,16 +105,20 @@ def __init__(self, entries: dict) -> None: "0.1.1": {"path": "s3://bucket/foo.nc", "offset": 400, "length": 100}, } """ + if shape is None and not entries: + raise ValueError("need a chunk grid shape if no chunks given") # TODO do some input validation here first? validate_chunk_keys(entries.keys()) - # TODO should we actually optionally pass chunk grid shape in, - # in case there are not enough chunks to give correct idea of full shape? - shape = get_chunk_grid_shape(entries.keys()) + if shape is None: + shape = get_chunk_grid_shape(entries.keys()) # Initializing to empty implies that entries with path='' are treated as missing chunks - paths = np.empty(shape=shape, dtype=np.dtypes.StringDType()) # type: ignore[attr-defined] + paths = cast( # `np.empty` apparently is type hinted as if the output could have Any dtype + np.ndarray[Any, np.dtypes.StringDType], + np.empty(shape=shape, dtype=np.dtypes.StringDType()), + ) offsets = np.empty(shape=shape, dtype=np.dtype("uint64")) lengths = np.empty(shape=shape, dtype=np.dtype("uint64")) @@ -127,7 +146,7 @@ def __init__(self, entries: dict) -> None: @classmethod def from_arrays( cls, - paths: np.ndarray[Any, np.dtype[np.dtypes.StringDType]], # type: ignore[name-defined] + paths: np.ndarray[Any, np.dtypes.StringDType], offsets: np.ndarray[Any, np.dtype[np.uint64]], lengths: np.ndarray[Any, np.dtype[np.uint64]], ) -> "ChunkManifest": @@ -224,7 +243,7 @@ def __iter__(self) -> Iterator[ChunkKey]: def __len__(self) -> int: return self._paths.size - def dict(self) -> ChunkDict: + def dict(self) -> ChunkDict: # type: ignore[override] """ Convert the entire manifest to a nested dictionary. @@ -252,7 +271,7 @@ def dict(self) -> ChunkDict: [*coord_vectors, self._paths, self._offsets, self._lengths], flags=("refs_ok",), ) - if path.item()[0] != "" # don't include entry if path='' (i.e. empty chunk) + if path.item() != "" # don't include entry if path='' (i.e. empty chunk) } return cast( @@ -283,12 +302,22 @@ def to_zarr_json(self, filepath: str) -> None: json.dump(entries, json_file, indent=4, separators=(", ", ": ")) @classmethod - def _from_kerchunk_chunk_dict(cls, kerchunk_chunk_dict) -> "ChunkManifest": - chunkentries = { - cast(ChunkKey, k): ChunkEntry.from_kerchunk(v).dict() - for k, v in kerchunk_chunk_dict.items() - } - return ChunkManifest(entries=cast(ChunkDict, chunkentries)) + def _from_kerchunk_chunk_dict( + cls, + # The type hint requires `Dict` instead of `dict` due to + # the conflicting ChunkManifest.dict method. + kerchunk_chunk_dict: Dict[ChunkKey, str | tuple[str] | tuple[str, int, int]], + ) -> "ChunkManifest": + chunk_entries: dict[ChunkKey, ChunkDictEntry] = {} + for k, v in kerchunk_chunk_dict.items(): + if isinstance(v, (str, bytes)): + raise NotImplementedError( + "Reading inlined reference data is currently not supported. 
[ToDo]" + ) + elif not isinstance(v, (tuple, list)): + raise TypeError(f"Unexpected type {type(v)} for chunk value: {v}") + chunk_entries[k] = ChunkEntry.from_kerchunk(v).dict() + return ChunkManifest(entries=chunk_entries) def rename_paths( self, @@ -358,6 +387,9 @@ def get_ndim_from_key(key: str) -> int: def validate_chunk_keys(chunk_keys: Iterable[ChunkKey]): + if not chunk_keys: + return + # Check if all keys have the correct form for key in chunk_keys: if not re.match(_CHUNK_KEY, key): diff --git a/virtualizarr/readers/__init__.py b/virtualizarr/readers/__init__.py new file mode 100644 index 00000000..0f83ba39 --- /dev/null +++ b/virtualizarr/readers/__init__.py @@ -0,0 +1,17 @@ +from virtualizarr.readers.dmrpp import DMRPPVirtualBackend +from virtualizarr.readers.fits import FITSVirtualBackend +from virtualizarr.readers.hdf5 import HDF5VirtualBackend +from virtualizarr.readers.kerchunk import KerchunkVirtualBackend +from virtualizarr.readers.netcdf3 import NetCDF3VirtualBackend +from virtualizarr.readers.tiff import TIFFVirtualBackend +from virtualizarr.readers.zarr_v3 import ZarrV3VirtualBackend + +__all__ = [ + "DMRPPVirtualBackend", + "FITSVirtualBackend", + "HDF5VirtualBackend", + "KerchunkVirtualBackend", + "NetCDF3VirtualBackend", + "TIFFVirtualBackend", + "ZarrV3VirtualBackend", +] diff --git a/virtualizarr/readers/common.py b/virtualizarr/readers/common.py new file mode 100644 index 00000000..54aedfe2 --- /dev/null +++ b/virtualizarr/readers/common.py @@ -0,0 +1,195 @@ +import os +import warnings +from abc import ABC +from collections.abc import Iterable, Mapping, MutableMapping +from io import BufferedIOBase +from typing import ( + TYPE_CHECKING, + Any, + Hashable, + Optional, + cast, +) + +import xarray as xr +from xarray import Dataset +from xarray.backends import AbstractDataStore, BackendArray +from xarray.core.indexes import Index, PandasIndex +from xarray.core.variable import IndexVariable, Variable + +from virtualizarr.manifests import ManifestArray +from virtualizarr.utils import _FsspecFSFromFilepath + +XArrayOpenT = str | os.PathLike[Any] | BufferedIOBase | AbstractDataStore + +if TYPE_CHECKING: + try: + from xarray import DataTree # type: ignore[attr-defined] + except ImportError: + DataTree = Any + + +class ManifestBackendArray(ManifestArray, BackendArray): + """Using this prevents xarray from wrapping the KerchunkArray in ExplicitIndexingAdapter etc.""" + + ... + + +def open_loadable_vars_and_indexes( + filepath: str, + loadable_variables, + reader_options, + drop_variables, + indexes, + group, + decode_times, +) -> tuple[Mapping[str, Variable], Mapping[str, Index]]: + """ + Open selected variables and indexes using xarray. + + Relies on xr.open_dataset and its auto-detection of filetypes to find the correct installed backend. + """ + + # TODO get rid of this if? + if indexes is None or len(loadable_variables) > 0: + # TODO we are reading a bunch of stuff we know we won't need here, e.g. all of the data variables... + # TODO it would also be nice if we could somehow consolidate this with the reading of the kerchunk references + # TODO really we probably want a dedicated xarray backend that iterates over all variables only once + fpath = _FsspecFSFromFilepath( + filepath=filepath, reader_options=reader_options + ).open_file() + + # fpath can be `Any` thanks to fsspec.filesystem(...).open() returning Any. + # We'll (hopefully safely) cast it to what xarray is expecting, but this might let errors through. 
+ + ds = xr.open_dataset( + cast(XArrayOpenT, fpath), + drop_variables=drop_variables, + group=group, + decode_times=decode_times, + ) + + if indexes is None: + warnings.warn( + "Specifying `indexes=None` will create in-memory pandas indexes for each 1D coordinate, but concatenation of ManifestArrays backed by pandas indexes is not yet supported (see issue #18)." + "You almost certainly want to pass `indexes={}` to `open_virtual_dataset` instead." + ) + + # add default indexes by reading data from file + indexes = {name: index for name, index in ds.xindexes.items()} + elif indexes != {}: + # TODO allow manual specification of index objects + raise NotImplementedError() + else: + indexes = dict(**indexes) # for type hinting: to allow mutation + + # TODO we should drop these earlier by using drop_variables + loadable_vars = { + str(name): var + for name, var in ds.variables.items() + if name in loadable_variables + } + + # if we only read the indexes we can just close the file right away as nothing is lazy + if loadable_vars == {}: + ds.close() + else: + loadable_vars = {} + indexes = {} + + return loadable_vars, indexes + + +def construct_virtual_dataset( + virtual_vars, + loadable_vars, + indexes, + coord_names, + attrs, +) -> Dataset: + """Construct a virtual Datset from consistuent parts.""" + + vars = {**virtual_vars, **loadable_vars} + + data_vars, coords = separate_coords(vars, indexes, coord_names) + + vds = xr.Dataset( + data_vars, + coords=coords, + # indexes={}, # TODO should be added in a later version of xarray + attrs=attrs, + ) + + # TODO we should probably also use vds.set_close() to tell xarray how to close the file we opened + + return vds + + +def separate_coords( + vars: Mapping[str, xr.Variable], + indexes: MutableMapping[str, Index], + coord_names: Iterable[str] | None = None, +) -> tuple[dict[str, xr.Variable], xr.Coordinates]: + """ + Try to generate a set of coordinates that won't cause xarray to automatically build a pandas.Index for the 1D coordinates. + + Currently requires this function as a workaround unless xarray PR #8124 is merged. + + Will also preserve any loaded variables and indexes it is passed. + """ + + if coord_names is None: + coord_names = [] + + # split data and coordinate variables (promote dimension coordinates) + data_vars = {} + coord_vars: dict[ + str, tuple[Hashable, Any, dict[Any, Any], dict[Any, Any]] | xr.Variable + ] = {} + for name, var in vars.items(): + if name in coord_names or var.dims == (name,): + # use workaround to avoid creating IndexVariables described here https://github.com/pydata/xarray/pull/8107#discussion_r1311214263 + if len(var.dims) == 1: + dim1d, *_ = var.dims + coord_vars[name] = (dim1d, var.data, var.attrs, var.encoding) + + if isinstance(var, IndexVariable): + # unless variable actually already is a loaded IndexVariable, + # in which case we need to keep it and add the corresponding indexes explicitly + coord_vars[str(name)] = var + # TODO this seems suspect - will it handle datetimes? 
+ indexes[name] = PandasIndex(var, dim1d) + else: + coord_vars[name] = var + else: + data_vars[name] = var + + coords = xr.Coordinates(coord_vars, indexes=indexes) + + return data_vars, coords + + +class VirtualBackend(ABC): + @staticmethod + def open_virtual_dataset( + filepath: str, + group: str | None = None, + drop_variables: Iterable[str] | None = None, + loadable_variables: Iterable[str] | None = None, + decode_times: bool | None = None, + indexes: Mapping[str, Index] | None = None, + reader_options: Optional[dict] = None, + ) -> Dataset: + raise NotImplementedError() + + @staticmethod + def open_virtual_datatree( + path: str, + group: str | None = None, + drop_variables: Iterable[str] | None = None, + loadable_variables: Iterable[str] | None = None, + decode_times: bool | None = None, + indexes: Mapping[str, Index] | None = None, + reader_options: Optional[dict] = None, + ) -> "DataTree": + raise NotImplementedError() diff --git a/virtualizarr/readers/dmrpp.py b/virtualizarr/readers/dmrpp.py new file mode 100644 index 00000000..766b1c62 --- /dev/null +++ b/virtualizarr/readers/dmrpp.py @@ -0,0 +1,718 @@ +import os +import warnings +from collections import defaultdict +from collections.abc import Mapping +from typing import Any, Iterable, Optional +from xml.etree import ElementTree as ET + +import numpy as np +from xarray import Coordinates, Dataset +from xarray.core.indexes import Index +from xarray.core.variable import Variable + +from virtualizarr.manifests import ChunkManifest, ManifestArray +from virtualizarr.readers.common import VirtualBackend +from virtualizarr.types import ChunkKey +from virtualizarr.utils import _FsspecFSFromFilepath, check_for_collisions +from virtualizarr.zarr import ZArray + + +class DMRPPVirtualBackend(VirtualBackend): + @staticmethod + def open_virtual_dataset( + filepath: str, + group: str | None = None, + drop_variables: Iterable[str] | None = None, + loadable_variables: Iterable[str] | None = None, + decode_times: bool | None = None, + indexes: Mapping[str, Index] | None = None, + reader_options: Optional[dict] = None, + ) -> Dataset: + loadable_variables, drop_variables = check_for_collisions( + drop_variables=drop_variables, + loadable_variables=loadable_variables, + ) + + if loadable_variables != [] or decode_times or indexes is None: + raise NotImplementedError( + "Specifying `loadable_variables` or auto-creating indexes with `indexes=None` is not supported for dmrpp files." + ) + + if group: + raise NotImplementedError() + + fpath = _FsspecFSFromFilepath( + filepath=filepath, reader_options=reader_options + ).open_file() + + parser = DMRParser(fpath.read(), data_filepath=filepath.strip(".dmrpp")) + vds = parser.parse_dataset() + + return vds.drop_vars(drop_variables) + + +class DMRParser: + """ + Parser for the OPeNDAP DMR++ XML format. + Reads groups, dimensions, coordinates, data variables, encoding, chunk manifests, and attributes. + Highly modular to allow support for older dmrpp schema versions. Includes many utility functions to extract + different information such as finding all variable tags, splitting hdf5 groups, parsing dimensions, and more. 
+ + OPeNDAP DMR++ homepage: https://docs.opendap.org/index.php/DMR%2B%2B + """ + + # DAP and DMRPP XML namespaces + _ns = { + "dap": "http://xml.opendap.org/ns/DAP/4.0#", + "dmr": "http://xml.opendap.org/dap/dmrpp/1.0.0#", + } + # DAP data types to numpy data types + _dap_np_dtype = { + "Byte": "uint8", + "UByte": "uint8", + "Int8": "int8", + "UInt8": "uint8", + "Int16": "int16", + "UInt16": "uint16", + "Int32": "int32", + "UInt32": "uint32", + "Int64": "int64", + "UInt64": "uint64", + "Url": "object", + "Float32": "float32", + "Float64": "float64", + "String": "object", + } + # Default zlib compression value + _default_zlib_value = 6 + # Encoding keys that should be removed from attributes and placed in xarray encoding dict + _encoding_keys = {"_FillValue", "missing_value", "scale_factor", "add_offset"} + + def __init__(self, dmr: str, data_filepath: Optional[str] = None): + """ + Initialize the DMRParser with the given DMR data and data file path. + + Parameters + ---------- + dmr : str + The DMR file contents as a string. + + data_filepath : str, optional + The path to the actual data file that will be set in the chunk manifests. + If None, the data file path is taken from the DMR file. + """ + self.root = ET.fromstring(dmr) + self.data_filepath = ( + data_filepath if data_filepath is not None else self.root.attrib["name"] + ) + + def parse_dataset(self, group=None, indexes: Mapping[str, Index] = {}) -> Dataset: + """ + Parses the given file and creates a virtual xr.Dataset with ManifestArrays. + + Parameters + ---------- + group : str + The group to parse. If None, and no groups are present, the dataset is parsed. + If None and groups are present, the first group is parsed. + + indexes : Mapping[str, Index], default is {} + Indexes to use on the returned xarray Dataset. + Default is {} which will avoid creating any indexes + + Returns + ------- + An xr.Dataset wrapping virtualized zarr arrays. + + Examples + -------- + Open a sample DMR++ file and parse the dataset + + >>> import requests + >>> r = requests.get("https://github.com/OPENDAP/bes/raw/3e518f6dc2f625b0b83cfb6e6fd5275e4d6dcef1/modules/dmrpp_module/data/dmrpp/chunked_threeD.h5.dmrpp") + >>> parser = DMRParser(r.text) + >>> vds = parser.parse_dataset() + >>> vds + Size: 4MB + Dimensions: (phony_dim_0: 100, phony_dim_1: 100, phony_dim_2: 100) + Dimensions without coordinates: phony_dim_0, phony_dim_1, phony_dim_2 + Data variables: + d_8_chunks (phony_dim_0, phony_dim_1, phony_dim_2) float32 4MB ManifestA... + + >>> vds2 = open_virtual_dataset("https://github.com/OPENDAP/bes/raw/3e518f6dc2f625b0b83cfb6e6fd5275e4d6dcef1/modules/dmrpp_module/data/dmrpp/chunked_threeD.h5.dmrpp", filetype="dmrpp", indexes={}) + >>> vds2 + Size: 4MB + Dimensions: (phony_dim_0: 100, phony_dim_1: 100, phony_dim_2: 100) + Dimensions without coordinates: phony_dim_0, phony_dim_1, phony_dim_2 + Data variables: + d_8_chunks (phony_dim_0, phony_dim_1, phony_dim_2) float32 4MB ManifestA... 
+ """ + if group is not None: + # group = "/" + group.strip("/") # ensure group is in form "/a/b" + group = os.path.normpath(group).removeprefix( + "/" + ) # ensure group is in form "a/b/c" + if self._is_hdf5(self.root): + return self._parse_hdf5_dataset(self.root, group, indexes) + if self.data_filepath.endswith(".nc"): + return self._parse_netcdf4_dataset(self.root, group, indexes) + raise ValueError("DMR file must be HDF5 or netCDF4 based") + + def _parse_netcdf4_dataset( + self, + root: ET.Element, + group: Optional[str] = None, + indexes: Mapping[str, Index] = {}, + ) -> Dataset: + """ + Parse the dataset from the netcdf4 based dmrpp with groups, starting at the given group. + Set root to the given group. + + Parameters + ---------- + root : ET.Element + The root element of the DMR file. + + group : str + The group to parse. If None, and no groups are present, the dataset is parsed. + If None and groups are present, the first group is parsed. + + Returns + ------- + xr.Dataset + """ + group_tags = root.findall("dap:Group", self._ns) + if len(group_tags) == 0: + if group is not None: + # no groups found and group specified -> warning + warnings.warn( + "No groups found in NetCDF4 DMR file; ignoring group parameter" + ) + # no groups found and no group specified -> parse dataset + return self._parse_dataset(root, indexes) + all_groups = self._split_netcdf4(root) + if group is None: + # groups found and no group specified -> parse first group + return self._parse_dataset(group_tags[0], indexes) + if group in all_groups: + # groups found and group specified -> parse specified group + return self._parse_dataset(all_groups[group], indexes) + else: + # groups found and specified group not found -> error + raise ValueError(f"Group {group} not found in NetCDF4 DMR file") + + def _split_netcdf4(self, root: ET.Element) -> dict[str, ET.Element]: + """ + Split the input element into several ET.Elements by netcdf4 group + E.g. {"left": , "right": } + + Parameters + ---------- + root : ET.Element + The root element of the DMR file. + + Returns + ------- + dict[str, ET.Element] + """ + group_tags = root.findall("dap:Group", self._ns) + all_groups: dict[str, ET.Element] = defaultdict( + lambda: ET.Element(root.tag, root.attrib) + ) + for group_tag in group_tags: + all_groups[os.path.normpath(group_tag.attrib["name"])] = group_tag + return all_groups + + def _is_hdf5(self, root: ET.Element) -> bool: + """Check if the DMR file is HDF5 based.""" + if root.find(".//dap:Attribute[@name='fullnamepath']", self._ns) is not None: + return True + if root.find("./dap:Attribute[@name='HDF5_GLOBAL']", self._ns) is not None: + return True + return False + + def _parse_hdf5_dataset( + self, + root: ET.Element, + group: Optional[str] = None, + indexes: Mapping[str, Index] = {}, + ) -> Dataset: + """ + Parse the dataset from the HDF5 based dmrpp with groups, starting at the given group. + Set root to the given group. + + Parameters + ---------- + root : ET.Element + The root element of the DMR file. + + group : str + The group to parse. If None, and no groups are present, the dataset is parsed. + If None and groups are present, the first group is parsed. + + indexes : Mapping[str, Index], default is {} + Indexes to use on the returned xarray Dataset. 
+ Default is {} which will avoid creating any indexes + + Returns + ------- + xr.Dataset + """ + all_groups = self._split_hdf5(root=root) + if len(all_groups) == 0: + raise ValueError("No groups found in HDF based dmrpp file") + if group is None: + # pick a random group if no group is specified + group = next(iter(all_groups)) + attrs = {} + for attr_tag in root.iterfind("dap:Attribute", self._ns): + if attr_tag.attrib["type"] != "Container": + attrs.update(self._parse_attribute(attr_tag)) + if group in all_groups: + # replace aliased variable names with original names: gt1r_heights -> heights + orignames = self._find_original_names(all_groups[group]) + vds = self._parse_dataset(all_groups[group], indexes) + # Only one group so found attrs are global attrs + if len(all_groups) == 1: + vds.attrs.update(attrs) + return vds.rename(orignames) + raise ValueError(f"Group {group} not found in HDF5 dmrpp file") + + def _find_original_names(self, root: ET.Element) -> dict[str, str]: + """ + Find the original variable names from the HDF based groups. E.g. gt1r_heights -> heights + + E.g. if the variable name is 'gt1r_heights', the original name is 'heights' from the group 'gt1r'. + + Parameters + ---------- + root : ET.Element + The root element of the DMR file. + + Returns + ------- + dict[str, str] + """ + + orignames: dict[str, str] = {} + vars_tags: list[ET.Element] = [] + for dap_dtype in self._dap_np_dtype: + vars_tags += root.findall(f"dap:{dap_dtype}", self._ns) + for var_tag in vars_tags: + origname_tag = var_tag.find( + "./dap:Attribute[@name='origname']/dap:Value", self._ns + ) + if origname_tag is not None and origname_tag.text is not None: + orignames[var_tag.attrib["name"]] = origname_tag.text + return orignames + + def _split_hdf5(self, root: ET.Element) -> dict[str, ET.Element]: + """ + Split the input element into several ET.Elements by HDF5 group + E.g. {"gtr1/heights": , "gtr1/temperatures": }. Builds up new elements + each with dimensions, variables, and attributes. + + Parameters + ---------- + root : ET.Element + The root element of the DMR file. 
+
+        Returns
+        -------
+        dict[str, ET.Element]
+        """
+        # Add all variable, dimension, and attribute tags to their respective groups
+        groups_roots: dict[str, ET.Element] = defaultdict(
+            lambda: ET.Element(root.tag, root.attrib)
+        )
+        group_dims: dict[str, set[str]] = defaultdict(
+            set
+        )  # {"gt1r/heights": {"dim1", "dim2", ...}}
+        vars_tags: list[ET.Element] = []
+        for dap_dtype in self._dap_np_dtype:
+            vars_tags += root.findall(f"dap:{dap_dtype}", self._ns)
+        # Variables
+        for var_tag in vars_tags:
+            fullname_tag = var_tag.find(
+                "./dap:Attribute[@name='fullnamepath']/dap:Value", self._ns
+            )
+            if fullname_tag is not None and fullname_tag.text is not None:
+                # '/gt1r/heights/ph_id_pulse' -> 'gt1r/heights'
+                group_name = os.path.dirname(fullname_tag.text).removeprefix("/")
+                groups_roots[group_name].append(var_tag)
+                dim_tags = var_tag.findall("dap:Dim", self._ns)
+                dims = self._parse_multi_dims(dim_tags)
+                group_dims[group_name].update(dims.keys())
+        # Dimensions
+        for dim_tag in root.iterfind("dap:Dimension", self._ns):
+            for g, d in group_dims.items():
+                if dim_tag.attrib["name"] in d:
+                    groups_roots[g].append(dim_tag)
+        # Attributes
+        container_attr_tag = root.find("dap:Attribute[@name='HDF5_GLOBAL']", self._ns)
+        if container_attr_tag is None:
+            attrs_tags = root.findall("dap:Attribute", self._ns)
+            for attr_tag in attrs_tags:
+                fullname_tag = attr_tag.find(
+                    "./dap:Attribute[@name='fullnamepath']/dap:Value", self._ns
+                )
+                if fullname_tag is not None and fullname_tag.text is not None:
+                    group_name = os.path.dirname(fullname_tag.text).removeprefix("/")
+                    # Add all attributes to the new dataset
+                    groups_roots[group_name].extend(attr_tag)
+        else:
+            groups_roots[next(iter(groups_roots))].extend(container_attr_tag)
+        return groups_roots
+
+    def _parse_dataset(
+        self, root: ET.Element, indexes: Mapping[str, Index] = {}
+    ) -> Dataset:
+        """
+        Parse the dataset using the root element of the DMR file.
+
+        Parameters
+        ----------
+        root : ET.Element
+            The root element of the DMR file.
+
+        Returns
+        -------
+        xr.Dataset
+        """
+        # Dimension names and sizes
+        dim_tags = root.findall("dap:Dimension", self._ns)
+        dataset_dims = self._parse_multi_dims(dim_tags)
+        # Data variables and coordinates
+        coord_names = self._find_coord_names(root)
+        # if no coord_names are found or coords don't include dims, dims are used as coords
+        if len(coord_names) == 0 or len(coord_names) < len(dataset_dims):
+            coord_names = set(dataset_dims.keys())
+        # Separate and parse coords + data variables
+        coord_vars: dict[str, Variable] = {}
+        data_vars: dict[str, Variable] = {}
+        for var_tag in self._find_var_tags(root):
+            variable = self._parse_variable(var_tag, dataset_dims)
+            if var_tag.attrib["name"] in coord_names:
+                coord_vars[var_tag.attrib["name"]] = variable
+            else:
+                data_vars[var_tag.attrib["name"]] = variable
+        # Attributes
+        attrs: dict[str, str] = {}
+        for attr_tag in self.root.iterfind("dap:Attribute", self._ns):
+            attrs.update(self._parse_attribute(attr_tag))
+        return Dataset(
+            data_vars=data_vars,
+            coords=Coordinates(coords=coord_vars, indexes=indexes),
+            attrs=attrs,
+        )
+
+    def _find_var_tags(self, root: ET.Element) -> list[ET.Element]:
+        """
+        Find all variable tags in the DMR file. Also known as array tags.
+        Tags are labeled with the DAP data type. E.g. <Float32>, <Int16>, <String>
+
+        Parameters
+        ----------
+        root : ET.Element
+            The root element of the DMR file.
+
+        Returns
+        -------
+        list[ET.Element]
+        """
+        vars_tags: list[ET.Element] = []
+        for dap_dtype in self._dap_np_dtype:
+            vars_tags += root.findall(f"dap:{dap_dtype}", self._ns)
+        return vars_tags
+
+    def _find_coord_names(self, root: ET.Element) -> set[str]:
+        """
+        Find the name of all coordinates in root. Checks inside all variables and global attributes.
+
+        Parameters
+        ----------
+        root : ET.Element
+            The root element of the DMR file.
+
+        Returns
+        -------
+        set[str] : The set of unique coordinate names.
+        """
+        # Check for coordinate names within each variable's attributes
+        coord_names: set[str] = set()
+        for var_tag in self._find_var_tags(root):
+            coord_tag = var_tag.find(
+                "./dap:Attribute[@name='coordinates']/dap:Value", self._ns
+            )
+            if coord_tag is not None and coord_tag.text is not None:
+                coord_names.update(coord_tag.text.split(" "))
+            for map_tag in var_tag.iterfind("dap:Map", self._ns):
+                coord_names.add(map_tag.attrib["name"].removeprefix("/"))
+        # Check for coordinate names in a global attribute
+        coord_tag = var_tag.find("./dap:Attribute[@name='coordinates']", self._ns)
+        if coord_tag is not None and coord_tag.text is not None:
+            coord_names.update(coord_tag.text.split(" "))
+        return coord_names
+
+    def _parse_dim(self, root: ET.Element) -> dict[str, int | None]:
+        """
+        Parse a single <Dim> or <Dimension> tag
+
+        If the tag has no name attribute, it is a phony dimension. E.g. <Dim size="300"/> --> {"phony_dim": 300}
+        If the tag has no size attribute, it is an unlimited dimension. E.g. <Dim name="/time"/> --> {"time": None}
+        If the tag has both name and size attributes, it is a regular dimension. E.g. <Dim name="/lat" size="1447"/> --> {"lat": 1447}
+
+        Parameters
+        ----------
+        root : ET.Element
+            The root element Dim/Dimension tag
+
+        Returns
+        -------
+        dict
+            E.g. {"time": 1, "lat": 1447, "lon": 2895}, {"phony_dim": 300}, {"time": None, "lat": None, "lon": None}
+        """
+        if "name" not in root.attrib and "size" in root.attrib:
+            return {"phony_dim": int(root.attrib["size"])}
+        if "name" in root.attrib and "size" not in root.attrib:
+            return {os.path.basename(root.attrib["name"]): None}
+        if "name" in root.attrib and "size" in root.attrib:
+            return {os.path.basename(root.attrib["name"]): int(root.attrib["size"])}
+        raise ValueError("Not enough information to parse Dim/Dimension tag")
+
+    def _parse_multi_dims(
+        self, dim_tags: list[ET.Element], global_dims: dict[str, int] = {}
+    ) -> dict:
+        """
+        Parse multiple <Dim> or <Dimension> tags. Generally <Dim> tags are found within dmrpp variable tags.
+
+        Returns the best possible matching of {dimension: shape} present in the list and global_dims. E.g. tags=(Dim("lat", None), Dim("lon", None)) and global_dims={"lat": 100, "lon": 100, "time": 5} --> {"lat": 100, "lon": 100}
+
+        E.g. tags=(Dim("time", None), Dim("", 200)) and global_dims={"lat": 100, "lon": 100, "time": 5} --> {"time": 5, "phony_dim0": 200}
+
+        This function is often used to fill in missing sizes from the global_dims. E.g. variable tags may contain only dimension names and not sizes. If the {name: size} matching is known from the global_dims, it is used to fill in the missing sizes.
+
+        Parameters
+        ----------
+        dim_tags : tuple[ET.Element]
+            A tuple of ElementTree Elements representing dimensions in the DMR file.
+
+        global_dims : dict
+            A dictionary of dimension names and sizes. E.g. {"time": 1, "lat": 1447, "lon": 2895}
+
+        Returns
+        -------
+        dict
+            E.g.
{"time": 1, "lat": 1447, "lon": 2895} + """ + dims: dict[str, int | None] = {} + for dim_tag in dim_tags: + dim: dict[str, int | None] = self._parse_dim(dim_tag) + if "phony_dim" in dim: + dims["phony_dim_" + str(len(dims))] = dim["phony_dim"] + else: + dims.update(dim) + for name, size in list(dims.items()): + if name in global_dims and size is None: + dims[name] = global_dims[name] + return dims + + def _parse_variable( + self, var_tag: ET.Element, dataset_dims: dict[str, int] + ) -> Variable: + """ + Parse a variable from a DMR tag. + + Parameters + ---------- + var_tag : ET.Element + An ElementTree Element representing a variable in the DMR file. Will have DAP dtype as tag. + + dataset_dims : dict + A dictionary of dimension names and sizes. E.g. {"time": 1, "lat": 1447, "lon": 2895} + Must contain at least all the dimensions used by the variable. Necessary since the variable + metadata only contains the dimension names and not the sizes. + + Returns + ------- + xr.Variable + """ + # Dimension names + dim_tags = var_tag.findall("dap:Dim", self._ns) + dim_shapes = self._parse_multi_dims(dim_tags, dataset_dims) + # convert DAP dtype to numpy dtype + dtype = np.dtype( + self._dap_np_dtype[var_tag.tag.removeprefix("{" + self._ns["dap"] + "}")] + ) + # Chunks and Filters + filters = None + shape: tuple[int, ...] = tuple(dim_shapes.values()) + chunks_shape = shape + chunks_tag = var_tag.find("dmr:chunks", self._ns) + if chunks_tag is not None: + # Chunks + found_chunk_dims = self._parse_chunks_dimensions(chunks_tag) + chunks_shape = found_chunk_dims if found_chunk_dims is not None else shape + chunkmanifest = self._parse_chunks(chunks_tag, chunks_shape) + # Filters + filters = self._parse_filters(chunks_tag, dtype) + # Attributes + attrs: dict[str, Any] = {} + for attr_tag in var_tag.iterfind("dap:Attribute", self._ns): + attrs.update(self._parse_attribute(attr_tag)) + # Fill value is placed in encoding and thus removed from attributes + fill_value = attrs.pop("_FillValue", 0.0) + # Remove attributes only used for parsing logic + attrs.pop("fullnamepath", None) + attrs.pop("origname", None) + attrs.pop("coordinates", None) + # create ManifestArray and ZArray + zarray = ZArray( + chunks=chunks_shape, + dtype=dtype, + fill_value=fill_value, + filters=filters, + order="C", + shape=shape, + ) + marr = ManifestArray(zarray=zarray, chunkmanifest=chunkmanifest) + encoding = {k: attrs.get(k) for k in self._encoding_keys if k in attrs} + return Variable( + dims=dim_shapes.keys(), data=marr, attrs=attrs, encoding=encoding + ) + + def _parse_attribute(self, attr_tag: ET.Element) -> dict[str, Any]: + """ + Parse an attribute from a DMR attr tag. Converts the attribute value to a native python type. + + Parameters + ---------- + attr_tag : ET.Element + An ElementTree Element with an tag. 
+ + Returns + ------- + dict + """ + attr: dict[str, Any] = {} + values = [] + if "type" in attr_tag.attrib and attr_tag.attrib["type"] == "Container": + return attr + dtype = np.dtype(self._dap_np_dtype[attr_tag.attrib["type"]]) + # if multiple Value tags are present, store as "key": "[v1, v2, ...]" + for value_tag in attr_tag: + # cast attribute to native python type using dmr provided dtype + val = ( + dtype.type(value_tag.text).item() + if dtype != np.object_ + else value_tag.text + ) + if val == "*": + val = np.nan + values.append(val) + attr[attr_tag.attrib["name"]] = values[0] if len(values) == 1 else values + return attr + + def _parse_filters( + self, chunks_tag: ET.Element, dtype: np.dtype + ) -> list[dict] | None: + """ + Parse filters from a DMR chunks tag. + + Parameters + ---------- + chunks_tag : ET.Element + An ElementTree Element with a tag. + + dtype : np.dtype + The numpy dtype of the variable. + + Returns + ------- + list[dict] | None + E.g. [{"id": "shuffle", "elementsize": 4}, {"id": "zlib", "level": 4}] + """ + if "compressionType" in chunks_tag.attrib: + filters: list[dict] = [] + # shuffle deflate --> ["shuffle", "deflate"] + compression_types = chunks_tag.attrib["compressionType"].split(" ") + for c in compression_types: + if c == "shuffle": + filters.append({"id": "shuffle", "elementsize": dtype.itemsize}) + elif c == "deflate": + filters.append( + { + "id": "zlib", + "level": int( + chunks_tag.attrib.get( + "deflateLevel", self._default_zlib_value + ) + ), + } + ) + return filters + return None + + def _parse_chunks_dimensions( + self, chunks_tag: ET.Element + ) -> tuple[int, ...] | None: + """ + Parse the chunk dimensions from a DMR chunks tag. Returns None if no chunk dimensions are found. + + Parameters + ---------- + chunks_tag : ET.Element + An ElementTree Element with a tag. + + Returns + ------- + tuple[int, ...] | None + + """ + chunk_dim_tag = chunks_tag.find("dmr:chunkDimensionSizes", self._ns) + if chunk_dim_tag is not None and chunk_dim_tag.text is not None: + # 1 1447 2895 -> (1, 1447, 2895) + return tuple(map(int, chunk_dim_tag.text.split())) + return None + + def _parse_chunks( + self, chunks_tag: ET.Element, chunks_shape: tuple[int, ...] + ) -> ChunkManifest: + """ + Parse the chunk manifest from a DMR chunks tag. + + Parameters + ---------- + chunks_tag : ET.Element + An ElementTree Element with a tag. + + chunks_shape : tuple + Chunk sizes for each dimension. E.g. 
(1, 1447, 2895) + + Returns + ------- + ChunkManifest + """ + chunkmanifest: dict[ChunkKey, object] = {} + default_num: list[int] = ( + [0 for i in range(len(chunks_shape))] if chunks_shape else [0] + ) + chunk_key_template = ".".join(["{}" for i in range(len(default_num))]) + for chunk_tag in chunks_tag.iterfind("dmr:chunk", self._ns): + chunk_num = default_num + if "chunkPositionInArray" in chunk_tag.attrib: + # "[0,1023,10235]" -> ["0","1023","10235"] + chunk_pos = chunk_tag.attrib["chunkPositionInArray"][1:-1].split(",") + # [0,1023,10235] // [1, 1023, 2047] -> [0,1,5] + chunk_num = [ + int(chunk_pos[i]) // chunks_shape[i] + for i in range(len(chunks_shape)) + ] + # [0,1,5] -> "0.1.5" + chunk_key = ChunkKey(chunk_key_template.format(*chunk_num)) + chunkmanifest[chunk_key] = { + "path": self.data_filepath, + "offset": int(chunk_tag.attrib["offset"]), + "length": int(chunk_tag.attrib["nBytes"]), + } + return ChunkManifest(entries=chunkmanifest) diff --git a/virtualizarr/readers/fits.py b/virtualizarr/readers/fits.py new file mode 100644 index 00000000..618d81cd --- /dev/null +++ b/virtualizarr/readers/fits.py @@ -0,0 +1,59 @@ +from typing import Iterable, Mapping, Optional + +from xarray import Dataset +from xarray.core.indexes import Index + +from virtualizarr.readers.common import ( + VirtualBackend, + construct_virtual_dataset, + open_loadable_vars_and_indexes, +) +from virtualizarr.translators.kerchunk import ( + extract_group, + virtual_vars_and_metadata_from_kerchunk_refs, +) +from virtualizarr.types.kerchunk import KerchunkStoreRefs + + +class FITSVirtualBackend(VirtualBackend): + @staticmethod + def open_virtual_dataset( + filepath: str, + group: str | None = None, + drop_variables: Iterable[str] | None = None, + loadable_variables: Iterable[str] | None = None, + decode_times: bool | None = None, + indexes: Mapping[str, Index] | None = None, + reader_options: Optional[dict] = None, + ) -> Dataset: + from kerchunk.fits import process_file + + # handle inconsistency in kerchunk, see GH issue https://github.com/zarr-developers/VirtualiZarr/issues/160 + refs = KerchunkStoreRefs({"refs": process_file(filepath, **reader_options)}) + + refs = extract_group(refs, group) + + virtual_vars, attrs, coord_names = virtual_vars_and_metadata_from_kerchunk_refs( + refs, + loadable_variables, + drop_variables, + ) + + # TODO this wouldn't work until you had an xarray backend for FITS installed + loadable_vars, indexes = open_loadable_vars_and_indexes( + filepath, + loadable_variables=loadable_variables, + reader_options=reader_options, + drop_variables=drop_variables, + indexes=indexes, + group=group, + decode_times=decode_times, + ) + + return construct_virtual_dataset( + virtual_vars=virtual_vars, + loadable_vars=loadable_vars, + indexes=indexes, + coord_names=coord_names, + attrs=attrs, + ) diff --git a/virtualizarr/readers/hdf5.py b/virtualizarr/readers/hdf5.py new file mode 100644 index 00000000..c0d38e20 --- /dev/null +++ b/virtualizarr/readers/hdf5.py @@ -0,0 +1,64 @@ +from typing import Iterable, Mapping, Optional + +from xarray import Dataset +from xarray.core.indexes import Index + +from virtualizarr.readers.common import ( + VirtualBackend, + construct_virtual_dataset, + open_loadable_vars_and_indexes, +) +from virtualizarr.translators.kerchunk import ( + extract_group, + virtual_vars_and_metadata_from_kerchunk_refs, +) +from virtualizarr.utils import check_for_collisions + + +class HDF5VirtualBackend(VirtualBackend): + @staticmethod + def open_virtual_dataset( + filepath: str, + group: 
str | None = None, + drop_variables: Iterable[str] | None = None, + loadable_variables: Iterable[str] | None = None, + decode_times: bool | None = None, + indexes: Mapping[str, Index] | None = None, + reader_options: Optional[dict] = None, + ) -> Dataset: + from kerchunk.hdf import SingleHdf5ToZarr + + drop_variables, loadable_variables = check_for_collisions( + drop_variables, + loadable_variables, + ) + + refs = SingleHdf5ToZarr( + filepath, inline_threshold=0, **reader_options + ).translate() + + refs = extract_group(refs, group) + + virtual_vars, attrs, coord_names = virtual_vars_and_metadata_from_kerchunk_refs( + refs, + loadable_variables, + drop_variables, + ) + + loadable_vars, indexes = open_loadable_vars_and_indexes( + filepath, + loadable_variables=loadable_variables, + reader_options=reader_options, + drop_variables=drop_variables, + indexes=indexes, + group=group, + decode_times=decode_times, + ) + + return construct_virtual_dataset( + virtual_vars=virtual_vars, + loadable_vars=loadable_vars, + indexes=indexes, + coord_names=coord_names, + attrs=attrs, + ) diff --git a/virtualizarr/readers/kerchunk.py b/virtualizarr/readers/kerchunk.py new file mode 100644 index 00000000..35fa4932 --- /dev/null +++ b/virtualizarr/readers/kerchunk.py @@ -0,0 +1,69 @@ +from typing import Iterable, Mapping, Optional + +import ujson +from xarray import Dataset +from xarray.core.indexes import Index + +from virtualizarr.readers.common import VirtualBackend +from virtualizarr.translators.kerchunk import dataset_from_kerchunk_refs +from virtualizarr.types.kerchunk import ( + KerchunkStoreRefs, +) +from virtualizarr.utils import _FsspecFSFromFilepath, check_for_collisions + + +class KerchunkVirtualBackend(VirtualBackend): + @staticmethod + def open_virtual_dataset( + filepath: str, + group: str | None = None, + drop_variables: Iterable[str] | None = None, + loadable_variables: Iterable[str] | None = None, + decode_times: bool | None = None, + indexes: Mapping[str, Index] | None = None, + reader_options: Optional[dict] = None, + ) -> Dataset: + """Reads existing kerchunk references (in JSON or parquet) format.""" + + if group: + raise NotImplementedError() + + loadable_variables, drop_variables = check_for_collisions( + drop_variables=drop_variables, + loadable_variables=loadable_variables, + ) + + if loadable_variables or indexes or decode_times: + raise NotImplementedError() + + fs = _FsspecFSFromFilepath(filepath=filepath, reader_options=reader_options) + + # The kerchunk .parquet storage format isn't actually a parquet, but a directory that contains named parquets for each group/variable. + if fs.filepath.endswith("ref.parquet"): + from fsspec.implementations.reference import LazyReferenceMapper + + lrm = LazyReferenceMapper(filepath, fs.fs) + + # build reference dict from KV pairs in LazyReferenceMapper + # is there a better / more preformant way to extract this? + array_refs = {k: lrm[k] for k in lrm.keys()} + + full_reference = {"refs": array_refs} + + vds = dataset_from_kerchunk_refs(KerchunkStoreRefs(full_reference)) + + # JSON has no magic bytes, but the Kerchunk version 1 spec starts with 'version': + # https://fsspec.github.io/kerchunk/spec.html + elif fs.read_bytes(9).startswith(b'{"version'): + with fs.open_file() as of: + refs = ujson.load(of) + + vds = dataset_from_kerchunk_refs(KerchunkStoreRefs(refs)) + + else: + raise ValueError( + "The input Kerchunk reference did not seem to be in Kerchunk's JSON or Parquet spec: https://fsspec.github.io/kerchunk/spec.html. 
The Kerchunk format autodetection is quite flaky, so if your reference matches the Kerchunk spec feel free to open an issue: https://github.com/zarr-developers/VirtualiZarr/issues" + ) + + # TODO would be more efficient to drop these before converting them into ManifestArrays, i.e. drop them from the kerchunk refs dict + return vds.drop_vars(drop_variables) diff --git a/virtualizarr/readers/netcdf3.py b/virtualizarr/readers/netcdf3.py new file mode 100644 index 00000000..30c6746e --- /dev/null +++ b/virtualizarr/readers/netcdf3.py @@ -0,0 +1,62 @@ +from typing import Iterable, Mapping, Optional + +from xarray import Dataset +from xarray.core.indexes import Index + +from virtualizarr.readers.common import ( + VirtualBackend, + construct_virtual_dataset, + open_loadable_vars_and_indexes, +) +from virtualizarr.translators.kerchunk import ( + extract_group, + virtual_vars_and_metadata_from_kerchunk_refs, +) +from virtualizarr.utils import check_for_collisions + + +class NetCDF3VirtualBackend(VirtualBackend): + @staticmethod + def open_virtual_dataset( + filepath: str, + group: str | None = None, + drop_variables: Iterable[str] | None = None, + loadable_variables: Iterable[str] | None = None, + decode_times: bool | None = None, + indexes: Mapping[str, Index] | None = None, + reader_options: Optional[dict] = None, + ) -> Dataset: + from kerchunk.netCDF3 import NetCDF3ToZarr + + drop_variables, loadable_variables = check_for_collisions( + drop_variables, + loadable_variables, + ) + + refs = NetCDF3ToZarr(filepath, inline_threshold=0, **reader_options).translate() + + refs = extract_group(refs, group) + + virtual_vars, attrs, coord_names = virtual_vars_and_metadata_from_kerchunk_refs( + refs, + loadable_variables, + drop_variables, + ) + + loadable_vars, indexes = open_loadable_vars_and_indexes( + filepath, + loadable_variables=loadable_variables, + reader_options=reader_options, + drop_variables=drop_variables, + indexes=indexes, + group=group, + decode_times=decode_times, + ) + + return construct_virtual_dataset( + virtual_vars=virtual_vars, + loadable_vars=loadable_vars, + indexes=indexes, + coord_names=coord_names, + attrs=attrs, + ) diff --git a/virtualizarr/readers/tiff.py b/virtualizarr/readers/tiff.py new file mode 100644 index 00000000..bb32e647 --- /dev/null +++ b/virtualizarr/readers/tiff.py @@ -0,0 +1,73 @@ +import warnings +from typing import Iterable, Mapping, Optional + +from xarray import Dataset +from xarray.core.indexes import Index + +from virtualizarr.readers.common import ( + VirtualBackend, + construct_virtual_dataset, + open_loadable_vars_and_indexes, +) +from virtualizarr.translators.kerchunk import ( + extract_group, + virtual_vars_and_metadata_from_kerchunk_refs, +) +from virtualizarr.types.kerchunk import KerchunkStoreRefs +from virtualizarr.utils import check_for_collisions + + +class TIFFVirtualBackend(VirtualBackend): + @staticmethod + def open_virtual_dataset( + filepath: str, + group: str | None = None, + drop_variables: Iterable[str] | None = None, + loadable_variables: Iterable[str] | None = None, + decode_times: bool | None = None, + indexes: Mapping[str, Index] | None = None, + reader_options: Optional[dict] = None, + ) -> Dataset: + from kerchunk.tiff import tiff_to_zarr + + drop_variables, loadable_variables = check_for_collisions( + drop_variables=drop_variables, loadable_variables=loadable_variables + ) + + if reader_options is None: + reader_options = {} + + reader_options.pop("storage_options", {}) + warnings.warn( + "storage_options have been dropped from 
reader_options as they are not supported by kerchunk.tiff.tiff_to_zarr", + UserWarning, + ) + + # handle inconsistency in kerchunk, see GH issue https://github.com/zarr-developers/VirtualiZarr/issues/160 + refs = KerchunkStoreRefs({"refs": tiff_to_zarr(filepath, **reader_options)}) + + refs = extract_group(refs, group) + + virtual_vars, attrs, coord_names = virtual_vars_and_metadata_from_kerchunk_refs( + refs, + loadable_variables, + drop_variables, + ) + + loadable_vars, indexes = open_loadable_vars_and_indexes( + filepath, + loadable_variables=loadable_variables, + reader_options=reader_options, + drop_variables=drop_variables, + indexes=indexes, + group=group, + decode_times=decode_times, + ) + + return construct_virtual_dataset( + virtual_vars=virtual_vars, + loadable_vars=loadable_vars, + indexes=indexes, + coord_names=coord_names, + attrs=attrs, + ) diff --git a/virtualizarr/readers/zarr_v3.py b/virtualizarr/readers/zarr_v3.py new file mode 100644 index 00000000..6da81581 --- /dev/null +++ b/virtualizarr/readers/zarr_v3.py @@ -0,0 +1,154 @@ +import json +from pathlib import Path +from typing import Iterable, Mapping, Optional + +import numcodecs +import numpy as np +from xarray import Dataset +from xarray.core.indexes import Index +from xarray.core.variable import Variable + +from virtualizarr.manifests import ChunkManifest, ManifestArray +from virtualizarr.readers.common import VirtualBackend, separate_coords +from virtualizarr.zarr import ZArray + + +class ZarrV3VirtualBackend(VirtualBackend): + @staticmethod + def open_virtual_dataset( + filepath: str, + group: str | None = None, + drop_variables: Iterable[str] | None = None, + loadable_variables: Iterable[str] | None = None, + decode_times: bool | None = None, + indexes: Mapping[str, Index] | None = None, + reader_options: Optional[dict] = None, + ) -> Dataset: + """ + Read a Zarr v3 store containing chunk manifests and return an xarray Dataset containing virtualized arrays. + + This is experimental - chunk manifests are not part of the Zarr v3 Spec. 
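+
+        A minimal sketch of the expected call (hypothetical local store path):
+
+            vds = ZarrV3VirtualBackend.open_virtual_dataset("store.zarr", indexes={})
+
+        Note that `indexes={}` must currently be passed explicitly, since `indexes=None`
+        raises NotImplementedError.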
+ """ + storepath = Path(filepath) + + if group: + raise NotImplementedError() + + if loadable_variables or decode_times: + raise NotImplementedError() + + if reader_options: + raise NotImplementedError() + + drop_vars: list[str] + if drop_variables is None: + drop_vars = [] + else: + drop_vars = list(drop_variables) + + ds_attrs = attrs_from_zarr_group_json(storepath / "zarr.json") + coord_names = ds_attrs.pop("coordinates", []) + + # TODO recursive glob to create a datatree + # Note: this .is_file() check should not be necessary according to the pathlib docs, but tests fail on github CI without it + # see https://github.com/TomNicholas/VirtualiZarr/pull/45#discussion_r1547833166 + all_paths = storepath.glob("*/") + directory_paths = [p for p in all_paths if not p.is_file()] + + vars = {} + for array_dir in directory_paths: + var_name = array_dir.name + if var_name in drop_vars: + break + + zarray, dim_names, attrs = metadata_from_zarr_json(array_dir / "zarr.json") + manifest = ChunkManifest.from_zarr_json(str(array_dir / "manifest.json")) + + marr = ManifestArray(chunkmanifest=manifest, zarray=zarray) + var = Variable(data=marr, dims=dim_names, attrs=attrs) + vars[var_name] = var + + if indexes is None: + raise NotImplementedError() + elif indexes != {}: + # TODO allow manual specification of index objects + raise NotImplementedError() + else: + indexes = dict(**indexes) # for type hinting: to allow mutation + + data_vars, coords = separate_coords(vars, indexes, coord_names) + + ds = Dataset( + data_vars, + coords=coords, + # indexes={}, # TODO should be added in a later version of xarray + attrs=ds_attrs, + ) + + return ds + + +def attrs_from_zarr_group_json(filepath: Path) -> dict: + with open(filepath) as metadata_file: + attrs = json.load(metadata_file) + return attrs["attributes"] + + +def metadata_from_zarr_json(filepath: Path) -> tuple[ZArray, list[str], dict]: + with open(filepath) as metadata_file: + metadata = json.load(metadata_file) + + if { + "name": "chunk-manifest-json", + "configuration": { + "manifest": "./manifest.json", + }, + } not in metadata.get("storage_transformers", []): + raise ValueError( + "Can only read byte ranges from Zarr v3 stores which implement the manifest storage transformer ZEP." + ) + + attrs = metadata.pop("attributes") + dim_names = metadata.pop("dimension_names") + + chunk_shape = tuple(metadata["chunk_grid"]["configuration"]["chunk_shape"]) + shape = tuple(metadata["shape"]) + zarr_format = metadata["zarr_format"] + + if metadata["fill_value"] is None: + raise ValueError( + "fill_value must be specified https://zarr-specs.readthedocs.io/en/latest/v3/core/v3.0.html#fill-value" + ) + else: + fill_value = metadata["fill_value"] + + all_codecs = [ + codec + for codec in metadata["codecs"] + if codec["name"] not in ("transpose", "bytes") + ] + compressor, *filters = [ + _configurable_to_num_codec_config(_filter) for _filter in all_codecs + ] + zarray = ZArray( + chunks=chunk_shape, + compressor=compressor, + dtype=np.dtype(metadata["data_type"]), + fill_value=fill_value, + filters=filters or None, + order="C", + shape=shape, + zarr_format=zarr_format, + ) + + return zarray, dim_names, attrs + + +def _configurable_to_num_codec_config(configurable: dict) -> dict: + """ + Convert a zarr v3 configurable into a numcodecs codec. 
+ """ + configurable_copy = configurable.copy() + codec_id = configurable_copy.pop("name") + configuration = configurable_copy.pop("configuration") + return numcodecs.get_codec({"id": codec_id, **configuration}).get_config() diff --git a/virtualizarr/tests/__init__.py b/virtualizarr/tests/__init__.py index 3856a6ba..70f613ce 100644 --- a/virtualizarr/tests/__init__.py +++ b/virtualizarr/tests/__init__.py @@ -33,7 +33,9 @@ def _importorskip( has_astropy, requires_astropy = _importorskip("astropy") +has_kerchunk, requires_kerchunk = _importorskip("kerchunk") has_s3fs, requires_s3fs = _importorskip("s3fs") +has_scipy, requires_scipy = _importorskip("scipy") has_tifffile, requires_tifffile = _importorskip("tifffile") @@ -48,9 +50,9 @@ def create_manifestarray( zarray = ZArray( chunks=chunks, - compressor="zlib", + compressor={"id": "blosc", "clevel": 5, "cname": "lz4", "shuffle": 1}, dtype=np.dtype("float32"), - fill_value=0.0, # TODO change this to NaN? + fill_value=0.0, filters=None, order="C", shape=shape, diff --git a/virtualizarr/tests/test_backend.py b/virtualizarr/tests/test_backend.py new file mode 100644 index 00000000..43a6bbd8 --- /dev/null +++ b/virtualizarr/tests/test_backend.py @@ -0,0 +1,427 @@ +from collections.abc import Mapping +from unittest.mock import patch + +import numpy as np +import pytest +import xarray as xr +import xarray.testing as xrt +from xarray import open_dataset +from xarray.core.indexes import Index + +from virtualizarr import open_virtual_dataset +from virtualizarr.backend import FileType, automatically_determine_filetype +from virtualizarr.manifests import ManifestArray +from virtualizarr.tests import ( + has_astropy, + has_tifffile, + network, + requires_kerchunk, + requires_s3fs, + requires_scipy, +) + + +@requires_scipy +def test_automatically_determine_filetype_netcdf3_netcdf4(): + # test the NetCDF3 vs NetCDF4 automatic file type selection + + ds = xr.Dataset({"a": (["x"], [0, 1])}) + netcdf3_file_path = "/tmp/netcdf3.nc" + netcdf4_file_path = "/tmp/netcdf4.nc" + + # write two version of NetCDF + ds.to_netcdf(netcdf3_file_path, engine="scipy", format="NETCDF3_CLASSIC") + ds.to_netcdf(netcdf4_file_path, engine="h5netcdf") + + assert FileType("netcdf3") == automatically_determine_filetype( + filepath=netcdf3_file_path + ) + assert FileType("hdf5") == automatically_determine_filetype( + filepath=netcdf4_file_path + ) + + +@pytest.mark.parametrize( + "filetype,headerbytes", + [ + ("netcdf3", b"CDF"), + ("hdf5", b"\x89HDF"), + ("grib", b"GRIB"), + ("tiff", b"II*"), + ("fits", b"SIMPLE"), + ], +) +def test_valid_filetype_bytes(tmp_path, filetype, headerbytes): + filepath = tmp_path / "file.abc" + with open(filepath, "wb") as f: + f.write(headerbytes) + assert FileType(filetype) == automatically_determine_filetype(filepath=filepath) + + +def test_notimplemented_filetype(tmp_path): + for headerbytes in [b"JUNK", b"\x0e\x03\x13\x01"]: + filepath = tmp_path / "file.abc" + with open(filepath, "wb") as f: + f.write(headerbytes) + with pytest.raises(NotImplementedError): + automatically_determine_filetype(filepath=filepath) + + +def test_FileType(): + # tests if FileType converts user supplied strings to correct filetype + assert "netcdf3" == FileType("netcdf3").name + assert "netcdf4" == FileType("netcdf4").name + assert "hdf4" == FileType("hdf4").name + assert "hdf5" == FileType("hdf5").name + assert "grib" == FileType("grib").name + assert "tiff" == FileType("tiff").name + assert "fits" == FileType("fits").name + assert "zarr" == FileType("zarr").name + with 
pytest.raises(ValueError): + FileType(None) + + +@requires_kerchunk +class TestOpenVirtualDatasetIndexes: + def test_no_indexes(self, netcdf4_file): + vds = open_virtual_dataset(netcdf4_file, indexes={}) + assert vds.indexes == {} + + def test_create_default_indexes(self, netcdf4_file): + with pytest.warns(UserWarning, match="will create in-memory pandas indexes"): + vds = open_virtual_dataset(netcdf4_file, indexes=None) + ds = open_dataset(netcdf4_file, decode_times=True) + + # TODO use xr.testing.assert_identical(vds.indexes, ds.indexes) instead once class supported by assertion comparison, see https://github.com/pydata/xarray/issues/5812 + assert index_mappings_equal(vds.xindexes, ds.xindexes) + + +def index_mappings_equal(indexes1: Mapping[str, Index], indexes2: Mapping[str, Index]): + # Check if the mappings have the same keys + if set(indexes1.keys()) != set(indexes2.keys()): + return False + + # Check if the values for each key are identical + for key in indexes1.keys(): + index1 = indexes1[key] + index2 = indexes2[key] + + if not index1.equals(index2): + return False + + return True + + +@requires_kerchunk +def test_cftime_index(tmpdir): + """Ensure a virtual dataset contains the same indexes as an Xarray dataset""" + # Note: Test was created to debug: https://github.com/zarr-developers/VirtualiZarr/issues/168 + ds = xr.Dataset( + data_vars={ + "tasmax": (["time", "lat", "lon"], np.random.rand(2, 18, 36)), + }, + coords={ + "time": np.array(["2023-01-01", "2023-01-02"], dtype="datetime64[ns]"), + "lat": np.arange(-90, 90, 10), + "lon": np.arange(-180, 180, 10), + }, + attrs={"attr1_key": "attr1_val"}, + ) + ds.to_netcdf(f"{tmpdir}/tmp.nc") + vds = open_virtual_dataset( + f"{tmpdir}/tmp.nc", loadable_variables=["time", "lat", "lon"], indexes={} + ) + # TODO use xr.testing.assert_identical(vds.indexes, ds.indexes) instead once class supported by assertion comparison, see https://github.com/pydata/xarray/issues/5812 + assert index_mappings_equal(vds.xindexes, ds.xindexes) + assert list(ds.coords) == list(vds.coords) + assert vds.dims == ds.dims + assert vds.attrs == ds.attrs + + +@requires_kerchunk +class TestOpenVirtualDatasetAttrs: + def test_drop_array_dimensions(self, netcdf4_file): + # regression test for GH issue #150 + vds = open_virtual_dataset(netcdf4_file, indexes={}) + assert "_ARRAY_DIMENSIONS" not in vds["air"].attrs + + def test_coordinate_variable_attrs_preserved(self, netcdf4_file): + # regression test for GH issue #155 + vds = open_virtual_dataset(netcdf4_file, indexes={}) + assert vds["lat"].attrs == { + "standard_name": "latitude", + "long_name": "Latitude", + "units": "degrees_north", + "axis": "Y", + } + + +@network +@requires_s3fs +class TestReadFromS3: + @pytest.mark.parametrize( + "filetype", ["netcdf4", None], ids=["netcdf4 filetype", "None filetype"] + ) + @pytest.mark.parametrize( + "indexes", [None, {}], ids=["None index", "empty dict index"] + ) + def test_anon_read_s3(self, filetype, indexes): + """Parameterized tests for empty vs supplied indexes and filetypes.""" + # TODO: Switch away from this s3 url after minIO is implemented. 
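+        # anonymous access is requested below via
+        # reader_options={"storage_options": {"anon": True}}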
+ fpath = "s3://carbonplan-share/virtualizarr/local.nc" + vds = open_virtual_dataset( + fpath, + filetype=filetype, + indexes=indexes, + reader_options={"storage_options": {"anon": True}}, + ) + + assert vds.dims == {"time": 2920, "lat": 25, "lon": 53} + for var in vds.variables: + assert isinstance(vds[var].data, ManifestArray), var + + +@network +class TestReadFromURL: + @pytest.mark.parametrize( + "filetype, url", + [ + ( + "grib", + "https://github.com/pydata/xarray-data/raw/master/era5-2mt-2019-03-uk.grib", + ), + ( + "netcdf3", + "https://github.com/pydata/xarray-data/raw/master/air_temperature.nc", + ), + ( + "netcdf4", + "https://github.com/pydata/xarray-data/raw/master/ROMS_example.nc", + ), + ( + "hdf4", + "https://github.com/corteva/rioxarray/raw/master/test/test_data/input/MOD09GA.A2008296.h14v17.006.2015181011753.hdf", + ), + ( + "hdf5", + "https://nisar.asf.earthdatacloud.nasa.gov/NISAR-SAMPLE-DATA/GCOV/ALOS1_Rosamond_20081012/NISAR_L2_PR_GCOV_001_005_A_219_4020_SHNA_A_20081012T060910_20081012T060926_P01101_F_N_J_001.h5", + ), + # https://github.com/zarr-developers/VirtualiZarr/issues/159 + # ("hdf5", "https://github.com/fsspec/kerchunk/raw/main/kerchunk/tests/NEONDSTowerTemperatureData.hdf5"), + pytest.param( + "tiff", + "https://github.com/fsspec/kerchunk/raw/main/kerchunk/tests/lcmap_tiny_cog_2020.tif", + marks=pytest.mark.skipif( + not has_tifffile, reason="package tifffile is not available" + ), + ), + pytest.param( + "fits", + "https://fits.gsfc.nasa.gov/samples/WFPC2u5780205r_c0fx.fits", + marks=pytest.mark.skipif( + not has_astropy, reason="package astropy is not available" + ), + ), + ( + "jpg", + "https://github.com/rasterio/rasterio/raw/main/tests/data/389225main_sw_1965_1024.jpg", + ), + ], + ) + def test_read_from_url(self, filetype, url): + if filetype in ["grib", "jpg", "hdf4"]: + with pytest.raises(NotImplementedError): + vds = open_virtual_dataset(url, reader_options={}, indexes={}) + elif filetype == "hdf5": + vds = open_virtual_dataset( + url, + group="science/LSAR/GCOV/grids/frequencyA", + drop_variables=["listOfCovarianceTerms", "listOfPolarizations"], + indexes={}, + reader_options={}, + ) + assert isinstance(vds, xr.Dataset) + else: + vds = open_virtual_dataset(url, indexes={}) + assert isinstance(vds, xr.Dataset) + + def test_virtualizarr_vs_local_nisar(self): + import fsspec + + # Open group directly from locally cached file with xarray + url = "https://nisar.asf.earthdatacloud.nasa.gov/NISAR-SAMPLE-DATA/GCOV/ALOS1_Rosamond_20081012/NISAR_L2_PR_GCOV_001_005_A_219_4020_SHNA_A_20081012T060910_20081012T060926_P01101_F_N_J_001.h5" + tmpfile = fsspec.open_local( + f"filecache::{url}", filecache=dict(cache_storage="/tmp", same_names=True) + ) + hdf_group = "science/LSAR/GCOV/grids/frequencyA" + dsXR = xr.open_dataset( + tmpfile, + engine="h5netcdf", + group=hdf_group, + drop_variables=["listOfCovarianceTerms", "listOfPolarizations"], + phony_dims="access", + ) + + # save group reference file via virtualizarr, then open with engine="kerchunk" + vds = open_virtual_dataset( + tmpfile, + group=hdf_group, + indexes={}, + drop_variables=["listOfCovarianceTerms", "listOfPolarizations"], + ) + tmpref = "/tmp/cmip6.json" + vds.virtualize.to_kerchunk(tmpref, format="json") + dsV = xr.open_dataset(tmpref, engine="kerchunk") + + # xrt.assert_identical(dsXR, dsV) #Attribute order changes + xrt.assert_equal(dsXR, dsV) + + +@requires_kerchunk +class TestLoadVirtualDataset: + def test_loadable_variables(self, netcdf4_file): + vars_to_load = ["air", "time"] + vds = 
open_virtual_dataset( + netcdf4_file, loadable_variables=vars_to_load, indexes={} + ) + + for name in vds.variables: + if name in vars_to_load: + assert isinstance(vds[name].data, np.ndarray), name + else: + assert isinstance(vds[name].data, ManifestArray), name + + full_ds = xr.open_dataset(netcdf4_file, decode_times=True) + + for name in full_ds.variables: + if name in vars_to_load: + xrt.assert_identical(vds.variables[name], full_ds.variables[name]) + + def test_explicit_filetype(self, netcdf4_file): + with pytest.raises(ValueError): + open_virtual_dataset(netcdf4_file, filetype="unknown") + + with pytest.raises(NotImplementedError): + open_virtual_dataset(netcdf4_file, filetype="grib") + + def test_group_kwarg(self, hdf5_groups_file): + with pytest.raises(ValueError, match="Multiple HDF Groups found"): + open_virtual_dataset(hdf5_groups_file) + with pytest.raises(ValueError, match="not found in"): + open_virtual_dataset(hdf5_groups_file, group="doesnt_exist") + + vars_to_load = ["air", "time"] + vds = open_virtual_dataset( + hdf5_groups_file, + group="test/group", + loadable_variables=vars_to_load, + indexes={}, + ) + full_ds = xr.open_dataset( + hdf5_groups_file, + group="test/group", + ) + for name in full_ds.variables: + if name in vars_to_load: + xrt.assert_identical(vds.variables[name], full_ds.variables[name]) + + @pytest.mark.xfail(reason="patches a function which no longer exists") + @patch("virtualizarr.translators.kerchunk.read_kerchunk_references_from_file") + def test_open_virtual_dataset_passes_expected_args( + self, mock_read_kerchunk, netcdf4_file + ): + reader_options = {"option1": "value1", "option2": "value2"} + open_virtual_dataset(netcdf4_file, indexes={}, reader_options=reader_options) + args = { + "filepath": netcdf4_file, + "filetype": None, + "group": None, + "reader_options": reader_options, + } + mock_read_kerchunk.assert_called_once_with(**args) + + def test_open_dataset_with_empty(self, hdf5_empty, tmpdir): + vds = open_virtual_dataset(hdf5_empty) + assert vds.empty.dims == () + assert vds.empty.attrs == {"empty": "true"} + + def test_open_dataset_with_scalar(self, hdf5_scalar, tmpdir): + vds = open_virtual_dataset(hdf5_scalar) + assert vds.scalar.dims == () + assert vds.scalar.attrs == {"scalar": "true"} + + +@requires_kerchunk +@pytest.mark.parametrize( + "reference_format", + ["json", "parquet", "invalid"], +) +def test_open_virtual_dataset_existing_kerchunk_refs( + tmp_path, netcdf4_virtual_dataset, reference_format +): + example_reference_dict = netcdf4_virtual_dataset.virtualize.to_kerchunk( + format="dict" + ) + + if reference_format == "invalid": + # Test invalid file format leads to ValueError + ref_filepath = tmp_path / "ref.csv" + with open(ref_filepath.as_posix(), mode="w") as of: + of.write("tmp") + + with pytest.raises(ValueError): + open_virtual_dataset( + filepath=ref_filepath.as_posix(), filetype="kerchunk", indexes={} + ) + + else: + # Test valid json and parquet reference formats + + if reference_format == "json": + ref_filepath = tmp_path / "ref.json" + + import ujson + + with open(ref_filepath, "w") as json_file: + ujson.dump(example_reference_dict, json_file) + + if reference_format == "parquet": + from kerchunk.df import refs_to_dataframe + + ref_filepath = tmp_path / "ref.parquet" + refs_to_dataframe(fo=example_reference_dict, url=ref_filepath.as_posix()) + + vds = open_virtual_dataset( + filepath=ref_filepath.as_posix(), filetype="kerchunk", indexes={} + ) + + # Inconsistent results! 
https://github.com/TomNicholas/VirtualiZarr/pull/73#issuecomment-2040931202 + # assert vds.virtualize.to_kerchunk(format='dict') == example_reference_dict + refs = vds.virtualize.to_kerchunk(format="dict") + expected_refs = netcdf4_virtual_dataset.virtualize.to_kerchunk(format="dict") + assert refs["refs"]["air/0.0.0"] == expected_refs["refs"]["air/0.0.0"] + assert refs["refs"]["lon/0"] == expected_refs["refs"]["lon/0"] + assert refs["refs"]["lat/0"] == expected_refs["refs"]["lat/0"] + assert refs["refs"]["time/0"] == expected_refs["refs"]["time/0"] + + assert list(vds) == list(netcdf4_virtual_dataset) + assert set(vds.coords) == set(netcdf4_virtual_dataset.coords) + assert set(vds.variables) == set(netcdf4_virtual_dataset.variables) + + +@requires_kerchunk +def test_notimplemented_read_inline_refs(tmp_path, netcdf4_inlined_ref): + # For now, we raise a NotImplementedError if we read existing references that have inlined data + # https://github.com/zarr-developers/VirtualiZarr/pull/251#pullrequestreview-2361916932 + + ref_filepath = tmp_path / "ref.json" + + import ujson + + with open(ref_filepath, "w") as json_file: + ujson.dump(netcdf4_inlined_ref, json_file) + + with pytest.raises(NotImplementedError): + open_virtual_dataset( + filepath=ref_filepath.as_posix(), filetype="kerchunk", indexes={} + ) diff --git a/virtualizarr/tests/test_integration.py b/virtualizarr/tests/test_integration.py index 2e612de9..c9e3e302 100644 --- a/virtualizarr/tests/test_integration.py +++ b/virtualizarr/tests/test_integration.py @@ -4,8 +4,53 @@ import xarray.testing as xrt from virtualizarr import open_virtual_dataset +from virtualizarr.manifests import ChunkManifest, ManifestArray +from virtualizarr.tests import requires_kerchunk +from virtualizarr.translators.kerchunk import ( + dataset_from_kerchunk_refs, + find_var_names, +) +from virtualizarr.zarr import ZArray + + +def test_kerchunk_roundtrip_in_memory_no_concat(): + # Set up example xarray dataset + chunks_dict = { + "0.0": {"path": "foo.nc", "offset": 100, "length": 100}, + "0.1": {"path": "foo.nc", "offset": 200, "length": 100}, + } + manifest = ChunkManifest(entries=chunks_dict) + marr = ManifestArray( + zarray=dict( + shape=(2, 4), + dtype=np.dtype(" Dataset: + arr = ManifestArray( + chunkmanifest=ChunkManifest( + entries={"0.0": dict(path="test.nc", offset=6144, length=48)} + ), + zarray=dict( + shape=(2, 3), + dtype=np.dtype(" bool: + """ + Several metadata attributes in ZarrV3 use a dictionary with keys "name" : str and "configuration" : dict + """ + return "name" in value and "configuration" in value + + +def test_zarr_v3_metadata_conformance(tmpdir, vds_with_manifest_arrays: Dataset): + """ + Checks that the output metadata of an array variable conforms to this spec + for the required attributes: + https://zarr-specs.readthedocs.io/en/latest/v3/core/v3.0.html#metadata + """ + dataset_to_zarr(vds_with_manifest_arrays, tmpdir / "store.zarr") + # read the a variable's metadata + with open(tmpdir / "store.zarr/a/zarr.json", mode="r") as f: + metadata = json.loads(f.read()) + assert metadata["zarr_format"] == 3 + assert metadata["node_type"] == "array" + assert isinstance(metadata["shape"], list) and all( + isinstance(dim, int) for dim in metadata["shape"] + ) + assert isinstance(metadata["data_type"], str) or isconfigurable( + metadata["data_type"] + ) + assert isconfigurable(metadata["chunk_grid"]) + assert isconfigurable(metadata["chunk_key_encoding"]) + assert isinstance(metadata["fill_value"], (bool, int, float, str, list)) + assert ( + 
isinstance(metadata["codecs"], list) + and len(metadata["codecs"]) > 1 + and all(isconfigurable(codec) for codec in metadata["codecs"]) + ) + + +def test_zarr_v3_roundtrip(tmpdir, vds_with_manifest_arrays: Dataset): + vds_with_manifest_arrays.virtualize.to_zarr(tmpdir / "store.zarr") + roundtrip = open_virtual_dataset( + tmpdir / "store.zarr", filetype=FileType.zarr_v3, indexes={} + ) + + xrt.assert_identical(roundtrip, vds_with_manifest_arrays) + + +def test_metadata_roundtrip(tmpdir, vds_with_manifest_arrays: Dataset): + dataset_to_zarr(vds_with_manifest_arrays, tmpdir / "store.zarr") + zarray, _, _ = metadata_from_zarr_json(tmpdir / "store.zarr/a/zarr.json") + assert zarray == vds_with_manifest_arrays.a.data.zarray diff --git a/virtualizarr/tests/test_xarray.py b/virtualizarr/tests/test_xarray.py index d0fe2e3b..062eda5f 100644 --- a/virtualizarr/tests/test_xarray.py +++ b/virtualizarr/tests/test_xarray.py @@ -1,15 +1,10 @@ -from collections.abc import Mapping -from unittest.mock import patch - import numpy as np import pytest import xarray as xr -import xarray.testing as xrt -from xarray.core.indexes import Index from virtualizarr import open_virtual_dataset from virtualizarr.manifests import ChunkManifest, ManifestArray -from virtualizarr.tests import has_astropy, has_tifffile, network, requires_s3fs +from virtualizarr.tests import requires_kerchunk from virtualizarr.zarr import ZArray @@ -19,7 +14,7 @@ def test_wrapping(): dtype = np.dtype("int32") zarray = ZArray( chunks=chunks, - compressor="zlib", + compressor={"id": "zlib", "level": 1}, dtype=dtype, fill_value=0.0, filters=None, @@ -49,7 +44,7 @@ def test_equals(self): shape = (5, 20) zarray = ZArray( chunks=chunks, - compressor="zlib", + compressor={"id": "zlib", "level": 1}, dtype=np.dtype("int32"), fill_value=0.0, filters=None, @@ -86,7 +81,7 @@ def test_concat_along_existing_dim(self): # both manifest arrays in this example have the same zarray properties zarray = ZArray( chunks=(1, 10), - compressor="zlib", + compressor={"id": "zlib", "level": 1}, dtype=np.dtype("int32"), fill_value=0.0, filters=None, @@ -133,7 +128,7 @@ def test_concat_along_new_dim(self): # both manifest arrays in this example have the same zarray properties zarray = ZArray( chunks=(5, 10), - compressor="zlib", + compressor={"id": "zlib", "level": 1}, dtype=np.dtype("int32"), fill_value=0.0, filters=None, @@ -183,7 +178,7 @@ def test_concat_dim_coords_along_existing_dim(self): # both manifest arrays in this example have the same zarray properties zarray = ZArray( chunks=(10,), - compressor="zlib", + compressor={"id": "zlib", "level": 1}, dtype=np.dtype("int32"), fill_value=0.0, filters=None, @@ -228,53 +223,7 @@ def test_concat_dim_coords_along_existing_dim(self): assert result.data.zarray.zarr_format == zarray.zarr_format -class TestOpenVirtualDatasetAttrs: - def test_drop_array_dimensions(self, netcdf4_file): - # regression test for GH issue #150 - vds = open_virtual_dataset(netcdf4_file, indexes={}) - assert "_ARRAY_DIMENSIONS" not in vds["air"].attrs - - def test_coordinate_variable_attrs_preserved(self, netcdf4_file): - # regression test for GH issue #155 - vds = open_virtual_dataset(netcdf4_file, indexes={}) - assert vds["lat"].attrs == { - "standard_name": "latitude", - "long_name": "Latitude", - "units": "degrees_north", - "axis": "Y", - } - - -class TestOpenVirtualDatasetIndexes: - def test_no_indexes(self, netcdf4_file): - vds = open_virtual_dataset(netcdf4_file, indexes={}) - assert vds.indexes == {} - - def test_create_default_indexes(self, 
netcdf4_file): - with pytest.warns(UserWarning, match="will create in-memory pandas indexes"): - vds = open_virtual_dataset(netcdf4_file, indexes=None) - ds = xr.open_dataset(netcdf4_file, decode_times=False) - - # TODO use xr.testing.assert_identical(vds.indexes, ds.indexes) instead once class supported by assertion comparison, see https://github.com/pydata/xarray/issues/5812 - assert index_mappings_equal(vds.xindexes, ds.xindexes) - - -def index_mappings_equal(indexes1: Mapping[str, Index], indexes2: Mapping[str, Index]): - # Check if the mappings have the same keys - if set(indexes1.keys()) != set(indexes2.keys()): - return False - - # Check if the values for each key are identical - for key in indexes1.keys(): - index1 = indexes1[key] - index2 = indexes2[key] - - if not index1.equals(index2): - return False - - return True - - +@requires_kerchunk class TestCombineUsingIndexes: def test_combine_by_coords(self, netcdf4_files): filepath1, filepath2 = netcdf4_files @@ -308,118 +257,7 @@ def test_combine_by_coords_keeping_manifestarrays(self, netcdf4_files): assert isinstance(combined_vds["lon"].data, ManifestArray) -@network -@requires_s3fs -class TestReadFromS3: - @pytest.mark.parametrize( - "filetype", ["netcdf4", None], ids=["netcdf4 filetype", "None filetype"] - ) - @pytest.mark.parametrize( - "indexes", [None, {}], ids=["None index", "empty dict index"] - ) - def test_anon_read_s3(self, filetype, indexes): - """Parameterized tests for empty vs supplied indexes and filetypes.""" - # TODO: Switch away from this s3 url after minIO is implemented. - fpath = "s3://carbonplan-share/virtualizarr/local.nc" - vds = open_virtual_dataset(fpath, filetype=filetype, indexes=indexes) - - assert vds.dims == {"time": 2920, "lat": 25, "lon": 53} - for var in vds.variables: - assert isinstance(vds[var].data, ManifestArray), var - - -@network -class TestReadFromURL: - @pytest.mark.parametrize( - "filetype, url", - [ - ( - "grib", - "https://github.com/pydata/xarray-data/raw/master/era5-2mt-2019-03-uk.grib", - ), - ( - "netcdf3", - "https://github.com/pydata/xarray-data/raw/master/air_temperature.nc", - ), - ( - "netcdf4", - "https://github.com/pydata/xarray-data/raw/master/ROMS_example.nc", - ), - ( - "hdf4", - "https://github.com/corteva/rioxarray/raw/master/test/test_data/input/MOD09GA.A2008296.h14v17.006.2015181011753.hdf", - ), - # https://github.com/zarr-developers/VirtualiZarr/issues/159 - # ("hdf5", "https://github.com/fsspec/kerchunk/raw/main/kerchunk/tests/NEONDSTowerTemperatureData.hdf5"), - pytest.param( - "tiff", - "https://github.com/fsspec/kerchunk/raw/main/kerchunk/tests/lcmap_tiny_cog_2020.tif", - marks=pytest.mark.skipif( - not has_tifffile, reason="package tifffile is not available" - ), - ), - pytest.param( - "fits", - "https://fits.gsfc.nasa.gov/samples/WFPC2u5780205r_c0fx.fits", - marks=pytest.mark.skipif( - not has_astropy, reason="package astropy is not available" - ), - ), - ( - "jpg", - "https://github.com/rasterio/rasterio/raw/main/tests/data/389225main_sw_1965_1024.jpg", - ), - ], - ) - def test_read_from_url(self, filetype, url): - if filetype in ["grib", "jpg", "hdf4"]: - with pytest.raises(NotImplementedError): - vds = open_virtual_dataset(url, reader_options={}, indexes={}) - else: - vds = open_virtual_dataset(url, reader_options={}, indexes={}) - assert isinstance(vds, xr.Dataset) - - -class TestLoadVirtualDataset: - def test_loadable_variables(self, netcdf4_file): - vars_to_load = ["air", "time"] - vds = open_virtual_dataset( - netcdf4_file, 
loadable_variables=vars_to_load, indexes={} - ) - - for name in vds.variables: - if name in vars_to_load: - assert isinstance(vds[name].data, np.ndarray), name - else: - assert isinstance(vds[name].data, ManifestArray), name - - full_ds = xr.open_dataset(netcdf4_file, decode_times=False) - - for name in full_ds.variables: - if name in vars_to_load: - xrt.assert_identical(vds.variables[name], full_ds.variables[name]) - - def test_explicit_filetype(self, netcdf4_file): - with pytest.raises(ValueError): - open_virtual_dataset(netcdf4_file, filetype="unknown") - - with pytest.raises(NotImplementedError): - open_virtual_dataset(netcdf4_file, filetype="grib") - - @patch("virtualizarr.kerchunk.read_kerchunk_references_from_file") - def test_open_virtual_dataset_passes_expected_args( - self, mock_read_kerchunk, netcdf4_file - ): - reader_options = {"option1": "value1", "option2": "value2"} - open_virtual_dataset(netcdf4_file, indexes={}, reader_options=reader_options) - args = { - "filepath": netcdf4_file, - "filetype": None, - "reader_options": reader_options, - } - mock_read_kerchunk.assert_called_once_with(**args) - - +@requires_kerchunk class TestRenamePaths: def test_rename_to_str(self, netcdf4_file): vds = open_virtual_dataset(netcdf4_file, indexes={}) @@ -462,10 +300,3 @@ def test_mixture_of_manifestarrays_and_numpy_arrays(self, netcdf4_file): == "s3://bucket/air.nc" ) assert isinstance(renamed_vds["lat"].data, np.ndarray) - - -def test_cftime_variables_must_be_in_loadable_variables(tmpdir): - ds = xr.Dataset(data_vars={"time": ["2024-06-21"]}) - ds.to_netcdf(f"{tmpdir}/scalar.nc") - with pytest.raises(ValueError, match="'time' not in"): - open_virtual_dataset(f"{tmpdir}/scalar.nc", cftime_variables=["time"]) diff --git a/virtualizarr/tests/test_zarr.py b/virtualizarr/tests/test_zarr.py index 80d04b9c..95dbf55f 100644 --- a/virtualizarr/tests/test_zarr.py +++ b/virtualizarr/tests/test_zarr.py @@ -1,32 +1,29 @@ import numpy as np -import xarray as xr -import xarray.testing as xrt -from virtualizarr import ManifestArray, open_virtual_dataset -from virtualizarr.manifests.manifest import ChunkManifest +from virtualizarr.zarr import ZArray -def test_zarr_v3_roundtrip(tmpdir): - arr = ManifestArray( - chunkmanifest=ChunkManifest( - entries={"0.0": dict(path="test.nc", offset=6144, length=48)} - ), - zarray=dict( - shape=(2, 3), - dtype=np.dtype(" tuple[Mapping[str, Variable], dict[str, Any], list[str]]: + """ + Parses all useful information from a set kerchunk references (for a single group). + """ + + virtual_vars = virtual_vars_from_kerchunk_refs( + vds_refs, + drop_variables=drop_variables + loadable_variables, + virtual_array_class=virtual_array_class, + ) + ds_attrs = fully_decode_arr_refs(vds_refs["refs"]).get(".zattrs", {}) + coord_names = ds_attrs.pop("coordinates", []) + + return virtual_vars, ds_attrs, coord_names + + +def extract_group(vds_refs: KerchunkStoreRefs, group: str | None) -> KerchunkStoreRefs: + """Extract only the part of the kerchunk reference dict that is relevant to a single HDF group""" + hdf_groups = [ + k.removesuffix(".zgroup") for k in vds_refs["refs"].keys() if ".zgroup" in k + ] + if len(hdf_groups) == 1: + return vds_refs + else: + if group is None: + raise ValueError( + f"Multiple HDF Groups found. 
Must specify group= keyword to select one of {hdf_groups}" + ) + else: + # Ensure supplied group kwarg is consistent with kerchunk keys + if not group.endswith("/"): + group += "/" + if group.startswith("/"): + group = group.removeprefix("/") + + if group not in hdf_groups: + raise ValueError(f'Group "{group}" not found in {hdf_groups}') + + # Filter by group prefix and remove prefix from all keys + groupdict = { + k.removeprefix(group): v + for k, v in vds_refs["refs"].items() + if k.startswith(group) + } + # Also remove group prefix from _ARRAY_DIMENSIONS + for k, v in groupdict.items(): + if isinstance(v, str): + groupdict[k] = v.replace("\\/", "/").replace(group, "") + + vds_refs["refs"] = groupdict + + return KerchunkStoreRefs(vds_refs) + + +def virtual_vars_from_kerchunk_refs( + refs: KerchunkStoreRefs, + drop_variables: list[str] | None = None, + virtual_array_class=ManifestArray, +) -> dict[str, Variable]: + """ + Translate a store-level kerchunk reference dict into aaset of xarray Variables containing virtualized arrays. + + Parameters + ---------- + drop_variables: list[str], default is None + Variables in the file to drop before returning. + virtual_array_class + Virtual array class to use to represent the references to the chunks in each on-disk array. + Currently can only be ManifestArray, but once VirtualZarrArray is implemented the default should be changed to that. + """ + + var_names = find_var_names(refs) + if drop_variables is None: + drop_variables = [] + var_names_to_keep = [ + var_name for var_name in var_names if var_name not in drop_variables + ] + + vars = { + var_name: variable_from_kerchunk_refs(refs, var_name, virtual_array_class) + for var_name in var_names_to_keep + } + return vars + + +def dataset_from_kerchunk_refs( + refs: KerchunkStoreRefs, + drop_variables: list[str] = [], + virtual_array_class: type = ManifestArray, + indexes: MutableMapping[str, Index] | None = None, +) -> Dataset: + """ + Translate a store-level kerchunk reference dict into an xarray Dataset containing virtualized arrays. + + drop_variables: list[str], default is None + Variables in the file to drop before returning. + virtual_array_class + Virtual array class to use to represent the references to the chunks in each on-disk array. + Currently can only be ManifestArray, but once VirtualZarrArray is implemented the default should be changed to that. 
+ """ + + vars = virtual_vars_from_kerchunk_refs(refs, drop_variables, virtual_array_class) + ds_attrs = fully_decode_arr_refs(refs["refs"]).get(".zattrs", {}) + coord_names = ds_attrs.pop("coordinates", []) + + if indexes is None: + indexes = {} + data_vars, coords = separate_coords(vars, indexes, coord_names) + + vds = Dataset( + data_vars, + coords=coords, + # indexes={}, # TODO should be added in a later version of xarray + attrs=ds_attrs, + ) + + return vds + + +def variable_from_kerchunk_refs( + refs: KerchunkStoreRefs, var_name: str, virtual_array_class +) -> Variable: + """Create a single xarray Variable by reading specific keys of a kerchunk references dict.""" + + arr_refs = extract_array_refs(refs, var_name) + chunk_dict, zarray, zattrs = parse_array_refs(arr_refs) + # we want to remove the _ARRAY_DIMENSIONS from the final variables' .attrs + dims = zattrs.pop("_ARRAY_DIMENSIONS") + if chunk_dict: + manifest = ChunkManifest._from_kerchunk_chunk_dict(chunk_dict) + varr = virtual_array_class(zarray=zarray, chunkmanifest=manifest) + elif len(zarray.shape) != 0: + # empty variables don't have physical chunks, but zarray shows that the variable + # is at least 1D + shape = determine_chunk_grid_shape(zarray.shape, zarray.chunks) + manifest = ChunkManifest(entries={}, shape=shape) + varr = virtual_array_class(zarray=zarray, chunkmanifest=manifest) + else: + # This means we encountered a scalar variable of dimension 0, + # very likely that it actually has no numeric value and its only purpose + # is to communicate dataset attributes. + varr = zarray.fill_value + + return Variable(data=varr, dims=dims, attrs=zattrs) + + +def find_var_names(ds_reference_dict: KerchunkStoreRefs) -> list[str]: + """Find the names of zarr variables in this store/group.""" + + refs = ds_reference_dict["refs"] + found_var_names = {key.split("/")[0] for key in refs.keys() if "/" in key} + + return list(found_var_names) + + +def extract_array_refs( + ds_reference_dict: KerchunkStoreRefs, var_name: str +) -> KerchunkArrRefs: + """Extract only the part of the kerchunk reference dict that is relevant to this one zarr array""" + + found_var_names = find_var_names(ds_reference_dict) + + refs = ds_reference_dict["refs"] + if var_name in found_var_names: + # TODO these function probably have more loops in them than they need to... + + arr_refs = { + key.split("/")[1]: refs[key] + for key in refs.keys() + if var_name == key.split("/")[0] + } + + return fully_decode_arr_refs(arr_refs) + + else: + raise KeyError( + f"Could not find zarr array variable name {var_name}, only {found_var_names}" + ) + + +def parse_array_refs( + arr_refs: KerchunkArrRefs, +) -> tuple[dict, ZArray, ZAttrs]: + zarray = ZArray.from_kerchunk_refs(arr_refs.pop(".zarray")) + zattrs = arr_refs.pop(".zattrs", {}) + chunk_dict = arr_refs + + return chunk_dict, zarray, zattrs + + +def fully_decode_arr_refs(d: dict) -> KerchunkArrRefs: + """ + Only have to do this because kerchunk.SingleHdf5ToZarr apparently doesn't bother converting .zarray and .zattrs contents to dicts, see https://github.com/fsspec/kerchunk/issues/415 . 
+ """ + import ujson + + sanitized = d.copy() + for k, v in d.items(): + if k.startswith("."): + # ensure contents of .zattrs and .zarray are python dictionaries + sanitized[k] = ujson.loads(v) + + return cast(KerchunkArrRefs, sanitized) diff --git a/virtualizarr/types/__init__.py b/virtualizarr/types/__init__.py new file mode 100644 index 00000000..34cd4bde --- /dev/null +++ b/virtualizarr/types/__init__.py @@ -0,0 +1,3 @@ +from virtualizarr.types.general import ChunkKey # type: ignore[F401] + +__all__ = ["ChunkKey"] diff --git a/virtualizarr/types.py b/virtualizarr/types/general.py similarity index 100% rename from virtualizarr/types.py rename to virtualizarr/types/general.py diff --git a/virtualizarr/types/kerchunk.py b/virtualizarr/types/kerchunk.py new file mode 100644 index 00000000..d124cca3 --- /dev/null +++ b/virtualizarr/types/kerchunk.py @@ -0,0 +1,13 @@ +from typing import NewType + +# Distinguishing these via type hints makes it a lot easier to mentally keep track of what the opaque kerchunk "reference dicts" actually mean +# (idea from https://kobzol.github.io/rust/python/2023/05/20/writing-python-like-its-rust.html) +# TODO I would prefer to be more specific about these types +KerchunkStoreRefs = NewType( + "KerchunkStoreRefs", + dict, # dict_keys(['version', 'refs']) +) # top-level dict containing kerchunk version and 'refs' dictionary which assumes single '.zgroup' key and multiple KerchunkArrRefs +KerchunkArrRefs = NewType( + "KerchunkArrRefs", + dict, # dict_keys(['.zarray', '.zattrs', '0.0', '0.1', ...) +) # lower-level dict defining a single Zarr Array, with keys for '.zarray', '.zattrs', and every chunk diff --git a/virtualizarr/utils.py b/virtualizarr/utils.py index 4899d41d..c9260aa6 100644 --- a/virtualizarr/utils.py +++ b/virtualizarr/utils.py @@ -1,7 +1,7 @@ from __future__ import annotations import io -from typing import TYPE_CHECKING, Optional, Union +from typing import TYPE_CHECKING, Iterable, Optional, Union if TYPE_CHECKING: import fsspec.core @@ -13,49 +13,76 @@ ] -def _fsspec_openfile_from_filepath( - *, - filepath: str, - reader_options: Optional[dict] = {}, -) -> OpenFileType: - """Converts input filepath to fsspec openfile object. +from dataclasses import dataclass, field + + +@dataclass +class _FsspecFSFromFilepath: + """Class to create fsspec Filesystem from input filepath. Parameters ---------- filepath : str Input filepath - reader_options : _type_, optional - Dict containing kwargs to pass to file opener, by default {'storage_options':{'key':'', 'secret':'', 'anon':True}} - - Returns - ------- - OpenFileType - An open file-like object, specific to the protocol supplied in filepath. - - Raises - ------ - NotImplementedError - Raises a Not Implemented Error if filepath protocol is not supported. + reader_options : dict, optional + dict containing kwargs to pass to file opener, by default {} + fs : Option | None + The fsspec filesystem object, created in __post_init__ + """ - import fsspec - from upath import UPath + filepath: str + reader_options: Optional[dict] = field(default_factory=dict) + fs: fsspec.AbstractFileSystem = field(init=False) - universal_filepath = UPath(filepath) - protocol = universal_filepath.protocol + def open_file(self) -> OpenFileType: + """Calls `.open` on fsspec.Filesystem instantiation using self.filepath as an input. 
- if protocol == "s3": - protocol_defaults = {"key": "", "secret": "", "anon": True} - else: - protocol_defaults = {} + Returns + ------- + OpenFileType + file opened with fsspec + """ + return self.fs.open(self.filepath) + + def read_bytes(self, bytes: int) -> bytes: + with self.open_file() as of: + return of.read(bytes) + + def __post_init__(self) -> None: + """Initialize the fsspec filesystem object""" + import fsspec + from upath import UPath - if reader_options is None: - reader_options = {} + universal_filepath = UPath(self.filepath) + protocol = universal_filepath.protocol - storage_options = reader_options.get("storage_options", {}) # type: ignore + self.reader_options = self.reader_options or {} + storage_options = self.reader_options.get("storage_options", {}) # type: ignore + + self.fs = fsspec.filesystem(protocol, **storage_options) + + +def check_for_collisions( + drop_variables: Iterable[str] | None, + loadable_variables: Iterable[str] | None, +) -> tuple[list[str], list[str]]: + if drop_variables is None: + drop_variables = [] + elif isinstance(drop_variables, str): + drop_variables = [drop_variables] + else: + drop_variables = list(drop_variables) + + if loadable_variables is None: + loadable_variables = [] + elif isinstance(loadable_variables, str): + loadable_variables = [loadable_variables] + else: + loadable_variables = list(loadable_variables) - # using dict merge operator to add in defaults if keys are not specified - storage_options = protocol_defaults | storage_options - fpath = fsspec.filesystem(protocol, **storage_options).open(filepath) + common = set(drop_variables).intersection(set(loadable_variables)) + if common: + raise ValueError(f"Cannot both load and drop variables {common}") - return fpath + return drop_variables, loadable_variables diff --git a/virtualizarr/writers/__init__.py b/virtualizarr/writers/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/virtualizarr/writers/kerchunk.py b/virtualizarr/writers/kerchunk.py new file mode 100644 index 00000000..3a0bd27b --- /dev/null +++ b/virtualizarr/writers/kerchunk.py @@ -0,0 +1,125 @@ +import base64 +import json +from typing import cast + +import numpy as np +from xarray import Dataset +from xarray.coding.times import CFDatetimeCoder +from xarray.core.variable import Variable + +from virtualizarr.manifests.manifest import join +from virtualizarr.types.kerchunk import KerchunkArrRefs, KerchunkStoreRefs +from virtualizarr.zarr import ZArray + + +class NumpyEncoder(json.JSONEncoder): + # TODO I don't understand how kerchunk gets around this problem of encoding numpy types (in the zattrs) whilst only using ujson + def default(self, obj): + if isinstance(obj, np.ndarray): + return obj.tolist() # Convert NumPy array to Python list + elif isinstance(obj, np.generic): + return obj.item() # Convert NumPy scalar to Python scalar + elif isinstance(obj, np.dtype): + return str(obj) + return json.JSONEncoder.default(self, obj) + + +def dataset_to_kerchunk_refs(ds: Dataset) -> KerchunkStoreRefs: + """ + Create a dictionary containing kerchunk-style store references from a single xarray.Dataset (which wraps ManifestArray objects). 
+ """ + + import ujson + + all_arr_refs = {} + for var_name, var in ds.variables.items(): + arr_refs = variable_to_kerchunk_arr_refs(var, str(var_name)) + + prepended_with_var_name = { + f"{var_name}/{key}": val for key, val in arr_refs.items() + } + + all_arr_refs.update(prepended_with_var_name) + + zattrs = ds.attrs + if ds.coords: + coord_names = [str(x) for x in ds.coords] + # this weird concatenated string instead of a list of strings is inconsistent with how other features in the kerchunk references format are stored + # see https://github.com/zarr-developers/VirtualiZarr/issues/105#issuecomment-2187266739 + zattrs["coordinates"] = " ".join(coord_names) + + ds_refs = { + "version": 1, + "refs": { + ".zgroup": '{"zarr_format":2}', + ".zattrs": ujson.dumps(zattrs), + **all_arr_refs, + }, + } + + return cast(KerchunkStoreRefs, ds_refs) + + +def variable_to_kerchunk_arr_refs(var: Variable, var_name: str) -> KerchunkArrRefs: + """ + Create a dictionary containing kerchunk-style array references from a single xarray.Variable (which wraps either a ManifestArray or a numpy array). + + Partially encodes the inner dicts to json to match kerchunk behaviour (see https://github.com/fsspec/kerchunk/issues/415). + """ + from virtualizarr.manifests import ManifestArray + + if isinstance(var.data, ManifestArray): + marr = var.data + + arr_refs: dict[str, str | list[str | int]] = { + str(chunk_key): [entry["path"], entry["offset"], entry["length"]] + for chunk_key, entry in marr.manifest.dict().items() + } + + zarray = marr.zarray.replace(zarr_format=2) + + else: + try: + np_arr = var.to_numpy() + except AttributeError as e: + raise TypeError( + f"Can only serialize wrapped arrays of type ManifestArray or numpy.ndarray, but got type {type(var.data)}" + ) from e + + if var.encoding: + if "scale_factor" in var.encoding: + raise NotImplementedError( + f"Cannot serialize loaded variable {var_name}, as it is encoded with a scale_factor" + ) + if "offset" in var.encoding: + raise NotImplementedError( + f"Cannot serialize loaded variable {var_name}, as it is encoded with an offset" + ) + if "calendar" in var.encoding: + np_arr = CFDatetimeCoder().encode(var.copy(), name=var_name).values + + # This encoding is what kerchunk does when it "inlines" data, see https://github.com/fsspec/kerchunk/blob/a0c4f3b828d37f6d07995925b324595af68c4a19/kerchunk/hdf.py#L472 + byte_data = np_arr.tobytes() + # TODO do I really need to encode then decode like this? + inlined_data = (b"base64:" + base64.b64encode(byte_data)).decode("utf-8") + + # TODO can this be generalized to save individual chunks of a dask array? + # TODO will this fail for a scalar? 
+ arr_refs = {join(0 for _ in np_arr.shape): inlined_data} + + zarray = ZArray( + chunks=np_arr.shape, + shape=np_arr.shape, + dtype=np_arr.dtype, + order="C", + fill_value=None, + ) + + zarray_dict = zarray.to_kerchunk_json() + arr_refs[".zarray"] = zarray_dict + + zattrs = {**var.attrs, **var.encoding} + zattrs["_ARRAY_DIMENSIONS"] = list(var.dims) + arr_refs[".zattrs"] = json.dumps(zattrs, separators=(",", ":"), cls=NumpyEncoder) + + return cast(KerchunkArrRefs, arr_refs) diff --git a/virtualizarr/writers/zarr.py b/virtualizarr/writers/zarr.py new file mode 100644 index 00000000..b3dc8f1a --- /dev/null +++ b/virtualizarr/writers/zarr.py @@ -0,0 +1,115 @@ +from pathlib import Path + +import numpy as np +from xarray import Dataset +from xarray.core.variable import Variable + +from virtualizarr.vendor.zarr.utils import json_dumps +from virtualizarr.zarr import ZArray + + +def dataset_to_zarr(ds: Dataset, storepath: str) -> None: + """ + Write an xarray dataset whose variables wrap ManifestArrays to a v3 Zarr store, writing chunk references into manifest.json files. + + Currently requires all variables to be backed by ManifestArray objects. + + Not very useful until some implementation of a Zarr reader can actually read these manifest.json files. + See https://github.com/zarr-developers/zarr-specs/issues/287 + + Parameters + ---------- + ds: xr.Dataset + storepath: str + """ + + from virtualizarr.manifests import ManifestArray + + _storepath = Path(storepath) + Path.mkdir(_storepath, exist_ok=False) + + # should techically loop over groups in a tree but a dataset corresponds to only one group + group_metadata = {"zarr_format": 3, "node_type": "group", "attributes": ds.attrs} + with open(_storepath / "zarr.json", "wb") as group_metadata_file: + group_metadata_file.write(json_dumps(group_metadata)) + + for name, var in ds.variables.items(): + array_dir = _storepath / str(name) + marr = var.data + + # TODO move this check outside the writing loop so we don't write an incomplete store on failure? + # TODO at some point this should be generalized to also write in-memory arrays as normal zarr chunks, see GH isse #62. + if not isinstance(marr, ManifestArray): + raise TypeError( + "Only xarray objects wrapping ManifestArrays can be written to zarr using this method, " + f"but variable {name} wraps an array of type {type(marr)}" + ) + + Path.mkdir(array_dir, exist_ok=False) + + # write the chunk references into a manifest.json file + # and the array metadata into a zarr.json file + to_zarr_json(var, array_dir) + + +def to_zarr_json(var: Variable, array_dir: Path) -> None: + """ + Write out both the zarr.json and manifest.json file into the given zarr array directory. + + Follows the Zarr v3 manifest storage transformer ZEP (see https://github.com/zarr-developers/zarr-specs/issues/287). + + Parameters + ---------- + var : xr.Variable + Must be wrapping a ManifestArray + dirpath : str + Zarr store array directory into which to write files. 
+ """ + + marr = var.data + + marr.manifest.to_zarr_json(array_dir / "manifest.json") + + metadata = zarr_v3_array_metadata( + marr.zarray, [str(x) for x in var.dims], var.attrs + ) + with open(array_dir / "zarr.json", "wb") as metadata_file: + metadata_file.write(json_dumps(metadata)) + + +def zarr_v3_array_metadata(zarray: ZArray, dim_names: list[str], attrs: dict) -> dict: + """Construct a v3-compliant metadata dict from v2 zarray + information stored on the xarray variable.""" + # TODO it would be nice if we could use the zarr-python metadata.ArrayMetadata classes to do this conversion for us + + metadata = zarray.dict() + + # adjust to match v3 spec + metadata["zarr_format"] = 3 + metadata["node_type"] = "array" + metadata["data_type"] = str(np.dtype(metadata.pop("dtype"))) + metadata["chunk_grid"] = { + "name": "regular", + "configuration": {"chunk_shape": metadata.pop("chunks")}, + } + metadata["chunk_key_encoding"] = { + "name": "default", + "configuration": {"separator": "/"}, + } + metadata["codecs"] = zarray._v3_codec_pipeline() + metadata.pop("filters") + metadata.pop("compressor") + metadata.pop("order") + + # indicate that we're using the manifest storage transformer ZEP + metadata["storage_transformers"] = [ + { + "name": "chunk-manifest-json", + "configuration": {"manifest": "./manifest.json"}, + } + ] + + # add information from xarray object + metadata["dimension_names"] = dim_names + metadata["attributes"] = attrs + + return metadata diff --git a/virtualizarr/xarray.py b/virtualizarr/xarray.py deleted file mode 100644 index df847173..00000000 --- a/virtualizarr/xarray.py +++ /dev/null @@ -1,532 +0,0 @@ -import warnings -from collections.abc import Iterable, Mapping, MutableMapping -from pathlib import Path -from typing import ( - Callable, - Literal, - Optional, - overload, -) - -import ujson # type: ignore -import xarray as xr -from xarray import register_dataset_accessor -from xarray.backends import BackendArray -from xarray.coding.times import CFDatetimeCoder -from xarray.core.indexes import Index, PandasIndex -from xarray.core.variable import IndexVariable - -import virtualizarr.kerchunk as kerchunk -from virtualizarr.kerchunk import FileType, KerchunkStoreRefs -from virtualizarr.manifests import ChunkManifest, ManifestArray -from virtualizarr.utils import _fsspec_openfile_from_filepath -from virtualizarr.zarr import ( - attrs_from_zarr_group_json, - dataset_to_zarr, - metadata_from_zarr_json, -) - - -class ManifestBackendArray(ManifestArray, BackendArray): - """Using this prevents xarray from wrapping the KerchunkArray in ExplicitIndexingAdapter etc.""" - - ... - - -def open_virtual_dataset( - filepath: str, - *, - filetype: FileType | None = None, - drop_variables: Iterable[str] | None = None, - loadable_variables: Iterable[str] | None = None, - cftime_variables: Iterable[str] | None = None, - indexes: Mapping[str, Index] | None = None, - virtual_array_class=ManifestArray, - reader_options: Optional[dict] = None, -) -> xr.Dataset: - """ - Open a file or store as an xarray Dataset wrapping virtualized zarr arrays. - - No data variables will be loaded unless specified in the ``loadable_variables`` kwarg (in which case they will be xarray lazily indexed arrays). - - Xarray indexes can optionally be created (the default behaviour). To avoid creating any xarray indexes pass ``indexes={}``. - - Parameters - ---------- - filepath : str, default None - File path to open as a set of virtualized zarr arrays. - filetype : FileType, default None - Type of file to be opened. 
Used to determine which kerchunk file format backend to use. - Can be one of {'netCDF3', 'netCDF4', 'HDF', 'TIFF', 'GRIB', 'FITS', 'zarr_v3'}. - If not provided will attempt to automatically infer the correct filetype from header bytes. - drop_variables: list[str], default is None - Variables in the file to drop before returning. - loadable_variables: list[str], default is None - Variables in the file to open as lazy numpy/dask arrays instead of instances of virtual_array_class. - Default is to open all variables as virtual arrays (i.e. ManifestArray). - cftime_variables : list[str], default is None - Interpret the value of specified vars using cftime, returning a datetime. - These will be automatically re-encoded with cftime. This list must be a subset - of ``loadable_variables``. - indexes : Mapping[str, Index], default is None - Indexes to use on the returned xarray Dataset. - Default is None, which will read any 1D coordinate data to create in-memory Pandas indexes. - To avoid creating any indexes, pass indexes={}. - virtual_array_class - Virtual array class to use to represent the references to the chunks in each on-disk array. - Currently can only be ManifestArray, but once VirtualZarrArray is implemented the default should be changed to that. - reader_options: dict, default {'storage_options': {'key': '', 'secret': '', 'anon': True}} - Dict passed into Kerchunk file readers, to allow reading from remote filesystems. - Note: Each Kerchunk file reader has distinct arguments, so ensure reader_options match selected Kerchunk reader arguments. - - Returns - ------- - vds - An xarray Dataset containing instances of virtual_array_cls for each variable, or normal lazily indexed arrays for each variable in loadable_variables. - """ - - if drop_variables is None: - drop_variables = [] - elif isinstance(drop_variables, str): - drop_variables = [drop_variables] - else: - drop_variables = list(drop_variables) - if loadable_variables is None: - loadable_variables = [] - elif isinstance(loadable_variables, str): - loadable_variables = [loadable_variables] - else: - loadable_variables = list(loadable_variables) - common = set(drop_variables).intersection(set(loadable_variables)) - if common: - raise ValueError(f"Cannot both load and drop variables {common}") - - if cftime_variables is None: - cftime_variables = [] - elif isinstance(cftime_variables, str): - cftime_variables = [cftime_variables] - else: - cftime_variables = list(cftime_variables) - - if diff := (set(cftime_variables) - set(loadable_variables)): - missing_str = ", ".join([f"'{v}'" for v in diff]) - raise ValueError( - "All ``cftime_variables`` must be included in ``loadable_variables`` " - f"({missing_str} not in ``loadable_variables``)" - ) - - if virtual_array_class is not ManifestArray: - raise NotImplementedError() - - if filetype == "zarr_v3": - # TODO is there a neat way of auto-detecting this? - return open_virtual_dataset_from_v3_store( - storepath=filepath, drop_variables=drop_variables, indexes=indexes - ) - else: - if reader_options is None: - reader_options = { - "storage_options": {"key": "", "secret": "", "anon": True} - } - - # this is the only place we actually always need to use kerchunk directly - # TODO avoid even reading byte ranges for variables that will be dropped later anyway? 
- vds_refs = kerchunk.read_kerchunk_references_from_file( - filepath=filepath, - filetype=filetype, - reader_options=reader_options, - ) - virtual_vars = virtual_vars_from_kerchunk_refs( - vds_refs, - drop_variables=drop_variables + loadable_variables, - virtual_array_class=virtual_array_class, - ) - ds_attrs = kerchunk.fully_decode_arr_refs(vds_refs["refs"]).get(".zattrs", {}) - coord_names = ds_attrs.pop("coordinates", []) - - if indexes is None or len(loadable_variables) > 0: - # TODO we are reading a bunch of stuff we know we won't need here, e.g. all of the data variables... - # TODO it would also be nice if we could somehow consolidate this with the reading of the kerchunk references - # TODO really we probably want a dedicated xarray backend that iterates over all variables only once - fpath = _fsspec_openfile_from_filepath( - filepath=filepath, reader_options=reader_options - ) - - ds = xr.open_dataset( - fpath, drop_variables=drop_variables, decode_times=False - ) - - if indexes is None: - warnings.warn( - "Specifying `indexes=None` will create in-memory pandas indexes for each 1D coordinate, but concatenation of ManifestArrays backed by pandas indexes is not yet supported (see issue #18)." - "You almost certainly want to pass `indexes={}` to `open_virtual_dataset` instead." - ) - - # add default indexes by reading data from file - indexes = {name: index for name, index in ds.xindexes.items()} - elif indexes != {}: - # TODO allow manual specification of index objects - raise NotImplementedError() - else: - indexes = dict(**indexes) # for type hinting: to allow mutation - - loadable_vars = { - name: var - for name, var in ds.variables.items() - if name in loadable_variables - } - - for name in cftime_variables: - var = loadable_vars[name] - loadable_vars[name] = CFDatetimeCoder().decode(var, name=name) - - # if we only read the indexes we can just close the file right away as nothing is lazy - if loadable_vars == {}: - ds.close() - else: - loadable_vars = {} - indexes = {} - - vars = {**virtual_vars, **loadable_vars} - - data_vars, coords = separate_coords(vars, indexes, coord_names) - - vds = xr.Dataset( - data_vars, - coords=coords, - # indexes={}, # TODO should be added in a later version of xarray - attrs=ds_attrs, - ) - - # TODO we should probably also use vds.set_close() to tell xarray how to close the file we opened - - return vds - - -def open_virtual_dataset_from_v3_store( - storepath: str, - drop_variables: list[str], - indexes: Mapping[str, Index] | None, -) -> xr.Dataset: - """ - Read a Zarr v3 store and return an xarray Dataset containing virtualized arrays. 
- """ - _storepath = Path(storepath) - - ds_attrs = attrs_from_zarr_group_json(_storepath / "zarr.json") - coord_names = ds_attrs.pop("coordinates", []) - - # TODO recursive glob to create a datatree - # Note: this .is_file() check should not be necessary according to the pathlib docs, but tests fail on github CI without it - # see https://github.com/TomNicholas/VirtualiZarr/pull/45#discussion_r1547833166 - all_paths = _storepath.glob("*/") - directory_paths = [p for p in all_paths if not p.is_file()] - - vars = {} - for array_dir in directory_paths: - var_name = array_dir.name - if var_name in drop_variables: - break - - zarray, dim_names, attrs = metadata_from_zarr_json(array_dir / "zarr.json") - manifest = ChunkManifest.from_zarr_json(str(array_dir / "manifest.json")) - - marr = ManifestArray(chunkmanifest=manifest, zarray=zarray) - var = xr.Variable(data=marr, dims=dim_names, attrs=attrs) - vars[var_name] = var - - if indexes is None: - raise NotImplementedError() - elif indexes != {}: - # TODO allow manual specification of index objects - raise NotImplementedError() - else: - indexes = dict(**indexes) # for type hinting: to allow mutation - - data_vars, coords = separate_coords(vars, indexes, coord_names) - - ds = xr.Dataset( - data_vars, - coords=coords, - # indexes={}, # TODO should be added in a later version of xarray - attrs=ds_attrs, - ) - - return ds - - -def virtual_vars_from_kerchunk_refs( - refs: KerchunkStoreRefs, - drop_variables: list[str] | None = None, - virtual_array_class=ManifestArray, -) -> Mapping[str, xr.Variable]: - """ - Translate a store-level kerchunk reference dict into aaset of xarray Variables containing virtualized arrays. - - Parameters - ---------- - drop_variables: list[str], default is None - Variables in the file to drop before returning. - virtual_array_class - Virtual array class to use to represent the references to the chunks in each on-disk array. - Currently can only be ManifestArray, but once VirtualZarrArray is implemented the default should be changed to that. - """ - - var_names = kerchunk.find_var_names(refs) - if drop_variables is None: - drop_variables = [] - var_names_to_keep = [ - var_name for var_name in var_names if var_name not in drop_variables - ] - - vars = { - var_name: variable_from_kerchunk_refs(refs, var_name, virtual_array_class) - for var_name in var_names_to_keep - } - return vars - - -def dataset_from_kerchunk_refs( - refs: KerchunkStoreRefs, - drop_variables: list[str] = [], - virtual_array_class: type = ManifestArray, - indexes: MutableMapping[str, Index] | None = None, -) -> xr.Dataset: - """ - Translate a store-level kerchunk reference dict into an xarray Dataset containing virtualized arrays. - - drop_variables: list[str], default is None - Variables in the file to drop before returning. - virtual_array_class - Virtual array class to use to represent the references to the chunks in each on-disk array. - Currently can only be ManifestArray, but once VirtualZarrArray is implemented the default should be changed to that. 
- """ - - vars = virtual_vars_from_kerchunk_refs(refs, drop_variables, virtual_array_class) - ds_attrs = kerchunk.fully_decode_arr_refs(refs["refs"]).get(".zattrs", {}) - coord_names = ds_attrs.pop("coordinates", []) - - if indexes is None: - indexes = {} - data_vars, coords = separate_coords(vars, indexes, coord_names) - - vds = xr.Dataset( - data_vars, - coords=coords, - # indexes={}, # TODO should be added in a later version of xarray - attrs=ds_attrs, - ) - - return vds - - -def variable_from_kerchunk_refs( - refs: KerchunkStoreRefs, var_name: str, virtual_array_class -) -> xr.Variable: - """Create a single xarray Variable by reading specific keys of a kerchunk references dict.""" - - arr_refs = kerchunk.extract_array_refs(refs, var_name) - chunk_dict, zarray, zattrs = kerchunk.parse_array_refs(arr_refs) - - manifest = ChunkManifest._from_kerchunk_chunk_dict(chunk_dict) - - # we want to remove the _ARRAY_DIMENSIONS from the final variables' .attrs - dims = zattrs.pop("_ARRAY_DIMENSIONS") - - varr = virtual_array_class(zarray=zarray, chunkmanifest=manifest) - - return xr.Variable(data=varr, dims=dims, attrs=zattrs) - - -def separate_coords( - vars: Mapping[str, xr.Variable], - indexes: MutableMapping[str, Index], - coord_names: Iterable[str] | None = None, -) -> tuple[Mapping[str, xr.Variable], xr.Coordinates]: - """ - Try to generate a set of coordinates that won't cause xarray to automatically build a pandas.Index for the 1D coordinates. - - Currently requires this function as a workaround unless xarray PR #8124 is merged. - - Will also preserve any loaded variables and indexes it is passed. - """ - - if coord_names is None: - coord_names = [] - - # split data and coordinate variables (promote dimension coordinates) - data_vars = {} - coord_vars = {} - for name, var in vars.items(): - if name in coord_names or var.dims == (name,): - # use workaround to avoid creating IndexVariables described here https://github.com/pydata/xarray/pull/8107#discussion_r1311214263 - if len(var.dims) == 1: - dim1d, *_ = var.dims - coord_vars[name] = (dim1d, var.data, var.attrs, var.encoding) - - if isinstance(var, IndexVariable): - # unless variable actually already is a loaded IndexVariable, - # in which case we need to keep it and add the corresponding indexes explicitly - coord_vars[name] = var - # TODO this seems suspect - will it handle datetimes? - indexes[name] = PandasIndex(var, dim1d) - else: - coord_vars[name] = var - else: - data_vars[name] = var - - coords = xr.Coordinates(coord_vars, indexes=indexes) - - return data_vars, coords - - -@register_dataset_accessor("virtualize") -class VirtualiZarrDatasetAccessor: - """ - Xarray accessor for writing out virtual datasets to disk. - - Methods on this object are called via `ds.virtualize.{method}`. - """ - - def __init__(self, ds: xr.Dataset): - self.ds: xr.Dataset = ds - - def to_zarr(self, storepath: str) -> None: - """ - Serialize all virtualized arrays in this xarray dataset as a Zarr store. - - Currently requires all variables to be backed by ManifestArray objects. - - Not very useful until some implementation of a Zarr reader can actually read these manifest.json files. - See https://github.com/zarr-developers/zarr-specs/issues/287 - - Parameters - ---------- - storepath : str - """ - dataset_to_zarr(self.ds, storepath) - - @overload - def to_kerchunk( - self, filepath: None, format: Literal["dict"] - ) -> KerchunkStoreRefs: ... - - @overload - def to_kerchunk(self, filepath: str | Path, format: Literal["json"]) -> None: ... 
- - @overload - def to_kerchunk( - self, - filepath: str | Path, - format: Literal["parquet"], - record_size: int = 100_000, - categorical_threshold: int = 10, - ) -> None: ... - - def to_kerchunk( - self, - filepath: str | Path | None = None, - format: Literal["dict", "json", "parquet"] = "dict", - record_size: int = 100_000, - categorical_threshold: int = 10, - ) -> KerchunkStoreRefs | None: - """ - Serialize all virtualized arrays in this xarray dataset into the kerchunk references format. - - Parameters - ---------- - filepath : str, default: None - File path to write kerchunk references into. Not required if format is 'dict'. - format : 'dict', 'json', or 'parquet' - Format to serialize the kerchunk references as. - If 'json' or 'parquet' then the 'filepath' argument is required. - record_size (parquet only): int - Number of references to store in each reference file (default 100,000). Bigger values - mean fewer read requests but larger memory footprint. - categorical_threshold (parquet only) : int - Encode urls as pandas.Categorical to reduce memory footprint if the ratio - of the number of unique urls to total number of refs for each variable - is greater than or equal to this number. (default 10) - - References - ---------- - https://fsspec.github.io/kerchunk/spec.html - """ - refs = kerchunk.dataset_to_kerchunk_refs(self.ds) - - if format == "dict": - return refs - elif format == "json": - if filepath is None: - raise ValueError("Filepath must be provided when format is 'json'") - - with open(filepath, "w") as json_file: - ujson.dump(refs, json_file) - - return None - elif format == "parquet": - from kerchunk.df import refs_to_dataframe - - if isinstance(filepath, Path): - url = str(filepath) - elif isinstance(filepath, str): - url = filepath - - # refs_to_dataframe is responsible for writing to parquet. - # at no point does it create a full in-memory dataframe. - refs_to_dataframe( - refs, - url=url, - record_size=record_size, - categorical_threshold=categorical_threshold, - ) - return None - else: - raise ValueError(f"Unrecognized output format: {format}") - - def rename_paths( - self, - new: str | Callable[[str], str], - ) -> xr.Dataset: - """ - Rename paths to chunks in every ManifestArray in this dataset. - - Accepts either a string, in which case this new path will be used for all chunks, or - a function which accepts the old path and returns the new path. - - Parameters - ---------- - new - New path to use for all chunks, either as a string, or as a function which accepts and returns strings. - - Returns - ------- - Dataset - - Examples - -------- - Rename paths to reflect moving the referenced files from local storage to an S3 bucket. - - >>> def local_to_s3_url(old_local_path: str) -> str: - ... from pathlib import Path - ... - ... new_s3_bucket_url = "http://s3.amazonaws.com/my_bucket/" - ... - ... filename = Path(old_local_path).name - ... 
return str(new_s3_bucket_url / filename) - - >>> ds.virtualize.rename_paths(local_to_s3_url) - - See Also - -------- - ManifestArray.rename_paths - ChunkManifest.rename_paths - """ - - new_ds = self.ds.copy() - for var_name in new_ds.variables: - data = new_ds[var_name].data - if isinstance(data, ManifestArray): - new_ds[var_name].data = data.rename_paths(new=new) - - return new_ds diff --git a/virtualizarr/zarr.py b/virtualizarr/zarr.py index 545a86fc..4b3fdd53 100644 --- a/virtualizarr/zarr.py +++ b/virtualizarr/zarr.py @@ -1,19 +1,7 @@ -import json -from pathlib import Path -from typing import ( - TYPE_CHECKING, - Any, - Literal, - NewType, - Optional, -) +import dataclasses +from typing import TYPE_CHECKING, Any, Literal, NewType, cast import numpy as np -import ujson # type: ignore -import xarray as xr -from pydantic import BaseModel, ConfigDict, field_validator - -from virtualizarr.vendor.zarr.utils import json_dumps if TYPE_CHECKING: pass @@ -22,41 +10,46 @@ ZAttrs = NewType( "ZAttrs", dict[str, Any] ) # just the .zattrs (for one array or for the whole store/group) - - -class Codec(BaseModel): - compressor: str | None = None +FillValueT = bool | str | float | int | list | None +ZARR_FORMAT = Literal[2, 3] + +ZARR_DEFAULT_FILL_VALUE: dict[str, FillValueT] = { + # numpy dtypes's hierarchy lets us avoid checking for all the widths + # https://numpy.org/doc/stable/reference/arrays.scalars.html + np.dtype("bool").kind: False, + np.dtype("int").kind: 0, + np.dtype("float").kind: 0.0, + np.dtype("complex").kind: [0.0, 0.0], + np.dtype("datetime64").kind: 0, +} +""" +The value and format of the fill_value depend on the `data_type` of the array. +See here for spec: +https://zarr-specs.readthedocs.io/en/latest/v3/core/v3.0.html#fill-value +""" + + +@dataclasses.dataclass +class Codec: + compressor: dict | None = None filters: list[dict] | None = None - def __repr__(self) -> str: - return f"Codec(compressor={self.compressor}, filters={self.filters})" - -class ZArray(BaseModel): +@dataclasses.dataclass +class ZArray: """Just the .zarray information""" # TODO will this work for V3? - model_config = ConfigDict( - arbitrary_types_allowed=True, # only here so pydantic doesn't complain about the numpy dtype field - ) - + shape: tuple[int, ...] chunks: tuple[int, ...] - compressor: str | None = None dtype: np.dtype - fill_value: float | int | None = np.nan # float or int? + fill_value: FillValueT = dataclasses.field(default=None) + order: Literal["C", "F"] = "C" + compressor: dict | None = None filters: list[dict] | None = None - order: Literal["C", "F"] - shape: tuple[int, ...] 
zarr_format: Literal[2, 3] = 2 - @field_validator("dtype") - @classmethod - def validate_dtype(cls, dtype) -> np.dtype: - # Your custom validation logic here - # Convert numpy.dtype to a format suitable for Pydantic - return np.dtype(dtype) - def __post_init__(self) -> None: if len(self.shape) != len(self.chunks): raise ValueError( @@ -64,14 +57,18 @@ def __post_init__(self) -> None: f"Array shape {self.shape} has ndim={self.shape} but chunk shape {self.chunks} has ndim={len(self.chunks)}" ) + if isinstance(self.dtype, str): + # Convert dtype string to numpy.dtype + self.dtype = np.dtype(self.dtype) + + if self.fill_value is None: + self.fill_value = ZARR_DEFAULT_FILL_VALUE.get(self.dtype.kind, 0.0) + @property def codec(self) -> Codec: """For comparison against other arrays.""" return Codec(compressor=self.compressor, filters=self.filters) - def __repr__(self) -> str: - return f"ZArray(shape={self.shape}, chunks={self.chunks}, dtype={self.dtype}, compressor={self.compressor}, filters={self.filters}, fill_value={self.fill_value})" - @classmethod def from_kerchunk_refs(cls, decoded_arr_refs_zarray) -> "ZArray": # coerce type of fill_value as kerchunk can be inconsistent with this @@ -80,10 +77,9 @@ def from_kerchunk_refs(cls, decoded_arr_refs_zarray) -> "ZArray": fill_value = np.nan compressor = decoded_arr_refs_zarray["compressor"] - # deal with an inconsistency in kerchunk's tiff_to_zarr function - # TODO should this be moved to the point where we actually call tiff_to_zarr? Or ideally made consistent upstream. - if compressor is not None and "id" in compressor: - compressor = compressor["id"] + zarr_format = int(decoded_arr_refs_zarray["zarr_format"]) + if zarr_format not in (2, 3): + raise ValueError(f"Zarr format must be 2 or 3, but got {zarr_format}") return ZArray( chunks=tuple(decoded_arr_refs_zarray["chunks"]), @@ -93,46 +89,111 @@ def from_kerchunk_refs(cls, decoded_arr_refs_zarray) -> "ZArray": filters=decoded_arr_refs_zarray["filters"], order=decoded_arr_refs_zarray["order"], shape=tuple(decoded_arr_refs_zarray["shape"]), - zarr_format=int(decoded_arr_refs_zarray["zarr_format"]), + zarr_format=cast(ZARR_FORMAT, zarr_format), ) def dict(self) -> dict[str, Any]: - zarray_dict = dict(self) - + zarray_dict = dataclasses.asdict(self) zarray_dict["dtype"] = encode_dtype(zarray_dict["dtype"]) - - if zarray_dict["fill_value"] is np.nan: - zarray_dict["fill_value"] = None - return zarray_dict def to_kerchunk_json(self) -> str: - return ujson.dumps(self.dict()) + import ujson + zarray_dict = self.dict() + if zarray_dict["fill_value"] is np.nan: + zarray_dict["fill_value"] = None + return ujson.dumps(zarray_dict) + + # ZArray.dict seems to shadow "dict", so we need the type ignore in + # the signature below. def replace( self, - chunks: Optional[tuple[int, ...]] = None, - compressor: Optional[str] = None, - dtype: Optional[np.dtype] = None, - fill_value: Optional[float] = None, # float or int? - filters: Optional[list[dict]] = None, # type: ignore[valid-type] - order: Optional[Literal["C"] | Literal["F"]] = None, - shape: Optional[tuple[int, ...]] = None, - zarr_format: Optional[Literal[2] | Literal[3]] = None, + shape: tuple[int, ...] | None = None, + chunks: tuple[int, ...] 
| None = None, + dtype: np.dtype | str | None = None, + fill_value: FillValueT = None, + order: Literal["C", "F"] | None = None, + compressor: "dict | None" = None, # type: ignore[valid-type] + filters: list[dict] | None = None, # type: ignore[valid-type] + zarr_format: Literal[2, 3] | None = None, ) -> "ZArray": """ Convenience method to create a new ZArray from an existing one by altering only certain attributes. """ - return ZArray( - chunks=chunks if chunks is not None else self.chunks, - compressor=compressor if compressor is not None else self.compressor, - dtype=dtype if dtype is not None else self.dtype, - fill_value=fill_value if fill_value is not None else self.fill_value, - filters=filters if filters is not None else self.filters, - shape=shape if shape is not None else self.shape, - order=order if order is not None else self.order, - zarr_format=zarr_format if zarr_format is not None else self.zarr_format, - ) + replacements: dict[str, Any] = {} + if shape is not None: + replacements["shape"] = shape + if chunks is not None: + replacements["chunks"] = chunks + if dtype is not None: + replacements["dtype"] = dtype + if fill_value is not None: + replacements["fill_value"] = fill_value + if order is not None: + replacements["order"] = order + if compressor is not None: + replacements["compressor"] = compressor + if filters is not None: + replacements["filters"] = filters + if zarr_format is not None: + replacements["zarr_format"] = zarr_format + return dataclasses.replace(self, **replacements) + + def _v3_codec_pipeline(self) -> list: + """ + VirtualiZarr internally uses the `filters`, `compressor`, and `order` attributes + from zarr v2, but to create conformant zarr v3 metadata those 3 must be turned into `codecs` objects. + Not all codecs are created equal though: https://github.com/zarr-developers/zarr-python/issues/1943 + An array _must_ declare a single ArrayBytes codec, and 0 or more ArrayArray, BytesBytes codecs. + Roughly, this is the mapping: + ``` + filters: Iterable[ArrayArrayCodec] #optional + compressor: ArrayBytesCodec #mandatory + post_compressor: Iterable[BytesBytesCodec] #optional + ``` + """ + import numcodecs + + if self.filters: + filter_codecs_configs = [ + numcodecs.get_codec(filter).get_config() for filter in self.filters + ] + filters = [ + dict(name=codec.pop("id"), configuration=codec) + for codec in filter_codecs_configs + ] + else: + filters = [] + + # Noting here that zarr v3 has very few codecs specificed in the official spec, + # and that there are far more codecs in `numcodecs`. We take a gamble and assume + # that the codec names and configuration are simply mapped into zarrv3 "configurables". + if self.compressor: + compressor = [_num_codec_config_to_configurable(self.compressor)] + else: + compressor = [] + + # https://zarr-specs.readthedocs.io/en/latest/v3/codecs/transpose/v1.0.html#transpose-codec-v1 + # Either "C" or "F", defining the layout of bytes within each chunk of the array. + # "C" means row-major order, i.e., the last dimension varies fastest; + # "F" means column-major order, i.e., the first dimension varies fastest. 
+ if self.order == "C": + order = tuple(range(len(self.shape))) + elif self.order == "F": + order = tuple(reversed(range(len(self.shape)))) + + transpose = dict(name="transpose", configuration=dict(order=order)) + # https://github.com/zarr-developers/zarr-python/pull/1944#issuecomment-2151994097 + # "If no ArrayBytesCodec is supplied, we can auto-add a BytesCodec" + bytes = dict( + name="bytes", configuration={} + ) # TODO need to handle endianess configuration + + # The order here is significant! + # [ArrayArray] -> ArrayBytes -> [BytesBytes] + codec_pipeline = [transpose, bytes] + compressor + filters + return codec_pipeline def encode_dtype(dtype: np.dtype) -> str: @@ -149,149 +210,15 @@ def ceildiv(a: int, b: int) -> int: return -(a // -b) -def dataset_to_zarr(ds: xr.Dataset, storepath: str) -> None: - """ - Write an xarray dataset whose variables wrap ManifestArrays to a v3 Zarr store, writing chunk references into manifest.json files. - - Currently requires all variables to be backed by ManifestArray objects. +def determine_chunk_grid_shape( + shape: tuple[int, ...], chunks: tuple[int, ...] +) -> tuple[int, ...]: + return tuple(ceildiv(length, chunksize) for length, chunksize in zip(shape, chunks)) - Not very useful until some implementation of a Zarr reader can actually read these manifest.json files. - See https://github.com/zarr-developers/zarr-specs/issues/287 - Parameters - ---------- - ds: xr.Dataset - storepath: str +def _num_codec_config_to_configurable(num_codec: dict) -> dict: """ - - from virtualizarr.manifests import ManifestArray - - _storepath = Path(storepath) - Path.mkdir(_storepath, exist_ok=False) - - # should techically loop over groups in a tree but a dataset corresponds to only one group - group_metadata = {"zarr_format": 3, "node_type": "group", "attributes": ds.attrs} - with open(_storepath / "zarr.json", "wb") as group_metadata_file: - group_metadata_file.write(json_dumps(group_metadata)) - - for name, var in ds.variables.items(): - array_dir = _storepath / name - marr = var.data - - # TODO move this check outside the writing loop so we don't write an incomplete store on failure? - # TODO at some point this should be generalized to also write in-memory arrays as normal zarr chunks, see GH isse #62. - if not isinstance(marr, ManifestArray): - raise TypeError( - "Only xarray objects wrapping ManifestArrays can be written to zarr using this method, " - f"but variable {name} wraps an array of type {type(marr)}" - ) - - Path.mkdir(array_dir, exist_ok=False) - - # write the chunk references into a manifest.json file - # and the array metadata into a zarr.json file - to_zarr_json(var, array_dir) - - -def to_zarr_json(var: xr.Variable, array_dir: Path) -> None: - """ - Write out both the zarr.json and manifest.json file into the given zarr array directory. - - Follows the Zarr v3 manifest storage transformer ZEP (see https://github.com/zarr-developers/zarr-specs/issues/287). - - Parameters - ---------- - var : xr.Variable - Must be wrapping a ManifestArray - dirpath : str - Zarr store array directory into which to write files. + Convert a numcodecs codec into a zarr v3 configurable. 
""" - - marr = var.data - - marr.manifest.to_zarr_json(array_dir / "manifest.json") - - metadata = zarr_v3_array_metadata(marr.zarray, list(var.dims), var.attrs) - with open(array_dir / "zarr.json", "wb") as metadata_file: - metadata_file.write(json_dumps(metadata)) - - -def zarr_v3_array_metadata(zarray: ZArray, dim_names: list[str], attrs: dict) -> dict: - """Construct a v3-compliant metadata dict from v2 zarray + information stored on the xarray variable.""" - # TODO it would be nice if we could use the zarr-python metadata.ArrayMetadata classes to do this conversion for us - - metadata = zarray.dict() - - # adjust to match v3 spec - metadata["zarr_format"] = 3 - metadata["node_type"] = "array" - metadata["data_type"] = str(np.dtype(metadata.pop("dtype"))) - metadata["chunk_grid"] = { - "name": "regular", - "configuration": {"chunk_shape": metadata.pop("chunks")}, - } - metadata["chunk_key_encoding"] = { - "name": "default", - "configuration": {"separator": "/"}, - } - metadata["codecs"] = metadata.pop("filters") - metadata.pop("compressor") # TODO this should be entered in codecs somehow - metadata.pop("order") # TODO this should be replaced by a transpose codec - - # indicate that we're using the manifest storage transformer ZEP - metadata["storage_transformers"] = [ - { - "name": "chunk-manifest-json", - "configuration": {"manifest": "./manifest.json"}, - } - ] - - # add information from xarray object - metadata["dimension_names"] = dim_names - metadata["attributes"] = attrs - - return metadata - - -def attrs_from_zarr_group_json(filepath: Path) -> dict: - with open(filepath) as metadata_file: - attrs = json.load(metadata_file) - return attrs["attributes"] - - -def metadata_from_zarr_json(filepath: Path) -> tuple[ZArray, list[str], dict]: - with open(filepath) as metadata_file: - metadata = json.load(metadata_file) - - if { - "name": "chunk-manifest-json", - "configuration": { - "manifest": "./manifest.json", - }, - } not in metadata.get("storage_transformers", []): - raise ValueError( - "Can only read byte ranges from Zarr v3 stores which implement the manifest storage transformer ZEP." - ) - - attrs = metadata.pop("attributes") - dim_names = metadata.pop("dimension_names") - - chunk_shape = metadata["chunk_grid"]["configuration"]["chunk_shape"] - - if metadata["fill_value"] is None: - fill_value = np.nan - else: - fill_value = metadata["fill_value"] - - zarray = ZArray( - chunks=metadata["chunk_grid"]["configuration"]["chunk_shape"], - compressor=metadata["codecs"], - dtype=np.dtype(metadata["data_type"]), - fill_value=fill_value, - filters=metadata.get("filters", None), - order="C", - shape=chunk_shape, - zarr_format=3, - ) - - return zarray, dim_names, attrs + num_codec_copy = num_codec.copy() + return {"name": num_codec_copy.pop("id"), "configuration": num_codec_copy}