Merge branch 'main' into guhidalgo/addzarrv3dependency

zarr-developers · Oct 20, 2024 · e4534b6 · e4534b6
2 parents 933e575 + b1ae3fa
commit e4534b6
Show file tree

Hide file tree

Showing 66 changed files with 4,326 additions and 1,711 deletions.
diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
@@ -12,6 +12,10 @@ on:
   schedule:
     - cron: "0 0 * * *"
 
+concurrency:
+  group: ${{ github.workflow }}-${{ github.ref }}
+  cancel-in-progress: true
+
 jobs:
 
   test:

diff --git a/.github/workflows/min-deps.yml b/.github/workflows/min-deps.yml
@@ -0,0 +1,60 @@
+name: min-deps
+
+on:
+  push:
+    branches: [ "main" ]
+    paths-ignore:
+    - 'docs/**'
+  pull_request:
+    branches: [ "main" ]
+    paths-ignore:
+    - 'docs/**'
+  schedule:
+    - cron: "0 0 * * *"
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.ref }}
+  cancel-in-progress: true
+
+jobs:
+
+  test:
+    name: ${{ matrix.python-version }}-build
+    runs-on: ubuntu-latest
+    defaults:
+      run:
+        shell: bash -l {0}
+    strategy:
+      matrix:
+        python-version: ["3.12"]
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Setup micromamba
+        uses: mamba-org/setup-micromamba@v1
+        with:
+          environment-file: ci/min-deps.yml
+          cache-environment: true
+          create-args: >-
+            python=${{matrix.python-version}}
+
+      - name: Install virtualizarr  # env is built from ci/min-deps.yml, so pip adds no deps
+        run: |
+          python -m pip install -e . --no-deps
+      - name: Conda list information
+        run: |
+          conda env list
+          conda list
+
+      - name: Running Tests
+        run: |
+          python -m pytest ./virtualizarr --cov=./ --cov-report=xml --verbose
+
+      - name: Upload code coverage to Codecov
+        uses: codecov/codecov-action@v3.1.4
+        with:
+          file: ./coverage.xml
+          flags: unittests
+          env_vars: OS,PYTHON
+          name: codecov-umbrella
+          fail_ci_if_error: false  # a failed upload must not fail the CI run
diff --git a/.github/workflows/typing.yml b/.github/workflows/typing.yml
@@ -0,0 +1,38 @@
+name: Typing
+
+on:
+  push:
+    branches: [ "main" ]
+    paths-ignore:
+    - 'docs/**'
+  pull_request:
+    branches: [ "main" ]
+    paths-ignore:
+    - 'docs/**'
+  schedule:
+    - cron: "0 0 * * *"
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.ref }}
+  cancel-in-progress: true
+
+jobs:
+  mypy:
+    name: mypy
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Set up Python
+        uses: actions/setup-python@v4
+        with:
+          python-version: '3.12'
+
+      - name: Install deps
+        run: |
+          # We need to test optional dep to add all the library stubs
+          pip install -e '.[test]'
+
+      - name: Type check
+        run: |
+          mypy virtualizarr
diff --git a/.gitignore b/.gitignore
@@ -160,3 +160,4 @@ cython_debug/
 #.idea/
 virtualizarr/_version.py
 docs/generated/
+examples/
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -3,42 +3,18 @@ ci:
   autoupdate_schedule: monthly
 repos:
   - repo: https://github.com/pre-commit/pre-commit-hooks
-    rev: v4.6.0
+    rev: v5.0.0
     hooks:
       - id: trailing-whitespace
       - id: end-of-file-fixer
       - id: check-yaml
 
   - repo: https://github.com/astral-sh/ruff-pre-commit
     # Ruff version.
-    rev: "v0.4.7"
+    rev: "v0.6.9"
     hooks:
       # Run the linter.
       - id: ruff
         args: [ --fix ]
       # Run the formatter.
       - id: ruff-format
-
-  - repo: https://github.com/pre-commit/mirrors-mypy
-    rev: v1.10.0
-    hooks:
-      - id: mypy
-        # Copied from setup.cfg
-        exclude: "properties|asv_bench|docs"
-        additional_dependencies: [
-            # Type stubs
-            types-python-dateutil,
-            types-pkg_resources,
-            types-PyYAML,
-            types-pytz,
-            # Dependencies that are typed
-            numpy,
-            typing-extensions>=4.1.0,
-          ]
-  # run this occasionally, ref discussion https://github.com/pydata/xarray/pull/3194
-  # - repo: https://github.com/asottile/pyupgrade
-  #   rev: v3.15.2
-  #   hooks:
-  #     - id: pyupgrade
-  #       args:
-  #         - "--py310-plus"
diff --git a/README.md b/README.md
@@ -4,6 +4,8 @@
 
 VirtualiZarr (pronounced like "virtualize" but more piratey) grew out of [discussions](https://github.com/fsspec/kerchunk/issues/377) on the [kerchunk repository](https://github.com/fsspec/kerchunk), and is an attempt to provide the game-changing power of kerchunk in a zarr-native way, and with a familiar array-like API.
 
+You now have a choice between using VirtualiZarr and Kerchunk: VirtualiZarr provides [almost all the same features](https://virtualizarr.readthedocs.io/en/latest/faq.html#how-do-virtualizarr-and-kerchunk-compare) as Kerchunk.
+
 _Please see the [documentation](https://virtualizarr.readthedocs.io/en/latest/)_
 
 ### Development Status and Roadmap

diff --git a/ci/doc.yml b/ci/doc.yml
@@ -13,4 +13,3 @@ dependencies:
       - "sphinx_design"
       - "sphinx_togglebutton"
       - "sphinx-autodoc-typehints"
-      - -e  "..[test]"
diff --git a/ci/environment.yml b/ci/environment.yml
@@ -9,15 +9,16 @@ dependencies:
   - netcdf4
   - xarray>=2024.6.0
   - kerchunk>=0.2.5
-  - pydantic
   - numpy>=2.0.0
   - ujson
   - packaging
   - universal_pathlib
   # Testing
   - codecov
   - pre-commit
+  - mypy
   - ruff
+  - pandas-stubs
   - pytest-mypy
   - pytest-cov
   - pytest

diff --git a/ci/min-deps.yml b/ci/min-deps.yml
@@ -0,0 +1,26 @@
+name: virtualizarr-min-deps
+channels:
+  - conda-forge
+  - nodefaults
+dependencies:
+  - h5netcdf
+  - h5py
+  - hdf5
+  - netcdf4
+  - xarray>=2024.6.0
+  - numpy>=2.0.0
+  - numcodecs
+  - packaging
+  - ujson
+  - universal_pathlib
+  # Testing
+  - codecov
+  - pre-commit
+  - mypy
+  - ruff
+  - pandas-stubs
+  - pytest-mypy
+  - pytest-cov
+  - pytest
+  - pooch
+  - fsspec
diff --git a/conftest.py b/conftest.py
@@ -1,3 +1,4 @@
+import h5py
 import pytest
 import xarray as xr
 
@@ -32,6 +33,33 @@ def netcdf4_file(tmpdir):
     return filepath
 
 
+@pytest.fixture
+def netcdf4_virtual_dataset(netcdf4_file):
+    """Result of ``open_virtual_dataset(netcdf4_file, indexes={})``."""
+    from virtualizarr import open_virtual_dataset
+    return open_virtual_dataset(netcdf4_file, indexes={})
+
+
+@pytest.fixture
+def netcdf4_inlined_ref(netcdf4_file):
+    """Kerchunk ``SingleHdf5ToZarr`` output for ``netcdf4_file`` (inline_threshold=1000)."""
+    from kerchunk.hdf import SingleHdf5ToZarr
+    return SingleHdf5ToZarr(netcdf4_file, inline_threshold=1000).translate()
+
+
+@pytest.fixture
+def hdf5_groups_file(tmpdir):
+    """Path to a NETCDF4 file holding the tutorial data in group "test/group"."""
+    ds = xr.tutorial.open_dataset("air_temperature")
+
+    # Save it to disk as netCDF (in temporary directory), inside a nested group
+    filepath = f"{tmpdir}/air.nc"
+    ds.to_netcdf(filepath, format="NETCDF4", group="test/group")
+    ds.close()
+
+    return filepath
+
+
 @pytest.fixture
 def netcdf4_files(tmpdir):
     # Set up example xarray dataset
@@ -50,3 +78,21 @@ def netcdf4_files(tmpdir):
     ds2.close()
 
     return filepath1, filepath2
+
+
+@pytest.fixture
+def hdf5_empty(tmpdir):
+    """Write an HDF5 file with one shape=() float32 dataset; return its path."""
+    filepath = f"{tmpdir}/empty.nc"
+    with h5py.File(filepath, "w") as f:  # close the handle before handing out the path
+        f.create_dataset("empty", shape=(), dtype="float32").attrs["empty"] = "true"
+    return filepath
+
+
+@pytest.fixture
+def hdf5_scalar(tmpdir):
+    """Write an HDF5 file with one float32 scalar dataset (0.1); return its path."""
+    filepath = f"{tmpdir}/scalar.nc"
+    with h5py.File(filepath, "w") as f:  # close the handle before handing out the path
+        f.create_dataset("scalar", data=0.1, dtype="float32").attrs["scalar"] = "true"
+    return filepath
diff --git a/docs/api.rst b/docs/api.rst
@@ -21,7 +21,7 @@ Manifests
 Reading
 =======
 
-.. currentmodule:: virtualizarr.xarray
+.. currentmodule:: virtualizarr.backend
 .. autosummary::
     :nosignatures:
     :toctree: generated/
@@ -32,7 +32,7 @@ Reading
 Serialization
 =============
 
-.. currentmodule:: virtualizarr.xarray
+.. currentmodule:: virtualizarr.accessor
 .. autosummary::
     :nosignatures:
     :toctree: generated/
@@ -44,7 +44,7 @@ Serialization
 Rewriting
 =============
 
-.. currentmodule:: virtualizarr.xarray
+.. currentmodule:: virtualizarr.accessor
 .. autosummary::
     :nosignatures:
     :toctree: generated/

diff --git a/docs/conf.py b/docs/conf.py
@@ -55,11 +55,23 @@
 
 html_theme = "pydata_sphinx_theme"
 html_theme_options = {
-    "repository_url": "https://github.com/TomNicholas/VirtualiZarr",
-    "repository_branch": "main",
-    "path_to_docs": "docs",
+    "use_edit_page_button": True,
+    "icon_links": [
+        {
+            "name": "GitHub",
+            "url": "https://github.com/zarr-developers/VirtualiZarr",
+            "icon": "fa-brands fa-github",
+            "type": "fontawesome",
+        },
+    ]
 }
 html_title = "VirtualiZarr"
+html_context = {
+    "github_user": "zarr-developers",
+    "github_repo": "VirtualiZarr",
+    "github_version": "main",
+    "doc_path": "docs",
+}
 
 # remove sidebar, see GH issue #82
 html_css_files = [

diff --git a/docs/faq.md b/docs/faq.md
@@ -16,6 +16,8 @@ The above steps would also be performed using the `kerchunk` library alone, but
 
 ## How do VirtualiZarr and Kerchunk compare?
 
+You now have a choice between using VirtualiZarr and Kerchunk: VirtualiZarr provides [almost all the same features](https://virtualizarr.readthedocs.io/en/latest/faq.html#how-do-virtualizarr-and-kerchunk-compare) as Kerchunk.
+
 Users of kerchunk may find the following comparison table useful, which shows which features of kerchunk map on to which features of VirtualiZarr.
 | Component / Feature                                                      | Kerchunk                                                                                                                            | VirtualiZarr                                                                                                                                     |
 | ------------------------------------------------------------------------ | ----------------------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------ |

diff --git a/docs/index.md b/docs/index.md
@@ -4,6 +4,8 @@
 
 VirtualiZarr grew out of [discussions](https://github.com/fsspec/kerchunk/issues/377) on the [kerchunk repository](https://github.com/fsspec/kerchunk), and is an attempt to provide the game-changing power of kerchunk in a zarr-native way, and with a familiar array-like API.
 
+You now have a choice between using VirtualiZarr and Kerchunk: VirtualiZarr provides [almost all the same features](https://virtualizarr.readthedocs.io/en/latest/faq.html#how-do-virtualizarr-and-kerchunk-compare) as Kerchunk.
+
 ## Motivation
 
 The Kerchunk idea solves an incredibly important problem: accessing big archival datasets via a cloud-optimized pattern, but without copying or modifying the original data in any way. This is a win-win-win for users, data engineers, and data providers. Users see fast-opening zarr-compliant stores that work performantly with libraries like xarray and dask, data engineers can provide this speed by adding a lightweight virtualization layer on top of existing data (without having to ask anyone's permission), and data providers don't have to change anything about their legacy files for them to be used in a cloud-optimized way.

diff --git a/docs/installation.md b/docs/installation.md
@@ -1,11 +1,15 @@
 # Installation
 
-Currently you need to clone VirtualiZarr and install it locally:
+VirtualiZarr is available on PyPI via pip:
 
 ```shell
-git clone https://github.com/zarr-developers/VirtualiZarr
-cd VirtualiZarr
-pip install -e .
+pip install virtualizarr
+```
+
+and on conda-forge:
+
+```shell
+conda install -c conda-forge virtualizarr
 ```