Merge branch 'main' into py_version_update

klindsay28 · Mar 29, 2024 · 247ee42 · 247ee42
2 parents 65c7f8a + e0b8f26
commit 247ee42
Show file tree

Hide file tree

Showing 31 changed files with 820 additions and 199 deletions.
diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml
@@ -52,8 +52,10 @@ jobs:
 
       - name: Code Checks
         run: |
-          black --diff .
-          flake8
+          black --diff esm_catalog_utils tests
+          isort --diff esm_catalog_utils tests
+          flake8 esm_catalog_utils tests
+          mypy esm_catalog_utils tests
 
       - name: Run Tests
         run: |

diff --git a/.readthedocs.yaml b/.readthedocs.yaml
@@ -0,0 +1,18 @@
+# Read the Docs configuration file for Sphinx projects
+# See https://docs.readthedocs.io/en/stable/config-file/v2.html for details
+
+version: 2
+
+build:
+  os: ubuntu-22.04
+  tools:
+    python: "3.9"
+
+python:
+  install:
+    - requirements: docs/requirements.txt
+    - method: pip
+      path: .
+
+sphinx:
+  configuration: docs/source/conf.py
diff --git a/README.md b/README.md
@@ -1,2 +1,7 @@
 # esm_catalog_utils
+
 tools/utilities to support the usage of catalogs to access and analyze ESM output
+
+## Documentation
+
+https://esm-catalog-utils.readthedocs.io/
diff --git a/ci/environment.yaml b/ci/environment.yaml
@@ -8,10 +8,14 @@ dependencies:
   - flake8
   - intake
   - intake-esm
+  - isort
+  - mypy
   - netCDF4
   - numpy
   - packaging
   - pandas
+  - pandas-stubs
+  - pydantic<2.0
   - pytest
-  - pyyaml
+  - types-pyyaml
   - xarray
diff --git a/ci/environment_py38.yaml b/ci/environment_py38.yaml
@@ -9,11 +9,14 @@ dependencies:
   - fsspec<2023.10.0
   - intake
   - intake-esm
+  - isort
+  - mypy
   - netCDF4
   - numpy
   - packaging
   - pandas
+  - pandas-stubs
   - pydantic<2.0
   - pytest
-  - pyyaml
+  - types-pyyaml
   - xarray
diff --git a/docs/environment.yaml b/docs/environment.yaml
@@ -2,8 +2,11 @@ name: esm_catalog_utils_docs
 channels:
   - conda-forge
 dependencies:
+  - python=3.9
   - black
   - flake8
-  - sphinx<6.0
-  - sphinx-rtd-theme
-  - urllib3<2.0
+  - furo
+  - intake-esm
+  - pydantic<2.0
+  - myst-nb
+  - sphinx
diff --git a/docs/requirements.txt b/docs/requirements.txt
@@ -1,3 +1,5 @@
-sphinx<6.0
-sphinx_rtd_theme
-urllib3<2.0
+sphinx
+furo
+intake-esm
+pydantic<2.0
+myst-nb
diff --git a/docs/source/api.rst b/docs/source/api.rst
@@ -0,0 +1,20 @@
+.. currentmodule:: esm_catalog_utils
+
+#############
+API reference
+#############
+
+This page provides an auto-generated summary of esm_catalog_utils' API.
+
+Top-level functions
+===================
+
+.. autosummary::
+   :toctree: generated/
+
+   caseroot_to_esm_datastore
+   directory_to_esm_datastore
+   caseroot_to_case_metadata
+   case_metadata_to_esm_datastore
+   parse_file_cesm
+   parse_path_cesm
diff --git a/docs/source/conf.py b/docs/source/conf.py
@@ -3,24 +3,46 @@
 # For the full list of built-in configuration values, see the documentation:
 # https://www.sphinx-doc.org/en/master/usage/configuration.html
 
+import os.path
+import sys
+
+sys.path.insert(0, os.path.abspath("../esm_catalog_utils/"))
+
 # -- Project information -----------------------------------------------------
 # https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information
 
 project = "esm_catalog_utils"
-copyright = "2023, Keith Lindsay"
-author = "Keith Lindsay"
+copyright = "2023"
+author = "@klindsay28"
 
 # -- General configuration ---------------------------------------------------
 # https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration
 
-extensions = []
+extensions = [
+    "sphinx.ext.autodoc",
+    "sphinx.ext.autosummary",
+    "sphinx.ext.intersphinx",
+    "sphinx.ext.napoleon",
+    "myst_nb",
+]
 
 templates_path = ["_templates"]
 exclude_patterns = []
 
+autosummary_generate = True
+autodoc_typehints = "none"
+
+nb_execution_mode = "off"
+
+intersphinx_mapping = {
+    "intake-esm": ("https://intake-esm.readthedocs.io/en/stable/", None),
+    "dask": ("https://docs.dask.org/en/stable/", None),
+    "dask.distributed": ("https://distributed.dask.org/en/stable/", None),
+    "pandas": ("https://pandas.pydata.org/pandas-docs/stable/", None),
+    "zarr": ("https://zarr.readthedocs.io/en/stable/", None),
+}
 
 # -- Options for HTML output -------------------------------------------------
 # https://www.sphinx-doc.org/en/master/usage/configuration.html#options-for-html-output
 
-html_theme = "sphinx_rtd_theme"
-html_static_path = ["_static"]
+html_theme = "furo"
diff --git a/docs/source/developers_guide.rst b/docs/source/developers_guide.rst
@@ -0,0 +1,55 @@
+=================
+Developer's Guide
+=================
+
+Coding Style
+------------
+
+Code Formatting
+~~~~~~~~~~~~~~~
+
+The code of the package is formatted using the tools `black
+<https://black.readthedocs.io/>`_ and `isort <https://pycqa.github.io/isort/>`_.
+This ensures that the code across the package has a consistent appearance.
+
+Documentation Strings
+~~~~~~~~~~~~~~~~~~~~~
+
+Documentation strings (docstrings) follow the Docstring Standard from the
+`numpy Style guide <https://numpydoc.readthedocs.io/en/latest/format.html>`_.
+This standard describes how the content of docstrings is organized.
+Docstrings are written using `reStructuredText
+<http://docutils.sourceforge.net/rst.html>`_ markup syntax and are rendered
+into documentation using `Sphinx <https://www.sphinx-doc.org/>`_.
+
+Function Annotations/Type Hints
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+`Function annotations <https://peps.python.org/pep-3107/>`_ are used to
+document the types of a function's parameters and return values.
+This enables users of the package to use external tools like `mypy
+<https://mypy.readthedocs.io/en/stable/>`_ to help ensure that they're
+using the package properly.
+Python's `typing module <https://peps.python.org/pep-0484/>`_ is used to
+support the annotations.
+
+Testing
+-------
+
+Testing is performed with continuous integration using `GitHub Actions
+<https://github.com/features/actions>`_.
+Testing is performed with python versions 3.7 through 3.11.
+Testing consists of the following:
+
+- Run the source code through `black <https://black.readthedocs.io/>`_ and
+  `isort <https://pycqa.github.io/isort/>`_ to verify that the desired code
+  formatting is adhered to.
+- Run the source code through `flake8 <https://flake8.pycqa.org/>`_, which
+  analyzes the code and detects various errors.
+- Run the source code through `mypy
+  <https://mypy.readthedocs.io/en/stable/>`_, to ensure that variable types
+  are used appropriately throughout the package.
+- Run unit tests, located in the ``tests`` subdirectory. The unit tests include
+  creating catalogs from internally generated input files and verifying that
+  the generated catalogs match baseline catalogs that are included in the
+  repository.
diff --git a/docs/source/esm_catalog_background.rst b/docs/source/esm_catalog_background.rst
@@ -0,0 +1,33 @@
+==================================
+ESM Catalog Background Information
+==================================
+
+A simplified view of ESM catalogs is that they consist of the paths of
+ESM output files, metadata about these files, e.g., names of data variables
+in the files and date ranges covered, and metadata about how the data files
+can be aggregated together.
+
+More generally, data files can reside in the cloud, in which case `URIs
+<https://en.wikipedia.org/wiki/Uniform_Resource_Identifier>`_ are used
+instead of paths, and data files can be in a format where their content is
+spread across multiple files, e.g., :std:doc:`zarr <zarr:index>`.
+In the following, ESM output is referred to as assets, to recognize these
+generalizations.
+
+Metadata about the assets referred to by an ESM catalog (paths or URIs,
+data variable names, date ranges, etc.) is stored in memory in a
+:py:mod:`pandas` :py:class:`~pandas.DataFrame` object, and on disk in a
+comma-separated values (CSV) file.
+
+The primary data structure in :std:doc:`intake-esm <intake-esm:index>`
+to support ESM catalogs is the :std:doc:`esm_datastore
+<intake-esm:reference/api>` class.
+Loosely speaking, this class consists of an
+:std:doc:`intake-esm:reference/esm-catalog-spec` and functions that operate
+on class objects.
+The :std:doc:`intake-esm:reference/esm-catalog-spec` consists of a
+dictionary of asset metadata that is available, i.e., columns in the
+above-mentioned CSV file, metadata about how the assets can be aggregated
+together, and some other metadata, such as a description of the catalog.
+The metadata regarding aggregation is stored in an `aggregation control object
+<https://intake-esm.readthedocs.io/en/stable/reference/esm-catalog-spec.html#aggregation-control-object>`_.
diff --git a/docs/source/general_usage.rst b/docs/source/general_usage.rst
@@ -0,0 +1,133 @@
+=============
+General Usage
+=============
+
+Creating a Catalog
+------------------
+
+Catalogs, i.e. :std:doc:`esm_datastore <intake-esm:reference/api>` objects,
+are created in :mod:`esm_catalog_utils` from a casename and a list of
+directories containing model output.
+The casename and list of directories are stored in a dictionary with
+keys ``case`` and ``output_dirs`` respectively.
+We refer to this dictionary as ``case_metadata``.
+The function :func:`~esm_catalog_utils.case_metadata_to_esm_datastore`
+takes a *case_metadata* argument and returns an :std:doc:`esm_datastore
+<intake-esm:reference/api>` object for the output files in ``output_dirs``
+and its subdirectories.
+Additional arguments are described in its :func:`API documentation
+<esm_catalog_utils.case_metadata_to_esm_datastore>`.
+
+:mod:`esm_catalog_utils` also provides helper functions that generate
+the ``case_metadata`` dictionary in particular use cases, call
+:func:`~esm_catalog_utils.case_metadata_to_esm_datastore`, and
+return the result.
+
+:func:`~esm_catalog_utils.directory_to_esm_datastore` is a helper function
+for the use case of having model output in a single top-level directory
+and its subdirectories.
+The *dir* argument of :func:`~esm_catalog_utils.directory_to_esm_datastore`
+is the top-level directory where the model output is located.
+The casename can be either passed as the *case* argument to 
+:func:`~esm_catalog_utils.directory_to_esm_datastore`
+or inferred from the basename of *dir*.
+
+:func:`~esm_catalog_utils.caseroot_to_esm_datastore` is a helper function
+that takes a *caseroot* argument.
+It determines the ``case_metadata``, the casename and location of the model
+output, from the xml files in *caseroot*.
+
+Additional arguments to these helper functions are passed through to
+:func:`~esm_catalog_utils.case_metadata_to_esm_datastore`.
+Example usage of these helper functions is provided in the
+:ref:`notebooks`.
+
+Parallelization
+~~~~~~~~~~~~~~~
+
+Extracting the metadata from model output files, such as the data variable
+names and date ranges, involves opening the files and examining the file's
+metadata.
+For long runs, there can be tens of thousands of native model history files.
+Opening all of these files and examining their metadata can take a
+considerable amount of time.
+In order to speed up this process,
+:func:`~esm_catalog_utils.case_metadata_to_esm_datastore` can use
+:std:doc:`dask:index` to accelerate this embarrassingly parallel task.
+If the *use_dask* argument to
+:func:`~esm_catalog_utils.case_metadata_to_esm_datastore` is ``True``, then
+it will wrap the file open and query operations inside
+:std:doc:`dask:index` :py:class:`~dask.delayed.Delayed` objects and execute
+them in parallel.
+
+This should only be done if
+:func:`~esm_catalog_utils.case_metadata_to_esm_datastore` is called after
+instantiating a :std:doc:`dask.distributed:index`
+:py:class:`~distributed.Client`, as otherwise an error may be raised.
+The default value for *use_dask* is ``False``.
+
+The *use_dask* argument can also be passed to the helper functions
+:func:`~esm_catalog_utils.directory_to_esm_datastore` and
+:func:`~esm_catalog_utils.caseroot_to_esm_datastore`, and it will be passed
+through to :func:`~esm_catalog_utils.case_metadata_to_esm_datastore`.
+
+Writing and Reading a Catalog
+-----------------------------
+
+:std:doc:`esm_datastore <intake-esm:reference/api>` objects can be written
+to disk using the object's :func:`serialize` method, which is documented in
+the intake-esm :std:doc:`intake-esm:reference/api`.
+The resulting files can be read using :func:`intake.open_esm_datastore`.
+Example usage of these methods and functions is provided in the
+:ref:`notebooks`.
+
+Updating a Catalog
+------------------
+
+Even with the parallel speed-up provided by *use_dask*, generating a
+catalog for a long run takes a non-trivial amount of time.
+A use case for analysis of ESM output that regularly occurs, particularly
+during a development cycle, is to analyze a run, extend the run, and
+analyze the extended run.
+:func:`~esm_catalog_utils.case_metadata_to_esm_datastore` has an argument
+named *esm_datastore_in* to accelerate this use case.
+If this argument is passed,
+:func:`~esm_catalog_utils.case_metadata_to_esm_datastore` will return an
+:py:class:`esm_datastore` object with entries appended to
+*esm_datastore_in*.
+The paths determined from the *case_metadata* argument to
+:func:`~esm_catalog_utils.case_metadata_to_esm_datastore` are checked for
+existence in *esm_datastore_in*'s DataFrame ``df``.
+If the path is present in ``df`` and the file's size differs from its size
+in *esm_datastore_in*, then the entry for that path is recreated.
+If the file's size is the same as its size in *esm_datastore_in*,
+then that file's catalog entry is propagated without reopening the file
+and querying its metadata.
+Because checking a file's size is much faster than this metadata query,
+this option provides a considerable speed-up in this use case.
+
+The *esm_datastore_in* argument can also be passed to the helper functions
+:func:`~esm_catalog_utils.directory_to_esm_datastore` and
+:func:`~esm_catalog_utils.caseroot_to_esm_datastore`, and it will be passed
+through to :func:`~esm_catalog_utils.case_metadata_to_esm_datastore`.
+
+Example usage of the *esm_datastore_in* argument is provided in the
+:ref:`notebooks`.
+
+Catalog Issues Specific to History Files
+----------------------------------------
+In some model analysis use cases, the model output being analyzed has been
+post-processed into files that have a single data variable per file.
+In contrast, native model history file output, the files written directly
+by ESMs, typically has multiple data variables per file.
+In the history file case, the ``varname`` column of the CSV file component of the
+ESM catalog is a list.
+Additional steps are necessary to properly parse such files when calling
+:func:`intake.open_esm_datastore`.
+As described in the :std:doc:`intake-esm documentation
+<intake-esm:how-to/use-catalogs-with-assets-containing-multiple-variables>`,
+one approach to handle this use case is to pass the value
+``{"converters": {"varname": ast.literal_eval}}`` to the *read_csv_kwargs*
+argument of :func:`intake.open_esm_datastore` when
+reading the catalog.
+This is demonstrated in the :doc:`history file example notebook
+<notebooks/ex1_caseroot_hist>`.