Skip to content

Commit

Permalink
DEP: Deepdish to h5py (#60)
Browse files Browse the repository at this point in the history
Use h5py for all HDF5 file I/O. The deepdish package is no longer maintained, and h5py is managed by the HDF5 group so is not at risk of deprecation.
  • Loading branch information
carleyjmartin authored Mar 23, 2023
1 parent 18e0069 commit 36aa7a3
Show file tree
Hide file tree
Showing 10 changed files with 435 additions and 345 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
__pycache__/
*.py[cod]
*$py.class
*.DS_Store

# C extensions
*.so
Expand Down
207 changes: 204 additions & 3 deletions pydarnio/borealis/base_format.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,12 +37,12 @@
"""

import copy
import h5py
import numpy as np

from collections import OrderedDict
from datetime import datetime
from typing import Callable, List
import h5py

from pydarnio import borealis_exceptions

Expand Down Expand Up @@ -1102,15 +1102,15 @@ class methods used inside this method should be specific
datatype = cls.single_element_types()[field]
else: # field in array_dtypes
datatype = cls.array_dtypes()[field]
if datatype == np.unicode_:
if datatype == str:
# unicode type needs to be explicitly set to have
# multiple chars (256)
datatype='|U256'
empty_array = np.empty(array_dims, dtype=datatype)
# Some indices may not be filled due to dimensions that are maximum values (num_sequences, etc. can change
# between records), so they are initialized with a known value first.
# Initialize floating-point values to NaN, and integer values to -1.
if datatype is np.int64 or datatype is np.uint32:
if datatype in [np.int64, np.uint32, np.uint8]:
empty_array[:] = -1
else:
empty_array[:] = np.NaN
Expand Down Expand Up @@ -1229,6 +1229,207 @@ class methods used inside this method should be specific

return timestamp_dict

@classmethod
def read_records(cls, filename: str) -> OrderedDict:
    """
    Base function for reading in a Borealis site file.

    Parameters
    ----------
    filename: str
        Name of the file to load records from

    Returns
    -------
    OrderedDict
        a dict of timestamped records loaded from an hdf5 Borealis site file

    Raises
    ------
    OSError: file does not exist

    Notes
    -----
    The results will differ based on the format class, as many of the
    class methods used inside this method should be specific
    to the format and updated in the child class.
    """
    records = OrderedDict()
    with h5py.File(filename, 'r') as f:
        record_keys = sorted(list(f.keys()))
        for rec_key in record_keys:
            rec_dict = {}
            group = f[rec_key]

            # Get the datasets (vector fields)
            datasets = list(group.keys())
            for dset_name in datasets:
                dset = group[dset_name]
                if 'strtype' in dset.attrs:  # string type, requires some handling
                    itemsize = dset.attrs['itemsize']
                    # Strings are stored on disk as raw uint8; reinterpret as
                    # fixed-width unicode. np.str_ is used instead of
                    # np.unicode_, which was removed in NumPy 2.0.
                    data = dset[:].view(dtype=(np.str_, itemsize))
                else:
                    data = dset[:]  # non-string, can simply load
                rec_dict[dset_name] = data

            # Get the attributes (scalar fields)
            attribute_dict = {}
            for k, v in group.attrs.items():
                # Boilerplate attributes written by deepdish/pytables in
                # older files; not real data fields.
                if k in ['CLASS', 'TITLE', 'VERSION', 'DEEPDISH_IO_VERSION', 'PYTABLES_FORMAT_VERSION']:
                    continue
                elif isinstance(v, bytes):
                    # decode() works for both plain bytes and np.bytes_;
                    # plain bytes has no tobytes() method, so calling
                    # v.tobytes() here would raise AttributeError.
                    attribute_dict[k] = v.decode('utf-8')
                elif isinstance(v, h5py.Empty):
                    # Empty attribute: materialize the dtype's default value
                    dtype = v.dtype.type
                    data = dtype()
                    if isinstance(data, bytes):
                        data = data.decode('utf-8')
                    attribute_dict[k] = data
                else:
                    attribute_dict[k] = v
            rec_dict.update(attribute_dict)

            records[rec_key] = rec_dict

    return records

@classmethod
def read_arrays(cls, filename: str) -> OrderedDict:
    """
    Base function for reading in a Borealis array file.

    Parameters
    ----------
    filename: str
        Name of the file to load arrays from

    Returns
    -------
    OrderedDict
        a dict of arrays loaded from an hdf5 Borealis array file

    Raises
    ------
    OSError: file does not exist

    Notes
    -----
    The results will differ based on the format class, as many of the
    class methods used inside this method should be specific
    to the format and updated in the child class.
    """
    arrays = OrderedDict()
    with h5py.File(filename, 'r') as f:

        # Get the datasets (vector fields)
        array_names = sorted(list(f.keys()))
        for array_name in array_names:
            dset = f[array_name]
            if 'strtype' in dset.attrs:  # string type, requires some handling
                itemsize = dset.attrs['itemsize']
                # Strings are stored on disk as raw uint8; reinterpret as
                # fixed-width unicode. np.str_ is used instead of
                # np.unicode_, which was removed in NumPy 2.0.
                data = dset[:].view(dtype=(np.str_, itemsize))
            else:
                data = dset[:]  # non-string, can simply load
            arrays[array_name] = data

        # Get the attributes (scalar fields)
        attribute_dict = {}
        for k, v in f.attrs.items():
            # Boilerplate attributes written by deepdish/pytables in
            # older files; not real data fields.
            if k in ['CLASS', 'TITLE', 'VERSION', 'DEEPDISH_IO_VERSION', 'PYTABLES_FORMAT_VERSION']:
                continue
            elif isinstance(v, bytes):
                # decode() works for both plain bytes and np.bytes_;
                # plain bytes has no tobytes() method, so calling
                # v.tobytes() here would raise AttributeError.
                attribute_dict[k] = v.decode('utf-8')
            elif isinstance(v, h5py.Empty):
                # Empty attribute: materialize the dtype's default value
                dtype = v.dtype.type
                data = dtype()
                if isinstance(data, bytes):
                    data = data.decode('utf-8')
                attribute_dict[k] = data
            else:
                attribute_dict[k] = v
        arrays.update(attribute_dict)

    return arrays

@classmethod
def write_records(cls, filename: str, records: OrderedDict, attribute_types: dict,
                  dataset_types: dict, compression: str):
    """
    Write records to an HDF5 file in site style.

    Each record becomes an HDF5 group named by its timestamp key. Scalar
    fields (those listed in attribute_types) are stored as group
    attributes; array fields become datasets within the group.

    Parameters
    ----------
    filename: str
        Name of the file to write to.
    records: OrderedDict
        Dictionary containing site-formatted fields to write to file.
    attribute_types: dict
        Dictionary with the required types for the attributes in the file.
    dataset_types: dict
        Dictionary with the required dtypes for the numpy arrays in the
        file.
    compression: str
        Type of compression to use for the HDF5 file.
    """
    with h5py.File(filename, 'a') as f:
        for group_name, group_dict in records.items():
            group = f.create_group(str(group_name))
            for field, value in group_dict.items():
                if field in attribute_types.keys():
                    # Scalar field -> group attribute; strings must be
                    # stored as bytes.
                    if isinstance(value, str):
                        group.attrs[field] = np.bytes_(value)
                    else:
                        group.attrs[field] = value
                    continue
                if value.dtype.type == np.str_:
                    # Unicode arrays are written as their raw bytes, with
                    # metadata attributes so readers can reconstruct them.
                    num_chars = value.dtype.itemsize // 4  # every character is 4 bytes
                    dset = group.create_dataset(field, data=value.view(dtype=(np.uint8)),
                                                compression=compression)
                    dset.attrs['strtype'] = b'unicode'
                    dset.attrs['itemsize'] = num_chars
                else:
                    group.create_dataset(field, data=value, compression=compression)

@classmethod
def write_arrays(cls, filename: str, arrays: OrderedDict, attribute_types: dict,
                 dataset_types: dict, unshared_fields: List[str], compression: str):
    """
    Write array-restructured fields to an HDF5 file.

    Scalar fields (those listed in attribute_types) are stored as
    file-level attributes; array fields become top-level datasets.

    Parameters
    ----------
    filename: str
        Name of the file to write to.
    arrays: OrderedDict
        Dictionary containing array-formatted fields to write to file.
    attribute_types: dict
        Dictionary with the required types for the attributes in the file.
    dataset_types: dict
        Dictionary with the required dtypes for the numpy arrays in the
        file.
    unshared_fields: List[str]
        List of fields that are not shared between the records and
        therefore should be an array with first dimension = number of
        records
    compression: str
        Type of compression to use for the HDF5 file.
    """
    with h5py.File(filename, 'a') as f:
        for field, value in arrays.items():
            if field in attribute_types:
                # Scalar field -> file attribute; strings must be stored
                # as bytes.
                if isinstance(value, str):
                    f.attrs[field] = np.bytes_(value)
                else:
                    f.attrs[field] = value
                continue
            if value.dtype.type == np.str_:
                # Unicode arrays are written as their raw bytes, with
                # metadata attributes so readers can reconstruct them.
                num_chars = value.dtype.itemsize // 4  # every character is 4 bytes
                dset = f.create_dataset(field, data=value.view(dtype=(np.uint8)),
                                        compression=compression)
                dset.attrs['strtype'] = b'unicode'
                dset.attrs['itemsize'] = num_chars
            else:
                f.create_dataset(field, data=value, compression=compression)

# STATIC METHODS COMMON ACROSS FORMATS
# i.e. common methods that can be used by multiple formats in restructuring
# (generally these will be used in the unshared fields dims for arrays)
Expand Down
106 changes: 21 additions & 85 deletions pydarnio/borealis/borealis_array.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,8 +37,9 @@
For more information on Borealis data files and how they convert to SDarn
files, see: https://borealis.readthedocs.io/en/latest/
"""
import deepdish as dd
import h5py
import logging
import numpy as np

from typing import List

Expand Down Expand Up @@ -115,10 +116,10 @@ def __init__(self, filename: str, borealis_filetype: str):
# get the version of the file - split by the dash, first part should be
# 'vX.X'
try:
version = dd.io.load(self.filename,
group='/borealis_git_hash').split('-')[0]
version = '.'.join(version.split('.')[:2]) # vX.Y, ignore patch revision
except ValueError as err:
with h5py.File(self.filename, 'r') as f:
full_version = f.attrs['borealis_git_hash'].decode('utf-8').split('-')[0]
version = '.'.join(full_version.split('.')[:2]) # vX.Y, ignore patch revision
except KeyError as err:
raise borealis_exceptions.BorealisStructureError(
' {} Could not find the borealis_git_hash required to '
'determine read version (file may be site style) {}'
Expand Down Expand Up @@ -242,49 +243,14 @@ def read_file(self) -> dict:
dataset_types = self.format.array_array_dtypes()
unshared_fields = self.format.unshared_fields()

self._read_borealis_arrays(attribute_types, dataset_types,
unshared_fields)
return self._arrays

def _read_borealis_arrays(self, attribute_types: dict,
dataset_types: dict,
unshared_fields: List[str]):
"""
Read the entire file while checking all data fields.
Parameters
----------
attribute_types: dict
Dictionary with the required types for the attributes in the file.
dataset_types: dict
Dictionary with the require dtypes for the numpy arrays in the
file.
unshared_fields: List[str]
List of fields that are not shared between the records and
therefore should be an array with first dimension = number of
records
Raises
------
BorealisFieldMissingError - when a field is missing from the Borealis
file
BorealisExtraFieldError - when an extra field is present in the
Borealis file
BorealisDataFormatTypeError - when a field has the incorrect
field type for the Borealis file
BorealisNumberOfRecordsError - when the number of records cannot
be discerned from the arrays
See Also
--------
BorealisUtilities
"""
arrays = dd.io.load(self.filename)
arrays = self.format.read_arrays(self.filename)
BorealisUtilities.check_arrays(self.filename, arrays,
attribute_types, dataset_types,
unshared_fields)
self._arrays = arrays

return self._arrays


class BorealisArrayWrite():
"""
Expand Down Expand Up @@ -465,7 +431,14 @@ def write_file(self) -> str:
Raises
------
BorealisFileTypeError
BorealisFieldMissingError - when a field is missing from the Borealis
file
BorealisExtraFieldError - when an extra field is present in the
Borealis file
BorealisDataFormatTypeError - when a field has the incorrect
field type for the Borealis file
BorealisNumberOfRecordsError - when the number of records cannot
be discerned from the arrays
See Also
--------
Expand All @@ -479,45 +452,8 @@ def write_file(self) -> str:
attribute_types = self.format.array_single_element_types()
dataset_types = self.format.array_array_dtypes()
unshared_fields = self.format.unshared_fields()

self._write_borealis_arrays(attribute_types, dataset_types,
unshared_fields)
BorealisUtilities.check_arrays(self.filename, self.arrays, attribute_types,
dataset_types, unshared_fields)
self.format.write_arrays(self.filename, self.arrays, attribute_types,
dataset_types, unshared_fields, self.compression)
return self.filename

def _write_borealis_arrays(self, attribute_types: dict,
                           dataset_types: dict,
                           unshared_fields: List[str]):
    """
    Write the entire file while checking all data fields.

    Validates all fields via BorealisUtilities, then writes
    self.arrays to self.filename using deepdish.

    Parameters
    ----------
    attribute_types: dict
        Dictionary with the required types for the attributes in the file.
    dataset_types: dict
        Dictionary with the required dtypes for the numpy arrays in the
        file.
    unshared_fields: List[str]
        List of fields that are not shared between the records and
        therefore should be an array with first dimension = number of
        records

    Raises
    ------
    BorealisFieldMissingError - when a field is missing from the Borealis
        file
    BorealisExtraFieldError - when an extra field is present in the
        Borealis file
    BorealisDataFormatTypeError - when a field has the incorrect
        field type for the Borealis file
    BorealisNumberOfRecordsError - when the number of records cannot
        be discerned from the arrays

    See Also
    --------
    BorealisUtilities
    """
    # Validate before writing so an invalid file is never produced.
    BorealisUtilities.check_arrays(self.filename, self.arrays,
                                   attribute_types, dataset_types,
                                   unshared_fields)
    # NOTE(review): deepdish (dd) is unmaintained; this commit replaces
    # this path with h5py-based writing in base_format.write_arrays.
    dd.io.save(self.filename, self.arrays, compression=self.compression)
Loading

0 comments on commit 36aa7a3

Please sign in to comment.