Skip to content

Commit

Permalink
DEP: Deepdish to h5py (#60)
Browse files Browse the repository at this point in the history
Use h5py for all HDF5 file I/O. The deepdish package is no longer maintained, and h5py is managed by the HDF5 group so is not at risk of deprecation.
  • Loading branch information
carleyjmartin authored Mar 23, 2023
1 parent 18e0069 commit 36aa7a3
Show file tree
Hide file tree
Showing 10 changed files with 435 additions and 345 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
__pycache__/
*.py[cod]
*$py.class
*.DS_Store

# C extensions
*.so
Expand Down
207 changes: 204 additions & 3 deletions pydarnio/borealis/base_format.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,12 +37,12 @@
"""

import copy
import h5py
import numpy as np

from collections import OrderedDict
from datetime import datetime
from typing import Callable, List
import h5py

from pydarnio import borealis_exceptions

Expand Down Expand Up @@ -1102,15 +1102,15 @@ class methods used inside this method should be specific
datatype = cls.single_element_types()[field]
else: # field in array_dtypes
datatype = cls.array_dtypes()[field]
if datatype == np.unicode_:
if datatype == str:
# unicode type needs to be explicitly set to have
# multiple chars (256)
datatype='|U256'
empty_array = np.empty(array_dims, dtype=datatype)
# Some indices may not be filled due to dimensions that are maximum values (num_sequences, etc. can change
# between records), so they are initialized with a known value first.
# Initialize floating-point values to NaN, and integer values to -1.
if datatype is np.int64 or datatype is np.uint32:
if datatype in [np.int64, np.uint32, np.uint8]:
empty_array[:] = -1
else:
empty_array[:] = np.NaN
Expand Down Expand Up @@ -1229,6 +1229,207 @@ class methods used inside this method should be specific

return timestamp_dict

@classmethod
def read_records(cls, filename: str) -> OrderedDict:
    """
    Base function for reading in a Borealis site file.

    Parameters
    ----------
    filename: str
        Name of the file to load records from

    Returns
    -------
    OrderedDict
        a dict of timestamped records loaded from an hdf5 Borealis site file

    Raises
    ------
    OSError: file does not exist

    Notes
    -----
    The results will differ based on the format class, as many of the
    class methods used inside this method should be specific
    to the format and updated in the child class.
    """
    records = OrderedDict()
    with h5py.File(filename, 'r') as f:
        record_keys = sorted(list(f.keys()))
        for rec_key in record_keys:
            rec_dict = {}
            group = f[rec_key]

            # Get the datasets (vector fields)
            datasets = list(group.keys())
            for dset_name in datasets:
                dset = group[dset_name]
                if 'strtype' in dset.attrs:  # string type, requires some handling
                    itemsize = dset.attrs['itemsize']
                    # Strings are stored on disk as raw uint8; reinterpret as
                    # fixed-width unicode. np.str_ is used instead of
                    # np.unicode_, which was removed in NumPy 2.0.
                    data = dset[:].view(dtype=(np.str_, itemsize))
                else:
                    data = dset[:]  # non-string, can simply load
                rec_dict[dset_name] = data

            # Get the attributes (scalar fields)
            attribute_dict = {}
            for k, v in group.attrs.items():
                # Boilerplate attributes written by deepdish/pytables in
                # older files; not real data fields.
                if k in ['CLASS', 'TITLE', 'VERSION', 'DEEPDISH_IO_VERSION', 'PYTABLES_FORMAT_VERSION']:
                    continue
                elif isinstance(v, bytes):
                    # decode() works for both plain bytes and np.bytes_;
                    # plain bytes has no tobytes() method, so calling
                    # v.tobytes() here would raise AttributeError.
                    attribute_dict[k] = v.decode('utf-8')
                elif isinstance(v, h5py.Empty):
                    # Empty attribute: materialize the dtype's default value
                    dtype = v.dtype.type
                    data = dtype()
                    if isinstance(data, bytes):
                        data = data.decode('utf-8')
                    attribute_dict[k] = data
                else:
                    attribute_dict[k] = v
            rec_dict.update(attribute_dict)

            records[rec_key] = rec_dict

    return records

@classmethod
def read_arrays(cls, filename: str) -> OrderedDict:
    """
    Base function for reading in a Borealis array file.

    Parameters
    ----------
    filename: str
        Name of the file to load arrays from

    Returns
    -------
    OrderedDict
        a dict of arrays loaded from an hdf5 Borealis array file

    Raises
    ------
    OSError: file does not exist

    Notes
    -----
    The results will differ based on the format class, as many of the
    class methods used inside this method should be specific
    to the format and updated in the child class.
    """
    arrays = OrderedDict()
    with h5py.File(filename, 'r') as f:

        # Get the datasets (vector fields)
        array_names = sorted(list(f.keys()))
        for array_name in array_names:
            dset = f[array_name]
            if 'strtype' in dset.attrs:  # string type, requires some handling
                itemsize = dset.attrs['itemsize']
                # Strings are stored on disk as raw uint8; reinterpret as
                # fixed-width unicode. np.str_ is used instead of
                # np.unicode_, which was removed in NumPy 2.0.
                data = dset[:].view(dtype=(np.str_, itemsize))
            else:
                data = dset[:]  # non-string, can simply load
            arrays[array_name] = data

        # Get the attributes (scalar fields)
        attribute_dict = {}
        for k, v in f.attrs.items():
            # Boilerplate attributes written by deepdish/pytables in
            # older files; not real data fields.
            if k in ['CLASS', 'TITLE', 'VERSION', 'DEEPDISH_IO_VERSION', 'PYTABLES_FORMAT_VERSION']:
                continue
            elif isinstance(v, bytes):
                # decode() works for both plain bytes and np.bytes_;
                # plain bytes has no tobytes() method, so calling
                # v.tobytes() here would raise AttributeError.
                attribute_dict[k] = v.decode('utf-8')
            elif isinstance(v, h5py.Empty):
                # Empty attribute: materialize the dtype's default value
                dtype = v.dtype.type
                data = dtype()
                if isinstance(data, bytes):
                    data = data.decode('utf-8')
                attribute_dict[k] = data
            else:
                attribute_dict[k] = v
        arrays.update(attribute_dict)

    return arrays

@classmethod
def write_records(cls, filename: str, records: OrderedDict, attribute_types: dict,
                  dataset_types: dict, compression: str):
    """
    Write records to an HDF5 file in site style.

    Each record becomes an HDF5 group named by its timestamp key. Scalar
    fields (those listed in attribute_types) are stored as group
    attributes; array fields become datasets within the group.

    Parameters
    ----------
    filename: str
        Name of the file to write to.
    records: OrderedDict
        Dictionary containing site-formatted fields to write to file.
    attribute_types: dict
        Dictionary with the required types for the attributes in the file.
    dataset_types: dict
        Dictionary with the required dtypes for the numpy arrays in the
        file.
    compression: str
        Type of compression to use for the HDF5 file.
    """
    with h5py.File(filename, 'a') as f:
        for group_name, group_dict in records.items():
            group = f.create_group(str(group_name))
            for field, value in group_dict.items():
                if field in attribute_types.keys():
                    # Scalar field -> group attribute; strings must be
                    # stored as bytes.
                    if isinstance(value, str):
                        group.attrs[field] = np.bytes_(value)
                    else:
                        group.attrs[field] = value
                    continue
                if value.dtype.type == np.str_:
                    # Unicode arrays are written as their raw bytes, with
                    # metadata attributes so readers can reconstruct them.
                    num_chars = value.dtype.itemsize // 4  # every character is 4 bytes
                    dset = group.create_dataset(field, data=value.view(dtype=(np.uint8)),
                                                compression=compression)
                    dset.attrs['strtype'] = b'unicode'
                    dset.attrs['itemsize'] = num_chars
                else:
                    group.create_dataset(field, data=value, compression=compression)

@classmethod
def write_arrays(cls, filename: str, arrays: OrderedDict, attribute_types: dict,
                 dataset_types: dict, unshared_fields: List[str], compression: str):
    """
    Write array-restructured fields to an HDF5 file.

    Scalar fields (those listed in attribute_types) are stored as
    file-level attributes; array fields become top-level datasets.

    Parameters
    ----------
    filename: str
        Name of the file to write to.
    arrays: OrderedDict
        Dictionary containing array-formatted fields to write to file.
    attribute_types: dict
        Dictionary with the required types for the attributes in the file.
    dataset_types: dict
        Dictionary with the required dtypes for the numpy arrays in the
        file.
    unshared_fields: List[str]
        List of fields that are not shared between the records and
        therefore should be an array with first dimension = number of
        records
    compression: str
        Type of compression to use for the HDF5 file.
    """
    with h5py.File(filename, 'a') as f:
        for field, value in arrays.items():
            if field in attribute_types:
                # Scalar field -> file attribute; strings must be stored
                # as bytes.
                if isinstance(value, str):
                    f.attrs[field] = np.bytes_(value)
                else:
                    f.attrs[field] = value
                continue
            if value.dtype.type == np.str_:
                # Unicode arrays are written as their raw bytes, with
                # metadata attributes so readers can reconstruct them.
                num_chars = value.dtype.itemsize // 4  # every character is 4 bytes
                dset = f.create_dataset(field, data=value.view(dtype=(np.uint8)),
                                        compression=compression)
                dset.attrs['strtype'] = b'unicode'
                dset.attrs['itemsize'] = num_chars
            else:
                f.create_dataset(field, data=value, compression=compression)

# STATIC METHODS COMMON ACROSS FORMATS
# i.e. common methods that can be used by multiple formats in restructuring
# (generally these will be used in the unshared fields dims for arrays)
Expand Down
106 changes: 21 additions & 85 deletions pydarnio/borealis/borealis_array.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,8 +37,9 @@
For more information on Borealis data files and how they convert to SDarn
files, see: https://borealis.readthedocs.io/en/latest/
"""
import deepdish as dd
import h5py
import logging
import numpy as np

from typing import List

Expand Down Expand Up @@ -115,10 +116,10 @@ def __init__(self, filename: str, borealis_filetype: str):
# get the version of the file - split by the dash, first part should be
# 'vX.X'
try:
version = dd.io.load(self.filename,
group='/borealis_git_hash').split('-')[0]
version = '.'.join(version.split('.')[:2]) # vX.Y, ignore patch revision
except ValueError as err:
with h5py.File(self.filename, 'r') as f:
full_version = f.attrs['borealis_git_hash'].decode('utf-8').split('-')[0]
version = '.'.join(full_version.split('.')[:2]) # vX.Y, ignore patch revision
except KeyError as err:
raise borealis_exceptions.BorealisStructureError(
' {} Could not find the borealis_git_hash required to '
'determine read version (file may be site style) {}'
Expand Down Expand Up @@ -242,49 +243,14 @@ def read_file(self) -> dict:
dataset_types = self.format.array_array_dtypes()
unshared_fields = self.format.unshared_fields()

self._read_borealis_arrays(attribute_types, dataset_types,
unshared_fields)
return self._arrays

def _read_borealis_arrays(self, attribute_types: dict,
dataset_types: dict,
unshared_fields: List[str]):
"""
Read the entire file while checking all data fields.
Parameters
----------
attribute_types: dict
Dictionary with the required types for the attributes in the file.
dataset_types: dict
Dictionary with the require dtypes for the numpy arrays in the
file.
unshared_fields: List[str]
List of fields that are not shared between the records and
therefore should be an array with first dimension = number of
records
Raises
------
BorealisFieldMissingError - when a field is missing from the Borealis
file
BorealisExtraFieldError - when an extra field is present in the
Borealis file
BorealisDataFormatTypeError - when a field has the incorrect
field type for the Borealis file
BorealisNumberOfRecordsError - when the number of records cannot
be discerned from the arrays
See Also
--------
BorealisUtilities
"""
arrays = dd.io.load(self.filename)
arrays = self.format.read_arrays(self.filename)
BorealisUtilities.check_arrays(self.filename, arrays,
attribute_types, dataset_types,
unshared_fields)
self._arrays = arrays

return self._arrays


class BorealisArrayWrite():
"""
Expand Down Expand Up @@ -465,7 +431,14 @@ def write_file(self) -> str:
Raises
------
BorealisFileTypeError
BorealisFieldMissingError - when a field is missing from the Borealis
file
BorealisExtraFieldError - when an extra field is present in the
Borealis file
BorealisDataFormatTypeError - when a field has the incorrect
field type for the Borealis file
BorealisNumberOfRecordsError - when the number of records cannot
be discerned from the arrays
See Also
--------
Expand All @@ -479,45 +452,8 @@ def write_file(self) -> str:
attribute_types = self.format.array_single_element_types()
dataset_types = self.format.array_array_dtypes()
unshared_fields = self.format.unshared_fields()

self._write_borealis_arrays(attribute_types, dataset_types,
unshared_fields)
BorealisUtilities.check_arrays(self.filename, self.arrays, attribute_types,
dataset_types, unshared_fields)
self.format.write_arrays(self.filename, self.arrays, attribute_types,
dataset_types, unshared_fields, self.compression)
return self.filename

def _write_borealis_arrays(self, attribute_types: dict,
                           dataset_types: dict,
                           unshared_fields: List[str]):
    """
    Write the entire file while checking all data fields.

    Validates all fields via BorealisUtilities, then writes
    self.arrays to self.filename using deepdish.

    Parameters
    ----------
    attribute_types: dict
        Dictionary with the required types for the attributes in the file.
    dataset_types: dict
        Dictionary with the required dtypes for the numpy arrays in the
        file.
    unshared_fields: List[str]
        List of fields that are not shared between the records and
        therefore should be an array with first dimension = number of
        records

    Raises
    ------
    BorealisFieldMissingError - when a field is missing from the Borealis
        file
    BorealisExtraFieldError - when an extra field is present in the
        Borealis file
    BorealisDataFormatTypeError - when a field has the incorrect
        field type for the Borealis file
    BorealisNumberOfRecordsError - when the number of records cannot
        be discerned from the arrays

    See Also
    --------
    BorealisUtilities
    """
    # Validate before writing so an invalid file is never produced.
    BorealisUtilities.check_arrays(self.filename, self.arrays,
                                   attribute_types, dataset_types,
                                   unshared_fields)
    # NOTE(review): deepdish (dd) is unmaintained; this commit replaces
    # this path with h5py-based writing in base_format.write_arrays.
    dd.io.save(self.filename, self.arrays, compression=self.compression)
Loading

0 comments on commit 36aa7a3

Please sign in to comment.