Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

DEP: Deepdish to h5py #60

Merged
merged 19 commits into from
Mar 23, 2023
Merged
Show file tree
Hide file tree
Changes from 18 commits
Commits
Show all changes
19 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
__pycache__/
*.py[cod]
*$py.class
*.DS_Store

# C extensions
*.so
Expand Down
207 changes: 204 additions & 3 deletions pydarnio/borealis/base_format.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,12 +37,12 @@
"""

import copy
import h5py
import numpy as np

from collections import OrderedDict
from datetime import datetime
from typing import Callable, List
import h5py

from pydarnio import borealis_exceptions

Expand Down Expand Up @@ -1102,15 +1102,15 @@ class methods used inside this method should be specific
datatype = cls.single_element_types()[field]
else: # field in array_dtypes
datatype = cls.array_dtypes()[field]
if datatype == np.unicode_:
if datatype == str:
# unicode type needs to be explicitly set to have
# multiple chars (256)
datatype='|U256'
empty_array = np.empty(array_dims, dtype=datatype)
# Some indices may not be filled due to dimensions that are maximum values (num_sequences, etc. can change
# between records), so they are initialized with a known value first.
# Initialize floating-point values to NaN, and integer values to -1.
if datatype is np.int64 or datatype is np.uint32:
if datatype in [np.int64, np.uint32, np.uint8]:
empty_array[:] = -1
else:
empty_array[:] = np.NaN
Expand Down Expand Up @@ -1229,6 +1229,207 @@ class methods used inside this method should be specific

return timestamp_dict

@classmethod
def read_records(cls, filename: str) -> OrderedDict:
    """
    Base function for reading in a Borealis site file.

    Every top-level HDF5 group in the file is treated as one record. The
    datasets of the group become the array fields of the record and the
    attributes of the group become its scalar fields.

    Parameters
    ----------
    filename: str
        Name of the file to load records from

    Returns
    -------
    OrderedDict
        a dict of records loaded from an hdf5 Borealis site file, keyed
        by group name and ordered by sorted group name. Each value is a
        plain dict mapping field name to value.

    Raises
    ------
    OSError: file does not exist

    Notes
    -----
    Datasets carrying a 'strtype' attribute hold unicode strings stored
    as raw uint8 bytes; they are viewed back as fixed-width unicode
    using the character count stored in their 'itemsize' attribute.
    """
    # Bookkeeping attributes that are not Borealis fields (the names suggest
    # they are left behind by deepdish/PyTables-written files).
    skipped_attrs = ('CLASS', 'TITLE', 'VERSION', 'DEEPDISH_IO_VERSION',
                     'PYTABLES_FORMAT_VERSION')

    records = OrderedDict()
    with h5py.File(filename, 'r') as f:
        for rec_key in sorted(f.keys()):
            group = f[rec_key]
            rec_dict = {}

            # Get the datasets (vector fields)
            for dset_name, dset in group.items():
                if 'strtype' in dset.attrs:     # string type, requires some handling
                    # np.str_ rather than np.unicode_: the latter alias was
                    # removed in NumPy 2.0.
                    itemsize = int(dset.attrs['itemsize'])
                    data = dset[:].view(dtype=(np.str_, itemsize))
                else:
                    data = dset[:]      # non-string, can simply load
                rec_dict[dset_name] = data

            # Get the attributes (scalar fields)
            for k, v in group.attrs.items():
                if k in skipped_attrs:
                    continue
                if isinstance(v, bytes):
                    # Covers both plain bytes and np.bytes_ (a bytes
                    # subclass); plain bytes has no .tobytes() method.
                    rec_dict[k] = v.decode('utf-8')
                elif isinstance(v, h5py.Empty):
                    # Empty attribute: substitute the default value of its
                    # dtype (e.g. 0, 0.0 or an empty string).
                    data = v.dtype.type()
                    if isinstance(data, bytes):
                        data = data.decode('utf-8')
                    rec_dict[k] = data
                else:
                    rec_dict[k] = v

            records[rec_key] = rec_dict

    return records

@classmethod
def read_arrays(cls, filename: str) -> OrderedDict:
    """
    Base function for reading in a Borealis array file.

    The datasets at the root of the file become the array fields and the
    attributes of the root become the scalar fields.

    Parameters
    ----------
    filename: str
        Name of the file to load arrays from

    Returns
    -------
    OrderedDict
        a dict of fields loaded from an hdf5 Borealis array file. Datasets
        come first, in sorted name order, followed by the root attributes.

    Raises
    ------
    OSError: file does not exist

    Notes
    -----
    Datasets carrying a 'strtype' attribute hold unicode strings stored
    as raw uint8 bytes; they are viewed back as fixed-width unicode
    using the character count stored in their 'itemsize' attribute.
    """
    # Bookkeeping attributes that are not Borealis fields (the names suggest
    # they are left behind by deepdish/PyTables-written files).
    skipped_attrs = ('CLASS', 'TITLE', 'VERSION', 'DEEPDISH_IO_VERSION',
                     'PYTABLES_FORMAT_VERSION')

    arrays = OrderedDict()
    with h5py.File(filename, 'r') as f:

        # Get the datasets (vector fields)
        for array_name in sorted(f.keys()):
            dset = f[array_name]
            if 'strtype' in dset.attrs:     # string type, requires some handling
                # np.str_ rather than np.unicode_: the latter alias was
                # removed in NumPy 2.0.
                itemsize = int(dset.attrs['itemsize'])
                data = dset[:].view(dtype=(np.str_, itemsize))
            else:
                data = dset[:]      # non-string, can simply load
            arrays[array_name] = data

        # Get the attributes (scalar fields)
        for k, v in f.attrs.items():
            if k in skipped_attrs:
                continue
            if isinstance(v, bytes):
                # Covers both plain bytes and np.bytes_ (a bytes subclass);
                # plain bytes has no .tobytes() method.
                arrays[k] = v.decode('utf-8')
            elif isinstance(v, h5py.Empty):
                # Empty attribute: substitute the default value of its
                # dtype (e.g. 0, 0.0 or an empty string).
                data = v.dtype.type()
                if isinstance(data, bytes):
                    data = data.decode('utf-8')
                arrays[k] = data
            else:
                arrays[k] = v

    return arrays

@classmethod
def write_records(cls, filename: str, records: OrderedDict, attribute_types: dict,
                  dataset_types: dict, compression: str):
    """
    Write records to file in site style, one HDF5 group per record.

    This method performs no validation of the fields itself; any field
    checks must be done before calling it.

    Parameters
    ----------
    filename: str
        Name of the file to write to. The file is opened in append mode,
        so existing content is kept and a new group is created for every
        record.
    records: OrderedDict
        Dictionary containing site-formatted fields to write to file.
        Each key is used (as a string) for the group name and each value
        is a dict of the fields of that record.
    attribute_types: dict
        Dictionary with the required types for the attributes in the file.
        Only its keys are used here: a field named in it is written as a
        group attribute, every other field is written as a dataset.
    dataset_types: dict
        Dictionary with the required dtypes for the numpy arrays in the
        file. Not used by this method.
    compression: str
        Type of compression to use for the HDF5 datasets.

    Notes
    -----
    Unicode string arrays are stored as raw uint8 bytes, tagged with a
    'strtype' attribute and an 'itemsize' attribute holding the number of
    characters per element.
    """
    with h5py.File(filename, 'a') as f:
        for group_name, group_dict in records.items():
            group = f.create_group(str(group_name))
            for k, v in group_dict.items():
                if k in attribute_types:
                    # Scalar field: strings are stored as fixed-length bytes
                    if isinstance(v, str):
                        group.attrs[k] = np.bytes_(v)
                    else:
                        group.attrs[k] = v
                elif v.dtype.type == np.str_:
                    # numpy stores unicode as 4 bytes per character
                    itemsize = v.dtype.itemsize // 4
                    dset = group.create_dataset(k, data=v.view(dtype=np.uint8),
                                                compression=compression)
                    dset.attrs['strtype'] = b'unicode'
                    dset.attrs['itemsize'] = itemsize
                else:
                    group.create_dataset(k, data=v, compression=compression)

@classmethod
def write_arrays(cls, filename: str, arrays: OrderedDict, attribute_types: dict,
                 dataset_types: dict, unshared_fields: List[str], compression: str):
    """
    Write array-style fields to the root of an HDF5 file.

    This method performs no validation of the fields itself.

    Parameters
    ----------
    filename: str
        Name of the file to write to. The file is opened in append mode.
    arrays: OrderedDict
        Dictionary containing array-formatted fields to write to file.
    attribute_types: dict
        Dictionary with the required types for the attributes in the file.
        Only its keys are used here: a field named in it is written as a
        root attribute, every other field is written as a dataset.
    dataset_types: dict
        Dictionary with the required dtypes for the numpy arrays in the
        file. Not used by this method.
    unshared_fields: List[str]
        List of fields that are not shared between the records and
        therefore should be an array with first dimension = number of
        records. Not used by this method.
    compression: str
        Type of compression to use for the HDF5 datasets.

    Notes
    -----
    Unicode string arrays are stored as raw uint8 bytes, tagged with a
    'strtype' attribute and an 'itemsize' attribute holding the number of
    characters per element.
    """
    with h5py.File(filename, 'a') as hdf5_file:
        for name, value in arrays.items():
            # Scalar fields go to the root attributes; str becomes bytes
            if name in attribute_types:
                hdf5_file.attrs[name] = np.bytes_(value) if isinstance(value, str) else value
                continue

            # Non-string arrays can be handed to h5py unchanged
            if value.dtype.type != np.str_:
                hdf5_file.create_dataset(name, data=value, compression=compression)
                continue

            # Unicode arrays: numpy uses 4 bytes per character, so record the
            # character count and store the underlying bytes.
            chars_per_item = value.dtype.itemsize // 4
            raw_bytes = value.view(dtype=(np.uint8))
            dset = hdf5_file.create_dataset(name, data=raw_bytes, compression=compression)
            dset.attrs['strtype'] = b'unicode'
            dset.attrs['itemsize'] = chars_per_item

# STATIC METHODS COMMON ACROSS FORMATS
# i.e. common methods that can be used by multiple formats in restructuring
# (generally these will be used in the unshared fields dims for arrays)
Expand Down
106 changes: 21 additions & 85 deletions pydarnio/borealis/borealis_array.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,8 +37,9 @@
For more information on Borealis data files and how they convert to SDarn
files, see: https://borealis.readthedocs.io/en/latest/
"""
import deepdish as dd
import h5py
import logging
import numpy as np
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Numpy not used in file?


from typing import List

Expand Down Expand Up @@ -115,10 +116,10 @@ def __init__(self, filename: str, borealis_filetype: str):
# get the version of the file - split by the dash, first part should be
# 'vX.X'
try:
version = dd.io.load(self.filename,
group='/borealis_git_hash').split('-')[0]
version = '.'.join(version.split('.')[:2]) # vX.Y, ignore patch revision
except ValueError as err:
with h5py.File(self.filename, 'r') as f:
full_version = f.attrs['borealis_git_hash'].decode('utf-8').split('-')[0]
version = '.'.join(full_version.split('.')[:2]) # vX.Y, ignore patch revision
except KeyError as err:
raise borealis_exceptions.BorealisStructureError(
' {} Could not find the borealis_git_hash required to '
'determine read version (file may be site style) {}'
Expand Down Expand Up @@ -242,49 +243,14 @@ def read_file(self) -> dict:
dataset_types = self.format.array_array_dtypes()
unshared_fields = self.format.unshared_fields()

self._read_borealis_arrays(attribute_types, dataset_types,
unshared_fields)
return self._arrays

def _read_borealis_arrays(self, attribute_types: dict,
dataset_types: dict,
unshared_fields: List[str]):
"""
Read the entire file while checking all data fields.

Parameters
----------
attribute_types: dict
Dictionary with the required types for the attributes in the file.
dataset_types: dict
Dictionary with the require dtypes for the numpy arrays in the
file.
unshared_fields: List[str]
List of fields that are not shared between the records and
therefore should be an array with first dimension = number of
records

Raises
------
BorealisFieldMissingError - when a field is missing from the Borealis
file
BorealisExtraFieldError - when an extra field is present in the
Borealis file
BorealisDataFormatTypeError - when a field has the incorrect
field type for the Borealis file
BorealisNumberOfRecordsError - when the number of records cannot
be discerned from the arrays

See Also
--------
BorealisUtilities
"""
arrays = dd.io.load(self.filename)
arrays = self.format.read_arrays(self.filename)
BorealisUtilities.check_arrays(self.filename, arrays,
attribute_types, dataset_types,
unshared_fields)
self._arrays = arrays

return self._arrays


class BorealisArrayWrite():
"""
Expand Down Expand Up @@ -465,7 +431,14 @@ def write_file(self) -> str:

Raises
------
BorealisFileTypeError
BorealisFieldMissingError - when a field is missing from the Borealis
file
BorealisExtraFieldError - when an extra field is present in the
Borealis file
BorealisDataFormatTypeError - when a field has the incorrect
field type for the Borealis file
BorealisNumberOfRecordsError - when the number of records cannot
be discerned from the arrays

See Also
--------
Expand All @@ -479,45 +452,8 @@ def write_file(self) -> str:
attribute_types = self.format.array_single_element_types()
dataset_types = self.format.array_array_dtypes()
unshared_fields = self.format.unshared_fields()

self._write_borealis_arrays(attribute_types, dataset_types,
unshared_fields)
BorealisUtilities.check_arrays(self.filename, self.arrays, attribute_types,
dataset_types, unshared_fields)
self.format.write_arrays(self.filename, self.arrays, attribute_types,
dataset_types, unshared_fields, self.compression)
return self.filename

def _write_borealis_arrays(self, attribute_types: dict,
dataset_types: dict,
unshared_fields: List[str]):
"""
Write the entire file while checking all data fields.

Parameters
----------
attribute_types: dict
Dictionary with the required types for the attributes in the file.
dataset_types: dict
Dictionary with the require dtypes for the numpy arrays in the
file.
unshared_fields: List[str]
List of fields that are not shared between the records and
therefore should be an array with first dimension = number of
records

Raises
------
BorealisFieldMissingError - when a field is missing from the Borealis
file
BorealisExtraFieldError - when an extra field is present in the
Borealis file
BorealisDataFormatTypeError - when a field has the incorrect
field type for the Borealis file
BorealisNumberOfRecordsError - when the number of records cannot
be discerned from the arrays

See Also
--------
BorealisUtilities
"""
BorealisUtilities.check_arrays(self.filename, self.arrays,
attribute_types, dataset_types,
unshared_fields)
dd.io.save(self.filename, self.arrays, compression=self.compression)
Loading