From 36aa7a327885998a0c92bb361a21a2ec832afa7b Mon Sep 17 00:00:00 2001 From: Carley <60905856+carleyjmartin@users.noreply.github.com> Date: Thu, 23 Mar 2023 11:44:49 -0600 Subject: [PATCH] DEP: Deepdish to h5py (#60) Use h5py for all HDF5 file I/O. The deepdish package is no longer maintained, and h5py is managed by the HDF5 group so is not at risk of deprecation. --- .gitignore | 1 + pydarnio/borealis/base_format.py | 207 +++++++++++++++++++++- pydarnio/borealis/borealis_array.py | 106 +++-------- pydarnio/borealis/borealis_convert.py | 1 - pydarnio/borealis/borealis_formats.py | 78 ++++---- pydarnio/borealis/borealis_restructure.py | 194 ++++++++------------ pydarnio/borealis/borealis_site.py | 108 +++-------- pydarnio/borealis/borealis_utilities.py | 26 +-- setup.cfg | 3 +- setup.py | 56 ++++++ 10 files changed, 435 insertions(+), 345 deletions(-) create mode 100644 setup.py diff --git a/.gitignore b/.gitignore index b6e4761..ed783b6 100644 --- a/.gitignore +++ b/.gitignore @@ -2,6 +2,7 @@ __pycache__/ *.py[cod] *$py.class +*.DS_Store # C extensions *.so diff --git a/pydarnio/borealis/base_format.py b/pydarnio/borealis/base_format.py index ca8e97d..eba9bff 100644 --- a/pydarnio/borealis/base_format.py +++ b/pydarnio/borealis/base_format.py @@ -37,12 +37,12 @@ """ import copy +import h5py import numpy as np from collections import OrderedDict from datetime import datetime from typing import Callable, List -import h5py from pydarnio import borealis_exceptions @@ -1102,7 +1102,7 @@ class methods used inside this method should be specific datatype = cls.single_element_types()[field] else: # field in array_dtypes datatype = cls.array_dtypes()[field] - if datatype == np.unicode_: + if datatype == str: # unicode type needs to be explicitly set to have # multiple chars (256) datatype='|U256' @@ -1110,7 +1110,7 @@ class methods used inside this method should be specific # Some indices may not be filled due to dimensions that are maximum values (num_sequences, etc. can change # between records), so they are initialized with a known value first. # Initialize floating-point values to NaN, and integer values to -1. - if datatype is np.int64 or datatype is np.uint32: + if datatype in [np.int64, np.uint32, np.uint8]: empty_array[:] = -1 else: empty_array[:] = np.NaN @@ -1229,6 +1229,207 @@ class methods used inside this method should be specific return timestamp_dict + @classmethod + def read_records(cls, filename: str) -> OrderedDict: + """ + Base function for reading in a Borealis site file. + + Parameters + ---------- + filename: str + Name of the file to load records from + + Returns + ------- + OrderedDict + a dict of timestamped records loaded from an hdf5 Borealis site file + + Raises + ------ + OSError: file does not exist + + Notes + ----- + The results will differ based on the format class, as many of the + class methods used inside this method should be specific + to the format and updated in the child class. 
+ """ + records = OrderedDict() + with h5py.File(filename, 'r') as f: + record_keys = sorted(list(f.keys())) + for rec_key in record_keys: + rec_dict = {} + group = f[rec_key] + + # Get the datasets (vector fields) + datasets = list(group.keys()) + for dset_name in datasets: + dset = group[dset_name] + if 'strtype' in dset.attrs: # string type, requires some handling + itemsize = dset.attrs['itemsize'] + data = dset[:].view(dtype=(np.unicode_, itemsize)) + else: + data = dset[:] # non-string, can simply load + rec_dict[dset_name] = data + + # Get the attributes (scalar fields) + attribute_dict = {} + for k, v in group.attrs.items(): + if k in ['CLASS', 'TITLE', 'VERSION', 'DEEPDISH_IO_VERSION', 'PYTABLES_FORMAT_VERSION']: + continue + elif isinstance(v, bytes): + attribute_dict[k] = v.tobytes().decode('utf-8') + elif isinstance(v, h5py.Empty): + dtype = v.dtype.type + data = dtype() + if isinstance(data, bytes): + data = data.decode('utf-8') + attribute_dict[k] = data + else: + attribute_dict[k] = v + rec_dict.update(attribute_dict) + + records[rec_key] = rec_dict + + return records + + @classmethod + def read_arrays(cls, filename: str) -> OrderedDict: + """ + Base function for reading in a Borealis array file. + + Parameters + ---------- + filename: str + Name of the file to load arrays from + + Returns + ------- + OrderedDict + a dict of arrays loaded from an hdf5 Borealis array file + + Raises + ------ + OSError: file does not exist + + Notes + ----- + The results will differ based on the format class, as many of the + class methods used inside this method should be specific + to the format and updated in the child class. + """ + arrays = OrderedDict() + with h5py.File(filename, 'r') as f: + + # Get the datasets (vector fields) + array_names = sorted(list(f.keys())) + for array_name in array_names: + dset = f[array_name] + if 'strtype' in dset.attrs: # string type, requires some handling + itemsize = dset.attrs['itemsize'] + data = dset[:].view(dtype=(np.unicode_, itemsize)) + else: + data = dset[:] # non-string, can simply load + arrays[array_name] = data + + # Get the attributes (scalar fields) + attribute_dict = {} + for k, v in f.attrs.items(): + if k in ['CLASS', 'TITLE', 'VERSION', 'DEEPDISH_IO_VERSION', 'PYTABLES_FORMAT_VERSION']: + continue + elif isinstance(v, bytes): + attribute_dict[k] = v.tobytes().decode('utf-8') + elif isinstance(v, h5py.Empty): + dtype = v.dtype.type + data = dtype() + if isinstance(data, bytes): + data = data.decode('utf-8') + attribute_dict[k] = data + else: + attribute_dict[k] = v + arrays.update(attribute_dict) + + return arrays + + @classmethod + def write_records(cls, filename: str, records: OrderedDict, attribute_types: dict, + dataset_types: dict, compression: str): + """ + Write the file in site style after checking records. + + Several Borealis field checks are done to ensure the integrity of the + file. + + Parameters + ---------- + filename: str + Name of the file to write to. + records: OrderedDict + Dictionary containing site-formatted fields to write to file. + attribute_types: dict + Dictionary with the required types for the attributes in the file. + dataset_types: dict + Dictionary with the require dtypes for the numpy arrays in the + file. + compression: str + Type of compression to use for the HDF5 file. 
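As an aside on the convention used throughout these readers and writers: h5py cannot store NumPy's fixed-width unicode dtype directly, so string datasets are written as uint8 views tagged with 'strtype' and 'itemsize' attributes and rebuilt with a view on read. A minimal, self-contained sketch of that round trip (hypothetical file and field names, not part of pyDARNio) might look like:

import h5py
import numpy as np

beam_strs = np.array(['east_beam', 'west_beam'])       # fixed-width '<U9' array

with h5py.File('strings_example.h5', 'w') as f:        # hypothetical file name
    itemsize = beam_strs.dtype.itemsize // 4            # every unicode character is 4 bytes
    dset = f.create_dataset('beam_strs', data=beam_strs.view(dtype=np.uint8))
    dset.attrs['strtype'] = b'unicode'                  # marks the dataset for the readers above
    dset.attrs['itemsize'] = itemsize

with h5py.File('strings_example.h5', 'r') as f:
    dset = f['beam_strs']
    restored = dset[:].view(dtype=(np.str_, dset.attrs['itemsize']))  # np.str_ is the np.unicode_ alias used above

assert (restored == beam_strs).all()

The raw bytes go into the dataset, and the two attributes carry just enough information to restore the original dtype on read.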
+ """ + with h5py.File(filename, 'a') as f: + for group_name, group_dict in records.items(): + group = f.create_group(str(group_name)) + for k, v in group_dict.items(): + if k in attribute_types.keys(): + if isinstance(v, str): + group.attrs[k] = np.bytes_(v) + else: + group.attrs[k] = v + elif v.dtype.type == np.str_: + itemsize = v.dtype.itemsize // 4 # every character is 4 bytes + dset = group.create_dataset(k, data=v.view(dtype=(np.uint8)), compression=compression) + dset.attrs['strtype'] = b'unicode' + dset.attrs['itemsize'] = itemsize + else: + group.create_dataset(k, data=v, compression=compression) + + @classmethod + def write_arrays(cls, filename: str, arrays: OrderedDict, attribute_types: dict, + dataset_types: dict, unshared_fields: List[str], compression: str): + """ + Write arrays to file while checking all data fields. + + Parameters + ---------- + filename: str + Name of the file to write to. + arrays: OrderedDict + Dictionary containing array-formatted fields to write to file. + attribute_types: dict + Dictionary with the required types for the attributes in the file. + dataset_types: dict + Dictionary with the require dtypes for the numpy arrays in the + file. + unshared_fields: List[str] + List of fields that are not shared between the records and + therefore should be an array with first dimension = number of + records + compression: str + Type of compression to use for the HDF5 file. + """ + with h5py.File(filename, 'a') as f: + for k, v in arrays.items(): + if k in attribute_types: + if isinstance(v, str): + f.attrs[k] = np.bytes_(v) + else: + f.attrs[k] = v + elif v.dtype.type == np.str_: + itemsize = v.dtype.itemsize // 4 # every character is 4 bytes + dset = f.create_dataset(k, data=v.view(dtype=(np.uint8)), compression=compression) + dset.attrs['strtype'] = b'unicode' + dset.attrs['itemsize'] = itemsize + else: + f.create_dataset(k, data=v, compression=compression) + # STATIC METHODS COMMON ACROSS FORMATS # i.e. 
common methods that can be used by multiple formats in restructuring # (generally these will be used in the unshared fields dims for arrays) diff --git a/pydarnio/borealis/borealis_array.py b/pydarnio/borealis/borealis_array.py index 4ff6afd..8dbcf9c 100644 --- a/pydarnio/borealis/borealis_array.py +++ b/pydarnio/borealis/borealis_array.py @@ -37,8 +37,9 @@ For more information on Borealis data files and how they convert to SDarn files, see: https://borealis.readthedocs.io/en/latest/ """ -import deepdish as dd +import h5py import logging +import numpy as np from typing import List @@ -115,10 +116,10 @@ def __init__(self, filename: str, borealis_filetype: str): # get the version of the file - split by the dash, first part should be # 'vX.X' try: - version = dd.io.load(self.filename, - group='/borealis_git_hash').split('-')[0] - version = '.'.join(version.split('.')[:2]) # vX.Y, ignore patch revision - except ValueError as err: + with h5py.File(self.filename, 'r') as f: + full_version = f.attrs['borealis_git_hash'].decode('utf-8').split('-')[0] + version = '.'.join(full_version.split('.')[:2]) # vX.Y, ignore patch revision + except KeyError as err: raise borealis_exceptions.BorealisStructureError( ' {} Could not find the borealis_git_hash required to ' 'determine read version (file may be site style) {}' @@ -242,49 +243,14 @@ def read_file(self) -> dict: dataset_types = self.format.array_array_dtypes() unshared_fields = self.format.unshared_fields() - self._read_borealis_arrays(attribute_types, dataset_types, - unshared_fields) - return self._arrays - - def _read_borealis_arrays(self, attribute_types: dict, - dataset_types: dict, - unshared_fields: List[str]): - """ - Read the entire file while checking all data fields. - - Parameters - ---------- - attribute_types: dict - Dictionary with the required types for the attributes in the file. - dataset_types: dict - Dictionary with the require dtypes for the numpy arrays in the - file. 
- unshared_fields: List[str] - List of fields that are not shared between the records and - therefore should be an array with first dimension = number of - records - - Raises - ------ - BorealisFieldMissingError - when a field is missing from the Borealis - file - BorealisExtraFieldError - when an extra field is present in the - Borealis file - BorealisDataFormatTypeError - when a field has the incorrect - field type for the Borealis file - BorealisNumberOfRecordsError - when the number of records cannot - be discerned from the arrays - - See Also - -------- - BorealisUtilities - """ - arrays = dd.io.load(self.filename) + arrays = self.format.read_arrays(self.filename) BorealisUtilities.check_arrays(self.filename, arrays, attribute_types, dataset_types, unshared_fields) self._arrays = arrays + return self._arrays + class BorealisArrayWrite(): """ @@ -465,7 +431,14 @@ def write_file(self) -> str: Raises ------ - BorealisFileTypeError + BorealisFieldMissingError - when a field is missing from the Borealis + file + BorealisExtraFieldError - when an extra field is present in the + Borealis file + BorealisDataFormatTypeError - when a field has the incorrect + field type for the Borealis file + BorealisNumberOfRecordsError - when the number of records cannot + be discerned from the arrays See Also -------- @@ -479,45 +452,8 @@ def write_file(self) -> str: attribute_types = self.format.array_single_element_types() dataset_types = self.format.array_array_dtypes() unshared_fields = self.format.unshared_fields() - - self._write_borealis_arrays(attribute_types, dataset_types, - unshared_fields) + BorealisUtilities.check_arrays(self.filename, self.arrays, attribute_types, + dataset_types, unshared_fields) + self.format.write_arrays(self.filename, self.arrays, attribute_types, + dataset_types, unshared_fields, self.compression) return self.filename - - def _write_borealis_arrays(self, attribute_types: dict, - dataset_types: dict, - unshared_fields: List[str]): - """ - Write the entire file while checking all data fields. - - Parameters - ---------- - attribute_types: dict - Dictionary with the required types for the attributes in the file. - dataset_types: dict - Dictionary with the require dtypes for the numpy arrays in the - file. 
- unshared_fields: List[str] - List of fields that are not shared between the records and - therefore should be an array with first dimension = number of - records - - Raises - ------ - BorealisFieldMissingError - when a field is missing from the Borealis - file - BorealisExtraFieldError - when an extra field is present in the - Borealis file - BorealisDataFormatTypeError - when a field has the incorrect - field type for the Borealis file - BorealisNumberOfRecordsError - when the number of records cannot - be discerned from the arrays - - See Also - -------- - BorealisUtilities - """ - BorealisUtilities.check_arrays(self.filename, self.arrays, - attribute_types, dataset_types, - unshared_fields) - dd.io.save(self.filename, self.arrays, compression=self.compression) diff --git a/pydarnio/borealis/borealis_convert.py b/pydarnio/borealis/borealis_convert.py index 785b129..3e8eda3 100644 --- a/pydarnio/borealis/borealis_convert.py +++ b/pydarnio/borealis/borealis_convert.py @@ -42,7 +42,6 @@ """ import logging import numpy as np -import deepdish as dd from datetime import datetime from typing import Union diff --git a/pydarnio/borealis/borealis_formats.py b/pydarnio/borealis/borealis_formats.py index a247a85..e664756 100644 --- a/pydarnio/borealis/borealis_formats.py +++ b/pydarnio/borealis/borealis_formats.py @@ -262,19 +262,19 @@ def single_element_types(cls): return { # Identifies the version of Borealis that made this data. Necessary # for all versions. - "borealis_git_hash": np.unicode_, + "borealis_git_hash": str, # Number used to identify experiment. "experiment_id": np.int64, # Name of the experiment file. - "experiment_name": np.unicode_, + "experiment_name": str, # Comment about the whole experiment - "experiment_comment": np.unicode_, + "experiment_comment": str, # Additional text comment that describes the slice. - "slice_comment": np.unicode_, + "slice_comment": str, # Number of slices in the experiment at this integration time. "num_slices": np.int64, # Three letter radar identifier. - "station": np.unicode_, + "station": str, # Number of sampling periods in the integration time. "num_sequences": np.int64, # range gate separation (equivalent distance between samples), km. @@ -286,7 +286,7 @@ def single_element_types(cls): # Sampling rate of the samples being written to file in Hz. "rx_sample_rate": np.float64, # Designates if the record is the first in a scan. - "scan_start_marker": np.bool_, + "scan_start_marker": np.uint8, # Integration time in seconds. "int_time": np.float32, # Length of the pulse in microseconds. @@ -302,7 +302,7 @@ def single_element_types(cls): "freq": np.uint32, # str denoting C data type of the samples included in the data # array, such as 'complex float'. - "samples_data_type": np.unicode_, + "samples_data_type": str, # data normalization factor determined by the filter scaling in the # decimation scheme. "data_normalization_factor": np.float64, @@ -672,25 +672,25 @@ def single_element_types(cls): return { # Identifies the version of Borealis that made this data. Necessary # for all versions. - "borealis_git_hash": np.unicode_, + "borealis_git_hash": str, # Number used to identify experiment. "experiment_id": np.int64, # Name of the experiment file. - "experiment_name": np.unicode_, + "experiment_name": str, # Comment about the whole experiment - "experiment_comment": np.unicode_, + "experiment_comment": str, # Additional text comment that describes the slice. 
- "slice_comment": np.unicode_, + "slice_comment": str, # Number of slices in the experiment at this integration time. "num_slices": np.int64, # Three letter radar identifier. - "station": np.unicode_, + "station": str, # Number of sampling periods in the integration time. "num_sequences": np.int64, # Sampling rate of the samples being written to file in Hz. "rx_sample_rate": np.float64, # Designates if the record is the first in a scan. - "scan_start_marker": np.bool_, + "scan_start_marker": np.uint8, # Integration time in seconds. "int_time": np.float32, # Length of the pulse in microseconds. @@ -706,7 +706,7 @@ def single_element_types(cls): "freq": np.uint32, # str denoting C data type of the samples included in the data # array, such as 'complex float'. - "samples_data_type": np.unicode_, + "samples_data_type": str, # Number of samples in the sampling period. "num_samps": np.uint32, # range gate separation (equivalent distance between samples), km @@ -1074,25 +1074,25 @@ def single_element_types(cls): return { # Identifies the version of Borealis that made this data. Necessary # for all versions. - "borealis_git_hash": np.unicode_, + "borealis_git_hash": str, # Number used to identify experiment. "experiment_id": np.int64, # Name of the experiment file. - "experiment_name": np.unicode_, + "experiment_name": str, # Comment about the whole experiment - "experiment_comment": np.unicode_, + "experiment_comment": str, # Additional text comment that describes the slice. - "slice_comment": np.unicode_, + "slice_comment": str, # Number of slices in the experiment at this integration time. "num_slices": np.int64, # Three letter radar identifier. - "station": np.unicode_, + "station": str, # Number of sampling periods in the integration time. "num_sequences": np.int64, # Sampling rate of the samples being written to file in Hz. "rx_sample_rate": np.float64, # Designates if the record is the first in a scan. - "scan_start_marker": np.bool_, + "scan_start_marker": np.uint8, # Integration time in seconds. "int_time": np.float32, # Length of the pulse in microseconds. @@ -1108,7 +1108,7 @@ def single_element_types(cls): "freq": np.uint32, # str denoting C data type of the samples included in the data # array, such as 'complex float'. - "samples_data_type": np.unicode_, + "samples_data_type": str, # Number of samples in the sampling period. "num_samps": np.uint32, # data normalization factor determined by the filter scaling in the @@ -1402,23 +1402,23 @@ def single_element_types(cls): return { # Identifies the version of Borealis that made this data. Necessary # for all versions. - "borealis_git_hash": np.unicode_, + "borealis_git_hash": str, # Number used to identify experiment. "experiment_id": np.int64, # Name of the experiment file. - "experiment_name": np.unicode_, + "experiment_name": str, # Comment about the whole experiment - "experiment_comment": np.unicode_, + "experiment_comment": str, # Number of slices in the experiment at this integration time. "num_slices": np.int64, # Three letter radar identifier. - "station": np.unicode_, + "station": str, # Number of sampling periods in the integration time. "num_sequences": np.int64, # Sampling rate of the samples being written to file in Hz. "rx_sample_rate": np.float64, # Designates if the record is the first in a scan. - "scan_start_marker": np.bool_, + "scan_start_marker": np.uint8, # Integration time in seconds. "int_time": np.float32, # Number of main array antennas. 
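The same remapping repeats in every format class below: np.unicode_ becomes plain str and np.bool_ becomes np.uint8, so string scalars can be written as HDF5 attributes and boolean scalars share the integer fill value (-1) used when padding array-structured files. A small sketch (hypothetical record key and values) of how the remapped scalars behave as h5py attributes:

import h5py
import numpy as np

with h5py.File('scalars_example.h5', 'w') as f:          # hypothetical file name
    rec = f.create_group('1616518000000')                 # record key: ms since epoch
    rec.attrs['station'] = np.bytes_('sas')               # str field -> bytes attribute
    rec.attrs['scan_start_marker'] = np.uint8(True)       # bool field -> uint8 attribute
    rec.attrs['gps_locked'] = np.uint8(False)

with h5py.File('scalars_example.h5', 'r') as f:
    rec = f['1616518000000']
    station = rec.attrs['station'].decode('utf-8')        # 'sas'
    scan_start = bool(rec.attrs['scan_start_marker'])     # True

This mirrors how write_records stores the attribute_types fields (np.bytes_ for strings) and how read_records decodes them on the way back out.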
@@ -1427,7 +1427,7 @@ def single_element_types(cls): "intf_antenna_count": np.uint32, # str denoting C data type of the samples included in the data # array, such as 'complex float'. - "samples_data_type": np.unicode_, + "samples_data_type": str, # The center frequency of this data in kHz "rx_center_freq": np.float64, # Number of samples in the sampling period. @@ -1513,12 +1513,12 @@ def single_element_types(cls): # the slice id of the file and dataset. "slice_id": np.uint32, # the interfacing of this slice to other slices. - "slice_interfacing": np.unicode_, + "slice_interfacing": str, # A string describing the type of scheduling time at the time of # this dataset. - "scheduling_mode": np.unicode_, + "scheduling_mode": str, # A string describing the averaging method, ex. mean, median - "averaging_method": np.unicode_, + "averaging_method": str, # number of blanked samples in the sequence. "num_blanked_samples": np.uint32 }) @@ -1663,10 +1663,10 @@ def single_element_types(cls): # the slice id of the file and dataset. "slice_id": np.uint32, # the interfacing of this slice to other slices. - "slice_interfacing": np.unicode_, + "slice_interfacing": str, # A string describing the type of scheduling time at the time of # this dataset. - "scheduling_mode": np.unicode_, + "scheduling_mode": str, # number of blanked samples in the sequence. "num_blanked_samples": np.uint32 }) @@ -1806,10 +1806,10 @@ def single_element_types(cls): # the slice id of the file and dataset. "slice_id": np.uint32, # the interfacing of this slice to other slices. - "slice_interfacing": np.unicode_, + "slice_interfacing": str, # A string describing the type of scheduling time at the time of # this dataset. - "scheduling_mode": np.unicode_, + "scheduling_mode": str, # number of blanked samples in the sequence. "num_blanked_samples": np.uint32 }) @@ -1955,7 +1955,7 @@ def single_element_types(cls): single_element_types.update({ # A string describing the type of scheduling time at the time of # this dataset. - "scheduling_mode": np.unicode_ + "scheduling_mode": str }) return single_element_types @@ -2044,7 +2044,7 @@ def single_element_types(cls): "lp_status_word": np.uint32, # Boolean indicating if the GPS was locked during the entire # integration period - "gps_locked": np.bool_, + "gps_locked": np.uint8, # The max time diffe between GPS and system time during the # integration period. In seconds. Negative if GPS time ahead. "gps_to_system_time_diff": np.float64, @@ -2152,7 +2152,7 @@ def single_element_types(cls): "lp_status_word": np.uint32, # Boolean indicating if the GPS was locked during the entire # integration period - "gps_locked": np.bool_, + "gps_locked": np.uint8, # The max time diffe between GPS and system time during the # integration period. In seconds. Negative if GPS time ahead. "gps_to_system_time_diff": np.float64, @@ -2267,7 +2267,7 @@ def single_element_types(cls): "lp_status_word": np.uint32, # Boolean indicating if the GPS was locked during the entire # integration period - "gps_locked": np.bool_, + "gps_locked": np.uint8, # The max time diffe between GPS and system time during the # integration period. In seconds. Negative if GPS time ahead. "gps_to_system_time_diff": np.float64, @@ -2374,7 +2374,7 @@ def single_element_types(cls): "lp_status_word": np.uint32, # Boolean indicating if the GPS was locked during the entire # integration period - "gps_locked": np.bool_, + "gps_locked": np.uint8, # The max time diffe between GPS and system time during the # integration period. In seconds. 
Negative if GPS time ahead. "gps_to_system_time_diff": np.float64, diff --git a/pydarnio/borealis/borealis_restructure.py b/pydarnio/borealis/borealis_restructure.py index 228d608..536ea61 100755 --- a/pydarnio/borealis/borealis_restructure.py +++ b/pydarnio/borealis/borealis_restructure.py @@ -34,9 +34,7 @@ import os import subprocess as sp import warnings -from pathlib import Path import h5py -import deepdish as dd import logging import numpy as np from datetime import datetime @@ -208,57 +206,64 @@ def _array_to_site_restructure(self): attribute_types = self.format.site_single_element_types() dataset_types = self.format.array_dtypes() try: - shared_fields_dict = dict() - # shared fields are common across records, so this is done once - for field in self.format.shared_fields(): - field_data = dd.io.load(self.infile_name, '/{}'.format(field)) - shared_fields_dict[field] = field_data - - unshared_single_elements = dict() - # These are fields which have one element per record, so the - # arrays are small enough to be loaded completely into memory - for field in self.format.unshared_fields(): - if field in self.format.single_element_types(): - unshared_single_elements[field] = dd.io.load( - self.infile_name, '/{}'.format(field)) - - sqn_timestamps_array = dd.io.load(self.infile_name, - '/sqn_timestamps') - for record_num, seq_timestamp in enumerate(sqn_timestamps_array): - # format dictionary key in the same way it is done - # in datawrite on site - seq_datetime = datetime.utcfromtimestamp(seq_timestamp[0]) - epoch = datetime.utcfromtimestamp(0) - key = str(int((seq_datetime - epoch).total_seconds() * 1000)) - - # Make this fresh every time, to reduce memory footprint - record_dict = dict() - - # Copy over the shared fields - for k, v in shared_fields_dict.items(): - record_dict[k] = v - - # populate site specific fields using given functions - # that take both the arrays data and the record number - with h5py.File(self.infile_name, 'r') as f: + with h5py.File(self.infile_name, 'r') as f: + + # shared fields are common across records, so this is done once + shared_fields_dict = dict() + for field in self.format.shared_fields(): + if field in attribute_types: + data = f.attrs[field] + if isinstance(data, bytes): + data = data.decode('utf-8') + elif field in self.format.array_string_fields(): + dset = f[field] + itemsize = dset.attrs['itemsize'] + data = dset[:].view(dtype=(np.unicode_, itemsize)) + else: + data = f[field][:] + shared_fields_dict[field] = data + + # These are fields which have one element per record, so the + # arrays are small enough to be loaded completely into memory + unshared_single_elements = dict() + for field in self.format.unshared_fields(): + if field in self.format.single_element_types(): + unshared_single_elements[field] = f[field][:] + + sqn_timestamps_array = f['sqn_timestamps'][:] + + for record_num, seq_timestamp in enumerate(sqn_timestamps_array): + # format dictionary key in the same way it is done + # in datawrite on site + seq_datetime = datetime.utcfromtimestamp(seq_timestamp[0]) + epoch = datetime.utcfromtimestamp(0) + key = str(int((seq_datetime - epoch).total_seconds() * 1000)) + + # Make this fresh every time, to reduce memory footprint + record_dict = dict() + + # Copy over the shared fields + for k, v in shared_fields_dict.items(): + record_dict[k] = v + + # populate site specific fields using given functions + # that take both the arrays data and the record number for field in self.format.site_specific_fields(): record_dict[field] = \ 
self.format.site_specific_fields_generate( )[field](f, record_num) - for field in self.format.unshared_fields(): - if field in self.format.single_element_types(): - datatype = self.format.single_element_types()[field] - # field is not an array, single element per record. - # unshared_field_dims_site should give empty list. - record_dict[field] = \ - datatype(unshared_single_elements[field][ - record_num]) - else: # field in array_dtypes - # need to get the dims correct, - # not always equal to the max - field_flag = False - with h5py.File(self.infile_name, 'r') as f: + for field in self.format.unshared_fields(): + if field in self.format.single_element_types(): + datatype = self.format.single_element_types()[field] + # field is not an array, single element per record. + # unshared_field_dims_site should give empty list. + record_dict[field] = \ + datatype(unshared_single_elements[field][ + record_num]) + else: # field in array_dtypes + # need to get the dims correct, not always equal to the max + field_flag = False site_dims = [dimension_function(f, record_num) for dimension_function in self.format.unshared_fields_dims_site( @@ -277,18 +282,16 @@ def _array_to_site_restructure(self): index_slice = [slice(0, i) for i in site_dims if i != -1] index_slice.insert(0, record_num) index_slice = tuple(index_slice) - # If there was an incorrect dimension (-1 in dims), then use deepdish to extract the field - if field_flag: - record_dict[field] = dd.io.load(self.infile_name, f'/{field}')[index_slice] - else: - record_dict[field] = f[field][index_slice] - # Wrap in another dict to use the format method - record_dict = OrderedDict({key: record_dict}) - record_dict = self.format.flatten_site_arrays(record_dict) - - # Write the single record to file - self._write_borealis_record(record_dict, key, attribute_types, - dataset_types) + record_dict[field] = f[field][index_slice] + + # Wrap in another dict to use the format method + record_dict = OrderedDict({key: record_dict}) + record_dict = self.format.flatten_site_arrays(record_dict) + BorealisUtilities.check_records(self.infile_name, record_dict, attribute_types, dataset_types) + + # Write the single record to file + self.format.write_records(self.outfile_name, record_dict, attribute_types, dataset_types, + self.compression) except Exception as err: raise borealis_exceptions.BorealisRestructureError( 'Records for {}: Error restructuring {} from array to site ' @@ -333,7 +336,7 @@ def _site_to_array_restructure(self): rec_dict.update({k: record.attrs[k] for k in rec_attrs}) # Bitwise fields also need to be handled separately for field in self.format.bool_types(): - rec_dict[field] = dd.io.load(self.infile_name, f'/{record_name}/{field}') + rec_dict[field] = f[record_name][field] # some fields are linear in site style and need to be reshaped. # Pass in record nested in a dictionary, as @@ -359,8 +362,9 @@ def _site_to_array_restructure(self): else: raise TypeError(f'Field {field} has unrecognized data: {value}') elif field in self.format.array_string_fields(): - # h5py reads numpy string arrays as contiguous unsigned ints, so we need deepdish here - new_data_dict[field] = dd.io.load(self.infile_name, f'/{record_name}/{field}') + dset = f[record_name][field] + itemsize = dset.attrs['itemsize'] + new_data_dict[field] = dset[:].view(dtype=(np.unicode_, itemsize)) else: raise TypeError(f'Field {field} unrecognized') @@ -374,7 +378,7 @@ def _site_to_array_restructure(self): # Initialize array now with correct data type. 
dtype = self.format.single_element_types()[field] new_data_dict[field] = np.empty(num_records, dtype=dtype) - if dtype is np.int64 or dtype is np.uint32: + if dtype in [np.int64, np.uint32, np.uint8]: new_data_dict[field][:] = -1 else: new_data_dict[field][:] = np.NaN @@ -396,7 +400,7 @@ def _site_to_array_restructure(self): datatype = self.format.single_element_types()[field] else: # field in array_dtypes datatype = self.format.array_dtypes()[field] - if datatype == np.unicode_: + if datatype == str: # unicode type needs to be explicitly set to # have multiple chars (256) datatype = '|U256' @@ -406,7 +410,7 @@ def _site_to_array_restructure(self): # change between records), so they are initialized # with a known value first. Initialize floating- # point values to NaN, and integer values to -1. - if datatype is np.int64 or datatype is np.uint32: + if datatype in [np.int64, np.uint32, np.uint8]: empty_array[:] = -1 else: empty_array[:] = np.NaN @@ -433,65 +437,13 @@ def _site_to_array_restructure(self): attribute_types = self.format.array_single_element_types() dataset_types = self.format.array_array_dtypes() unshared_fields = self.format.unshared_fields() - BorealisUtilities.check_arrays(self.infile_name, new_data_dict, - attribute_types, dataset_types, + BorealisUtilities.check_arrays(self.infile_name, new_data_dict, attribute_types, dataset_types, unshared_fields) - dd.io.save(self.outfile_name, new_data_dict, - compression=self.compression) + self.format.write_arrays(self.outfile_name, new_data_dict, attribute_types, dataset_types, unshared_fields, + self.compression) except TypeError as err: raise borealis_exceptions.BorealisRestructureError( 'Records for {}: Error restructuring {} from site to array ' 'style: {}'.format(self.infile_name, self.format.__name__, err) ) from err - - def _write_borealis_record(self, record: dict, record_name: str, - attribute_types: dict, dataset_types: dict): - """ - Add a record to the output file in site style after checking the record. - - Several Borealis field checks are done to insure the integrity of the - record. - - Parameters - ---------- - record: dict - Dictionary containing the site-structured record. - record_name: str - Group name of the record for the HDF5 hierarchy. - attribute_types: dict - Dictionary with the required types for the attributes in the file. - dataset_types: dict - Dictionary with the required dtypes for the numpy arrays in the - file. - - Raises - ------ - BorealisFieldMissingError - BorealisExtraFieldError - BorealisDataFormatTypeError - - See Also - -------- - BorealisUtilities - """ - Path(self.outfile_name).touch() - BorealisUtilities.check_records(self.infile_name, record, - attribute_types, dataset_types) - - # use external h5copy utility to move new record into 2hr file. - - warnings.filterwarnings("ignore") - # Must use temporary file to append to a file; writing entire - # dictionary at once also doesn't work so this is required. 
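The temporary-file/h5copy workaround being deleted here (and its twin in borealis_site.py below) is what the new write_records path replaces: opening the output file in append mode and creating one group per record needs no external utility. A rough sketch of that append pattern, with hypothetical data:

import h5py
import numpy as np

record = {'int_time': np.float32(3.5),
          'data': np.zeros(10, dtype=np.float32)}         # hypothetical record contents

with h5py.File('site_example.h5', 'a') as f:               # 'a' appends to an existing file
    grp = f.create_group('1616518003500')                  # one group per record key
    grp.attrs['int_time'] = record['int_time']             # scalar fields become attributes
    grp.create_dataset('data', data=record['data'],
                       compression='gzip')                  # array fields become datasets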
- tmp_filename = self.outfile_name + '.tmp' - Path(tmp_filename).touch() - - dd.io.save(tmp_filename, record[record_name], - compression=self.compression) - f = dd.io.load(tmp_filename, '/') - cp_cmd = 'h5copy -i {newfile} -o {full_file} -s / -d {dtstr}' - cmd = cp_cmd.format(newfile=tmp_filename, full_file=self.outfile_name, - dtstr=record_name) - sp.run(cmd.split()) - os.remove(tmp_filename) diff --git a/pydarnio/borealis/borealis_site.py b/pydarnio/borealis/borealis_site.py index 0c5b2b3..473ae4a 100644 --- a/pydarnio/borealis/borealis_site.py +++ b/pydarnio/borealis/borealis_site.py @@ -35,15 +35,14 @@ Add compression to bzip2 """ -import deepdish as dd import h5py import logging import os import subprocess as sp import warnings +import numpy as np from collections import OrderedDict -from pathlib2 import Path from typing import Union from pydarnio import borealis_exceptions, borealis_formats @@ -125,11 +124,11 @@ def __init__(self, filename: str, borealis_filetype: str): # 'vX.X' try: - version = dd.io.load(self.filename, - group='/'+self._record_names[0] - )['borealis_git_hash'].split('-')[0] - version = '.'.join(version.split('.')[:2]) # vX.Y, ignore patch revision - except (IndexError, ValueError) as err: + with h5py.File(self.filename, 'r') as f: + first_rec = f[self._record_names[0]] + full_version = first_rec.attrs['borealis_git_hash'].decode('utf-8').split('-')[0] + version = '.'.join(full_version.split('.')[:2]) # vX.Y, ignore patch revision + except (IndexError, KeyError) as err: # if this is an array style file, it will raise # IndexError on the array. raise borealis_exceptions.BorealisStructureError( @@ -247,36 +246,9 @@ def read_file(self) -> dict: records: OrderedDict{dict} records of Borealis rawacf data. Keys are first sequence timestamp (in ms since epoch). - """ - pyDARNio_log.info("Reading Borealis {} {} file: {}" - "".format(self.software_version, - self.borealis_filetype, self.filename)) - - attribute_types = self.format.site_single_element_types() - dataset_types = self.format.site_array_dtypes() - - self._read_borealis_records(attribute_types, dataset_types) - return self._records - - def _read_borealis_records(self, attribute_types: dict, - dataset_types: dict): - """ - Read the entire file while checking all data fields. - - Several Borealis field checks are done to insure the integrity of the - file. - - Parameters - ---------- - attribute_types: dict - Dictionary with the required types for the attributes in the file. - dataset_types: dict - Dictionary with the require dtypes for the numpy arrays in the - file. Raises ------ - OSError: file does not exist BorealisFieldMissingError - when a field is missing from the Borealis file/stream type BorealisExtraFieldError - when an extra field is present in the @@ -288,11 +260,19 @@ def _read_borealis_records(self, attribute_types: dict, -------- BorealisUtilities """ - records = dd.io.load(self.filename) + pyDARNio_log.info("Reading Borealis {} {} file: {}" + "".format(self.software_version, + self.borealis_filetype, self.filename)) + + attribute_types = self.format.site_single_element_types() + dataset_types = self.format.site_array_dtypes() + + records = self.format.read_records(self.filename) BorealisUtilities.check_records(self.filename, records, attribute_types, dataset_types) self._records = OrderedDict(sorted(records.items())) + return self._records class BorealisSiteWrite(): @@ -487,34 +467,8 @@ def write_file(self) -> str: Returns ------- - filename + filename: str The filename written to. 
- """ - pyDARNio_log.info("Writing Borealis {} {} file: {}" - "".format(self.software_version, - self.borealis_filetype, self.filename)) - - attribute_types = self.format.site_single_element_types() - dataset_types = self.format.site_array_dtypes() - - self._write_borealis_records(attribute_types, dataset_types) - return self.filename - - def _write_borealis_records(self, attribute_types: dict, - dataset_types: dict): - """ - Write the file in site style after checking records. - - Several Borealis field checks are done to insure the integrity of the - file. - - Parameters - ---------- - attributes_type_dict: dict - Dictionary with the required types for the attributes in the file. - datasets_type_dict: dict - Dictionary with the require dtypes for the numpy arrays in the - file. Raises ------ @@ -524,27 +478,15 @@ def _write_borealis_records(self, attribute_types: dict, Borealis file/stream type BorealisDataFormatTypeError - when a field has the incorrect field type for the Borealis file/stream type - - See Also - -------- - BorealisUtilities """ - Path(self.filename).touch() + pyDARNio_log.info("Writing Borealis {} {} file: {}" + "".format(self.software_version, + self.borealis_filetype, self.filename)) + + attribute_types = self.format.site_single_element_types() + dataset_types = self.format.site_array_dtypes() BorealisUtilities.check_records(self.filename, self.records, attribute_types, dataset_types) - - # use external h5copy utility to move new record into 2hr file. - - warnings.filterwarnings("ignore") - # Must use temporary file to append to a file; writing entire - # dictionary at once also doesn't work so this is required. - tmp_filename = self.filename + '.tmp' - Path(tmp_filename).touch() - for group_name, group_dict in self.records.items(): - dd.io.save(tmp_filename, {str(group_name): group_dict}, - compression=self.compression) - cp_cmd = 'h5copy -i {newfile} -o {full_file} -s {dtstr} -d {dtstr}' - cmd = cp_cmd.format(newfile=tmp_filename, full_file=self.filename, - dtstr='/'+str(group_name)) - sp.call(cmd.split()) - os.remove(tmp_filename) + self.format.write_records(self.filename, self.records, attribute_types, + dataset_types, self.compression) + return self.filename diff --git a/pydarnio/borealis/borealis_utilities.py b/pydarnio/borealis/borealis_utilities.py index 10929c2..bc7e8fc 100644 --- a/pydarnio/borealis/borealis_utilities.py +++ b/pydarnio/borealis/borealis_utilities.py @@ -28,7 +28,6 @@ """ import logging -import deepdish as dd import h5py import numpy as np import sys @@ -263,7 +262,8 @@ def record_incorrect_types_check(filename: str, attributes_type_dict: dict, incorrect_types_check = {param: str(attributes_type_dict[param]) for param in attributes_type_dict.keys() if type(record[param]) != - attributes_type_dict[param]} + attributes_type_dict[param] and + record[param].shape is not None} incorrect_types_check.update({param: 'np.ndarray of ' + str(datasets_type_dict[param]) @@ -322,7 +322,8 @@ def array_incorrect_types_check(filename: str, attributes_type_dict: dict, incorrect_types_check = {param: str(attributes_type_dict[param]) for param in attributes_type_dict.keys() if type(file_data[param]) != - attributes_type_dict[param]} + attributes_type_dict[param] and + file_data[param].shape is not None} datasets_type_dict_keys = sorted(list(datasets_type_dict.keys())) np_array_types = [isinstance(file_data[param], np.ndarray) for param in @@ -343,7 +344,8 @@ def array_incorrect_types_check(filename: str, attributes_type_dict: dict, str(datasets_type_dict[param]) 
for param in datasets_type_dict.keys() if file_data[param].dtype.type != - datasets_type_dict[param]}) + datasets_type_dict[param] and + file_data[param].dtype.type != np.str_}) if len(incorrect_types_check) > 0: raise borealis_exceptions.\ BorealisDataFormatTypeError(filename, @@ -560,19 +562,21 @@ def get_borealis_version(filename: str, record_names, structure: str): """ if structure == 'array': try: - borealis_git_hash = dd.io.load(filename, - group='/borealis_git_hash') - except ValueError as err: + with h5py.File(filename, 'r') as f: + borealis_git_hash = f.attrs['borealis_git_hash'].decode('utf-8') + except KeyError as err: raise borealis_exceptions.BorealisStructureError( ' {} Could not find the borealis_git_hash required to ' 'determine file version. Data file may be corrupted. {}' ''.format(filename, err)) from err elif structure == 'site': try: - borealis_git_hash = \ - dd.io.load(filename, group='/{}/borealis_git_hash' - ''.format(record_names[0])) - except ValueError as err: + with h5py.File(filename, 'r') as f: + records = sorted(list(f.keys())) + first_rec = f[records[0]] + borealis_git_hash = first_rec.attrs['borealis_git_hash']\ + .decode('utf-8') + except KeyError as err: raise borealis_exceptions.BorealisStructureError( ' {} Could not find the borealis_git_hash required to ' 'determine file version. Data file may be corrupted. {}' diff --git a/setup.cfg b/setup.cfg index 0e674e9..68fcbcd 100644 --- a/setup.cfg +++ b/setup.cfg @@ -20,7 +20,6 @@ install_requires = pyyaml numpy h5py>=3.3.0 - deepdish pathlib2 [options.packages.find] @@ -28,4 +27,4 @@ exclude = test* test_files* docs* - build* \ No newline at end of file + build* diff --git a/setup.py b/setup.py new file mode 100644 index 0000000..7f1901d --- /dev/null +++ b/setup.py @@ -0,0 +1,56 @@ +""" +Copyright 2018 SuperDARN Canada, University of Saskatchewan + +setup.py +2018-11-05 +To setup pyDARNio as a third party library. Include installing need libraries for +running the files. + +author: +Marina Schmidt + +Disclaimer: +pyDARNio is under the LGPL v3 license found in the root directory LICENSE.md +Everyone is permitted to copy and distribute verbatim copies of this license +document, but changing it is not allowed. + +This version of the GNU Lesser General Public License incorporates the terms +and conditions of version 3 of the GNU General Public License, +supplemented by the additional permissions listed below. 
+ +""" + +from os import path +from setuptools import setup, find_packages +import sys +from subprocess import check_call +from setuptools.command.install import install, orig + +this_directory = path.abspath(path.dirname(__file__)) +with open(path.join(this_directory, 'README.md'), encoding='utf-8') as f: + long_description = f.read() + + +# Setup information +setup( + name="pydarnio", + version="1.2.1", + long_description=long_description, + long_description_content_type='text/markdown', + description="Python library for reading and writing SuperDARN data", + url='https://github.com/SuperDARN/pyDARNio.git', + classifiers=[ + 'Development Status :: 5 - Production/Stable', + 'License :: OSI Approved :: GNU Lesser General Public License v3 (LGPLv3)', + 'Programming Language :: Python :: 3.6', + 'Programming Language :: Python :: 3.7'], + python_requires='>=3.6', + packages=find_packages(exclude=['docs', 'test']), + author="SuperDARN", + include_package_data=True, + setup_requires=['pyyaml', 'numpy', + 'h5py>=3.3.0', 'pathlib2'], + # pyyaml library install + install_requires=['pyyaml', 'numpy', + 'h5py>=3.3.0', 'deepdish', 'pathlib2'] +)
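Finally, the version detection that previously went through dd.io.load reduces to plain attribute lookups under h5py. A minimal sketch of the pattern now used in get_borealis_version and the Read classes, with hypothetical file names and assuming borealis_git_hash was written as bytes (np.bytes_), as the writers above do:

import h5py

# Array-structured file: borealis_git_hash is a top-level attribute.
with h5py.File('20230323.1200.00.sas.0.rawacf.hdf5', 'r') as f:
    git_hash = f.attrs['borealis_git_hash'].decode('utf-8')

# Site-structured file: it lives on each timestamped record group instead.
with h5py.File('20230323.1200.00.sas.0.rawacf.hdf5.site', 'r') as f:
    first_rec = f[sorted(list(f.keys()))[0]]
    git_hash = first_rec.attrs['borealis_git_hash'].decode('utf-8')

# e.g. 'v0.6.1-dirty' -> 'v0.6' (major.minor only, patch revision ignored)
version = '.'.join(git_hash.split('-')[0].split('.')[:2])

A missing borealis_git_hash raises KeyError here, which the library wraps in BorealisStructureError as shown in the hunks above.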