From 0790ddb9f88cc87d87bc4af6fb6ffd0fe38c81d6 Mon Sep 17 00:00:00 2001 From: Remington Rohel Date: Fri, 17 Mar 2023 18:15:39 +0000 Subject: [PATCH 01/18] First crack at switching to h5py for reading in Borealis site files. * Haven't added any code for dealing with data_descriptors or correlation_descriptors fields (they are finnicky) * Updated h5py dependency to need >= v3.3.0 * Have not tested whatsoever --- pydarnio/borealis/base_format.py | 53 ++++++++++++++++++++++++++++++ pydarnio/borealis/borealis_site.py | 46 ++++++++------------------ setup.py | 4 +-- 3 files changed, 69 insertions(+), 34 deletions(-) diff --git a/pydarnio/borealis/base_format.py b/pydarnio/borealis/base_format.py index ca8e97d..297c943 100644 --- a/pydarnio/borealis/base_format.py +++ b/pydarnio/borealis/base_format.py @@ -1229,6 +1229,59 @@ class methods used inside this method should be specific return timestamp_dict + @classmethod + def _read_borealis_records(cls, filename: str) -> OrderedDict: + """ + Base function for reading in a Borealis site file. + + Parameters + ---------- + filename: str + Name of the file to load records from + + Returns + ------- + OrderedDict + a dict of timestamped records loaded from an hdf5 Borealis site file + + Raises + ------ + OSError: file does not exist + + Notes + ----- + The results will differ based on the format class, as many of the + class methods used inside this method should be specific + to the format and updated in the child class. + """ + records = OrderedDict() + with h5py.File(filename, 'r') as f: + record_keys = sorted(list(f.keys())) + for rec_key in record_keys: + rec_dict = {} + group = f[rec_key] + + # Get the datasets (vector fields) + datasets = list(group.keys()) + for dset_name in datasets: + dset = group[dset_name][:] + # TODO: Handle data_descriptors, correlation_descriptors fields (they are gross) + rec_dict[dset_name] = dset + + # Get the attributes (scalar fields) + attribute_dict = {k: v for k, v in group.attrs.items()} + attribute_dict.pop('CLASS') # Inherent to HDF5 file + attribute_dict.pop('TITLE') # Inherent to HDF5 file + attribute_dict.pop('VERSION') # Inherent to HDF5 file + for k, v in attribute_dict.items(): + if isinstance(v, bytes): + attribute_dict[k] = v.tobytes().decode('utf-8') + rec_dict.update(attribute_dict) + + records[rec_key] = rec_dict + + return records + # STATIC METHODS COMMON ACROSS FORMATS # i.e. common methods that can be used by multiple formats in restructuring # (generally these will be used in the unshared fields dims for arrays) diff --git a/pydarnio/borealis/borealis_site.py b/pydarnio/borealis/borealis_site.py index 0c5b2b3..b20d02d 100644 --- a/pydarnio/borealis/borealis_site.py +++ b/pydarnio/borealis/borealis_site.py @@ -125,10 +125,11 @@ def __init__(self, filename: str, borealis_filetype: str): # 'vX.X' try: - version = dd.io.load(self.filename, - group='/'+self._record_names[0] - )['borealis_git_hash'].split('-')[0] - version = '.'.join(version.split('.')[:2]) # vX.Y, ignore patch revision + with h5py.File(self.filename, 'r') as f: + records = sorted(list(f.keys())) + first_rec = f[records[0]] + full_version = first_rec.attrs['borealis_git_hash'].decode('utf-8').split('-')[0] + version = '.'.join(version.split('.')[:2]) # vX.Y, ignore patch revision except (IndexError, ValueError) as err: # if this is an array style file, it will raise # IndexError on the array. @@ -247,36 +248,9 @@ def read_file(self) -> dict: records: OrderedDict{dict} records of Borealis rawacf data. 
Keys are first sequence timestamp (in ms since epoch). - """ - pyDARNio_log.info("Reading Borealis {} {} file: {}" - "".format(self.software_version, - self.borealis_filetype, self.filename)) - - attribute_types = self.format.site_single_element_types() - dataset_types = self.format.site_array_dtypes() - - self._read_borealis_records(attribute_types, dataset_types) - return self._records - - def _read_borealis_records(self, attribute_types: dict, - dataset_types: dict): - """ - Read the entire file while checking all data fields. - - Several Borealis field checks are done to insure the integrity of the - file. - - Parameters - ---------- - attribute_types: dict - Dictionary with the required types for the attributes in the file. - dataset_types: dict - Dictionary with the require dtypes for the numpy arrays in the - file. Raises ------ - OSError: file does not exist BorealisFieldMissingError - when a field is missing from the Borealis file/stream type BorealisExtraFieldError - when an extra field is present in the @@ -288,11 +262,19 @@ def _read_borealis_records(self, attribute_types: dict, -------- BorealisUtilities """ - records = dd.io.load(self.filename) + pyDARNio_log.info("Reading Borealis {} {} file: {}" + "".format(self.software_version, + self.borealis_filetype, self.filename)) + + attribute_types = self.format.site_single_element_types() + dataset_types = self.format.site_array_dtypes() + + records = self.format._read_borealis_records(self.filename) BorealisUtilities.check_records(self.filename, records, attribute_types, dataset_types) self._records = OrderedDict(sorted(records.items())) + return self._records class BorealisSiteWrite(): diff --git a/setup.py b/setup.py index 7034341..1af436b 100644 --- a/setup.py +++ b/setup.py @@ -49,8 +49,8 @@ author="SuperDARN", include_package_data=True, setup_requires=['pyyaml', 'numpy', - 'h5py', 'deepdish', 'pathlib2'], + 'h5py>=3.3.0', 'deepdish', 'pathlib2'], # pyyaml library install install_requires=['pyyaml', 'numpy', - 'h5py', 'deepdish', 'pathlib2'] + 'h5py>=3.3.0', 'deepdish', 'pathlib2'] ) From 7c53e99eef0134a763ff29708e7b5fe43d303d75 Mon Sep 17 00:00:00 2001 From: Remington Rohel Date: Mon, 20 Mar 2023 15:34:25 +0000 Subject: [PATCH 02/18] Added logic for unpacking deepdish strings from HDF5 files. --- pydarnio/borealis/base_format.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/pydarnio/borealis/base_format.py b/pydarnio/borealis/base_format.py index 297c943..f1ec6c8 100644 --- a/pydarnio/borealis/base_format.py +++ b/pydarnio/borealis/base_format.py @@ -1264,9 +1264,13 @@ class methods used inside this method should be specific # Get the datasets (vector fields) datasets = list(group.keys()) for dset_name in datasets: - dset = group[dset_name][:] - # TODO: Handle data_descriptors, correlation_descriptors fields (they are gross) - rec_dict[dset_name] = dset + dset = group[dset_name] + if 'strtype' in dset.attrs: # string type, requires some handling + itemsize = dset.attrs['itemsize'] + data = dset[:].view(dtype=(np.unicode_, itemsize)) + else: + data = dset[:] # non-string, can simply load + rec_dict[dset_name] = data # Get the attributes (scalar fields) attribute_dict = {k: v for k, v in group.attrs.items()} From 0ef1553401dead4907522c40edfac534377daf29 Mon Sep 17 00:00:00 2001 From: Remington Rohel Date: Mon, 20 Mar 2023 15:54:19 +0000 Subject: [PATCH 03/18] Fixed a bug with reading in the version. 
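
The bug: the first patch stored the parsed hash in 'full_version' but then
split the still-undefined local 'version', so every site-file read raised
UnboundLocalError during version detection. For illustration, a minimal
standalone sketch of the corrected logic (the 'borealis_git_hash' record
attribute and the site-file layout are as assumed throughout this series):

    import h5py

    def site_file_version(filename: str) -> str:
        """Return the Borealis version of a site file as 'vX.Y'."""
        with h5py.File(filename, 'r') as f:
            # Site files hold one top-level group per record; each record
            # carries a 'borealis_git_hash' attribute such as 'v0.6.1-<sha>'.
            first_rec = f[sorted(f.keys())[0]]
            git_hash = first_rec.attrs['borealis_git_hash']
            if isinstance(git_hash, bytes):  # h5py may return fixed-length
                git_hash = git_hash.decode('utf-8')  # strings as bytes
            full_version = git_hash.split('-')[0]         # e.g. 'v0.6.1'
            return '.'.join(full_version.split('.')[:2])  # 'v0.6', drop patch
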
--- pydarnio/borealis/borealis_site.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pydarnio/borealis/borealis_site.py b/pydarnio/borealis/borealis_site.py index b20d02d..cc9faa8 100644 --- a/pydarnio/borealis/borealis_site.py +++ b/pydarnio/borealis/borealis_site.py @@ -129,7 +129,7 @@ def __init__(self, filename: str, borealis_filetype: str): records = sorted(list(f.keys())) first_rec = f[records[0]] full_version = first_rec.attrs['borealis_git_hash'].decode('utf-8').split('-')[0] - version = '.'.join(version.split('.')[:2]) # vX.Y, ignore patch revision + version = '.'.join(full_version.split('.')[:2]) # vX.Y, ignore patch revision except (IndexError, ValueError) as err: # if this is an array style file, it will raise # IndexError on the array. From c0df902c0962716e8104895963f9ba8f3c144519 Mon Sep 17 00:00:00 2001 From: carleyjmartin Date: Mon, 20 Mar 2023 16:20:35 -0600 Subject: [PATCH 04/18] all instances of dd changed to h5py, not tested --- .gitignore | 1 + pydarnio/borealis/borealis_array.py | 26 +++++++++------ pydarnio/borealis/borealis_convert.py | 1 - pydarnio/borealis/borealis_restructure.py | 39 ++++++++++++----------- pydarnio/borealis/borealis_site.py | 6 ++-- pydarnio/borealis/borealis_utilities.py | 15 +++++---- 6 files changed, 50 insertions(+), 38 deletions(-) diff --git a/.gitignore b/.gitignore index b6e4761..ed783b6 100644 --- a/.gitignore +++ b/.gitignore @@ -2,6 +2,7 @@ __pycache__/ *.py[cod] *$py.class +*.DS_Store # C extensions *.so diff --git a/pydarnio/borealis/borealis_array.py b/pydarnio/borealis/borealis_array.py index 4ff6afd..0875406 100644 --- a/pydarnio/borealis/borealis_array.py +++ b/pydarnio/borealis/borealis_array.py @@ -37,7 +37,7 @@ For more information on Borealis data files and how they convert to SDarn files, see: https://borealis.readthedocs.io/en/latest/ """ -import deepdish as dd +import h5py import logging from typing import List @@ -115,9 +115,11 @@ def __init__(self, filename: str, borealis_filetype: str): # get the version of the file - split by the dash, first part should be # 'vX.X' try: - version = dd.io.load(self.filename, - group='/borealis_git_hash').split('-')[0] - version = '.'.join(version.split('.')[:2]) # vX.Y, ignore patch revision + with h5py.File(self.filename, 'r') as f: + records = sorted(list(f.keys())) + first_rec = f[records[0]] + full_version = first_rec.attrs['borealis_git_hash'].decode('utf-8').split('-')[0] + version = '.'.join(full_version.split('.')[:2]) # vX.Y, ignore patch revision except ValueError as err: raise borealis_exceptions.BorealisStructureError( ' {} Could not find the borealis_git_hash required to ' @@ -279,11 +281,14 @@ def _read_borealis_arrays(self, attribute_types: dict, -------- BorealisUtilities """ - arrays = dd.io.load(self.filename) - BorealisUtilities.check_arrays(self.filename, arrays, - attribute_types, dataset_types, - unshared_fields) - self._arrays = arrays + attr_types = self.format.site_single_element_types() + dataset_types = self.format.site_array_dtypes() + records = self.format._read_borealis_records + while h5py.File(self.filename, 'r') as arrays: + BorealisUtilities.check_arrays(self.filename, arrays, + attribute_types, dataset_types, + unshared_fields) + self._arrays = arrays class BorealisArrayWrite(): @@ -520,4 +525,5 @@ def _write_borealis_arrays(self, attribute_types: dict, BorealisUtilities.check_arrays(self.filename, self.arrays, attribute_types, dataset_types, unshared_fields) - dd.io.save(self.filename, self.arrays, compression=self.compression) + 
with h5py.File(self.filename, 'w') as f: + f.create_dataset(self.arrays, compression=self.compression) diff --git a/pydarnio/borealis/borealis_convert.py b/pydarnio/borealis/borealis_convert.py index 785b129..3e8eda3 100644 --- a/pydarnio/borealis/borealis_convert.py +++ b/pydarnio/borealis/borealis_convert.py @@ -42,7 +42,6 @@ """ import logging import numpy as np -import deepdish as dd from datetime import datetime from typing import Union diff --git a/pydarnio/borealis/borealis_restructure.py b/pydarnio/borealis/borealis_restructure.py index 228d608..aed2938 100755 --- a/pydarnio/borealis/borealis_restructure.py +++ b/pydarnio/borealis/borealis_restructure.py @@ -36,7 +36,6 @@ import warnings from pathlib import Path import h5py -import deepdish as dd import logging import numpy as np from datetime import datetime @@ -210,20 +209,23 @@ def _array_to_site_restructure(self): try: shared_fields_dict = dict() # shared fields are common across records, so this is done once - for field in self.format.shared_fields(): - field_data = dd.io.load(self.infile_name, '/{}'.format(field)) - shared_fields_dict[field] = field_data + with hdf5.File(self.infile_name, 'r') as f: + for field in self.format.shared_fields(): + shared_fields_dict[field] = f[field] unshared_single_elements = dict() # These are fields which have one element per record, so the # arrays are small enough to be loaded completely into memory - for field in self.format.unshared_fields(): - if field in self.format.single_element_types(): - unshared_single_elements[field] = dd.io.load( - self.infile_name, '/{}'.format(field)) + with hdf5.File(self.infile_name, 'r') as f: + for field in self.format.unshared_fields(): + if field in self.format.single_element_types(): + unshared_single_elements[field] = f[field] - sqn_timestamps_array = dd.io.load(self.infile_name, - '/sqn_timestamps') + with h5py.File(self.infile_name, 'r') as f: + records = sorted(list(f.keys())) + first_rec = f[records[0]] + sqn_timestamps_array = first_rec.attrs['sqn_timestamps'] + .decode('utf-8') for record_num, seq_timestamp in enumerate(sqn_timestamps_array): # format dictionary key in the same way it is done # in datawrite on site @@ -279,7 +281,8 @@ def _array_to_site_restructure(self): index_slice = tuple(index_slice) # If there was an incorrect dimension (-1 in dims), then use deepdish to extract the field if field_flag: - record_dict[field] = dd.io.load(self.infile_name, f'/{field}')[index_slice] + with h5py.File(self.infile_name) as f: + record_dict[field] = f[field][index_slice] else: record_dict[field] = f[field][index_slice] # Wrap in another dict to use the format method @@ -333,7 +336,7 @@ def _site_to_array_restructure(self): rec_dict.update({k: record.attrs[k] for k in rec_attrs}) # Bitwise fields also need to be handled separately for field in self.format.bool_types(): - rec_dict[field] = dd.io.load(self.infile_name, f'/{record_name}/{field}') + rec_dict[field] = f[record_name][field] # some fields are linear in site style and need to be reshaped. 
# Pass in record nested in a dictionary, as @@ -360,7 +363,7 @@ def _site_to_array_restructure(self): raise TypeError(f'Field {field} has unrecognized data: {value}') elif field in self.format.array_string_fields(): # h5py reads numpy string arrays as contiguous unsigned ints, so we need deepdish here - new_data_dict[field] = dd.io.load(self.infile_name, f'/{record_name}/{field}') + new_data_dict[field] = f[record_name][field] else: raise TypeError(f'Field {field} unrecognized') @@ -436,8 +439,8 @@ def _site_to_array_restructure(self): BorealisUtilities.check_arrays(self.infile_name, new_data_dict, attribute_types, dataset_types, unshared_fields) - dd.io.save(self.outfile_name, new_data_dict, - compression=self.compression) + while h5py.File(self.outfile_name, 'w') as f: + f.create_dataset(new_data_dict, compression=self.compression) except TypeError as err: raise borealis_exceptions.BorealisRestructureError( @@ -487,9 +490,9 @@ def _write_borealis_record(self, record: dict, record_name: str, tmp_filename = self.outfile_name + '.tmp' Path(tmp_filename).touch() - dd.io.save(tmp_filename, record[record_name], - compression=self.compression) - f = dd.io.load(tmp_filename, '/') + while h5py.File(tmp_filename, 'w') as f: + f.create_dataset(record[record_name], compression=self.compression) + cp_cmd = 'h5copy -i {newfile} -o {full_file} -s / -d {dtstr}' cmd = cp_cmd.format(newfile=tmp_filename, full_file=self.outfile_name, dtstr=record_name) diff --git a/pydarnio/borealis/borealis_site.py b/pydarnio/borealis/borealis_site.py index cc9faa8..6470ad2 100644 --- a/pydarnio/borealis/borealis_site.py +++ b/pydarnio/borealis/borealis_site.py @@ -35,7 +35,6 @@ Add compression to bzip2 """ -import deepdish as dd import h5py import logging import os @@ -523,8 +522,9 @@ def _write_borealis_records(self, attribute_types: dict, tmp_filename = self.filename + '.tmp' Path(tmp_filename).touch() for group_name, group_dict in self.records.items(): - dd.io.save(tmp_filename, {str(group_name): group_dict}, - compression=self.compression) + with h5py.File(tmp_filename, 'w') as f: + f.create_dataset(str(group_name), data=group_dict, + compression=self.compression) cp_cmd = 'h5copy -i {newfile} -o {full_file} -s {dtstr} -d {dtstr}' cmd = cp_cmd.format(newfile=tmp_filename, full_file=self.filename, dtstr='/'+str(group_name)) diff --git a/pydarnio/borealis/borealis_utilities.py b/pydarnio/borealis/borealis_utilities.py index 10929c2..0e5856c 100644 --- a/pydarnio/borealis/borealis_utilities.py +++ b/pydarnio/borealis/borealis_utilities.py @@ -28,7 +28,6 @@ """ import logging -import deepdish as dd import h5py import numpy as np import sys @@ -560,8 +559,10 @@ def get_borealis_version(filename: str, record_names, structure: str): """ if structure == 'array': try: - borealis_git_hash = dd.io.load(filename, - group='/borealis_git_hash') + with h5py.File(self.filename, 'r') as f: + records = sorted(list(f.keys())) + borealis_git_hash = records.attrs['borealis_git_hash'] + .decode('utf-8') except ValueError as err: raise borealis_exceptions.BorealisStructureError( ' {} Could not find the borealis_git_hash required to ' @@ -569,9 +570,11 @@ def get_borealis_version(filename: str, record_names, structure: str): ''.format(filename, err)) from err elif structure == 'site': try: - borealis_git_hash = \ - dd.io.load(filename, group='/{}/borealis_git_hash' - ''.format(record_names[0])) + with h5py.File(self.filename, 'r') as f: + records = sorted(list(f.keys())) + first_rec = f[records[0]] + borealis_git_hash = 
first_rec.attrs['borealis_git_hash'] + .decode('utf-8') except ValueError as err: raise borealis_exceptions.BorealisStructureError( ' {} Could not find the borealis_git_hash required to ' From dce1d6c4b1b55d69d50a98f1221cbe4dad19061d Mon Sep 17 00:00:00 2001 From: carleyjmartin Date: Tue, 21 Mar 2023 10:30:13 -0600 Subject: [PATCH 05/18] git hash in borealis_array working --- pydarnio/borealis/base_format.py | 2 +- pydarnio/borealis/borealis_array.py | 7 +++---- pydarnio/borealis/borealis_restructure.py | 8 ++++---- pydarnio/borealis/borealis_utilities.py | 4 ++-- 4 files changed, 10 insertions(+), 11 deletions(-) diff --git a/pydarnio/borealis/base_format.py b/pydarnio/borealis/base_format.py index f1ec6c8..c00bc91 100644 --- a/pydarnio/borealis/base_format.py +++ b/pydarnio/borealis/base_format.py @@ -37,12 +37,12 @@ """ import copy +import h5py import numpy as np from collections import OrderedDict from datetime import datetime from typing import Callable, List -import h5py from pydarnio import borealis_exceptions diff --git a/pydarnio/borealis/borealis_array.py b/pydarnio/borealis/borealis_array.py index 0875406..0ba4045 100644 --- a/pydarnio/borealis/borealis_array.py +++ b/pydarnio/borealis/borealis_array.py @@ -37,6 +37,7 @@ For more information on Borealis data files and how they convert to SDarn files, see: https://borealis.readthedocs.io/en/latest/ """ +import deepdish as dd import h5py import logging @@ -116,9 +117,7 @@ def __init__(self, filename: str, borealis_filetype: str): # 'vX.X' try: with h5py.File(self.filename, 'r') as f: - records = sorted(list(f.keys())) - first_rec = f[records[0]] - full_version = first_rec.attrs['borealis_git_hash'].decode('utf-8').split('-')[0] + full_version = f.attrs['borealis_git_hash'].decode('utf-8').split('-')[0] version = '.'.join(full_version.split('.')[:2]) # vX.Y, ignore patch revision except ValueError as err: raise borealis_exceptions.BorealisStructureError( @@ -284,7 +283,7 @@ def _read_borealis_arrays(self, attribute_types: dict, attr_types = self.format.site_single_element_types() dataset_types = self.format.site_array_dtypes() records = self.format._read_borealis_records - while h5py.File(self.filename, 'r') as arrays: + with h5py.File(self.filename, 'r') as arrays: BorealisUtilities.check_arrays(self.filename, arrays, attribute_types, dataset_types, unshared_fields) diff --git a/pydarnio/borealis/borealis_restructure.py b/pydarnio/borealis/borealis_restructure.py index aed2938..54204e2 100755 --- a/pydarnio/borealis/borealis_restructure.py +++ b/pydarnio/borealis/borealis_restructure.py @@ -219,12 +219,12 @@ def _array_to_site_restructure(self): with hdf5.File(self.infile_name, 'r') as f: for field in self.format.unshared_fields(): if field in self.format.single_element_types(): - unshared_single_elements[field] = f[field] + unshared_single_elements[field] = f[field] with h5py.File(self.infile_name, 'r') as f: records = sorted(list(f.keys())) first_rec = f[records[0]] - sqn_timestamps_array = first_rec.attrs['sqn_timestamps'] + sqn_timestamps_array = first_rec.attrs['sqn_timestamps']\ .decode('utf-8') for record_num, seq_timestamp in enumerate(sqn_timestamps_array): # format dictionary key in the same way it is done @@ -439,7 +439,7 @@ def _site_to_array_restructure(self): BorealisUtilities.check_arrays(self.infile_name, new_data_dict, attribute_types, dataset_types, unshared_fields) - while h5py.File(self.outfile_name, 'w') as f: + with h5py.File(self.outfile_name, 'w') as f: f.create_dataset(new_data_dict, 
compression=self.compression) except TypeError as err: @@ -490,7 +490,7 @@ def _write_borealis_record(self, record: dict, record_name: str, tmp_filename = self.outfile_name + '.tmp' Path(tmp_filename).touch() - while h5py.File(tmp_filename, 'w') as f: + with h5py.File(tmp_filename, 'w') as f: f.create_dataset(record[record_name], compression=self.compression) cp_cmd = 'h5copy -i {newfile} -o {full_file} -s / -d {dtstr}' diff --git a/pydarnio/borealis/borealis_utilities.py b/pydarnio/borealis/borealis_utilities.py index 0e5856c..3ac1e52 100644 --- a/pydarnio/borealis/borealis_utilities.py +++ b/pydarnio/borealis/borealis_utilities.py @@ -561,7 +561,7 @@ def get_borealis_version(filename: str, record_names, structure: str): try: with h5py.File(self.filename, 'r') as f: records = sorted(list(f.keys())) - borealis_git_hash = records.attrs['borealis_git_hash'] + borealis_git_hash = records.attrs['borealis_git_hash']\ .decode('utf-8') except ValueError as err: raise borealis_exceptions.BorealisStructureError( @@ -573,7 +573,7 @@ def get_borealis_version(filename: str, record_names, structure: str): with h5py.File(self.filename, 'r') as f: records = sorted(list(f.keys())) first_rec = f[records[0]] - borealis_git_hash = first_rec.attrs['borealis_git_hash'] + borealis_git_hash = first_rec.attrs['borealis_git_hash']\ .decode('utf-8') except ValueError as err: raise borealis_exceptions.BorealisStructureError( From b70c9ed5a77ee904feb52802718c78e562315aea Mon Sep 17 00:00:00 2001 From: Remington Rohel Date: Tue, 21 Mar 2023 17:45:23 +0000 Subject: [PATCH 06/18] Modifying expected types based on reading in with h5py. * Can now read in site file and convert to array format. * Still need to fix reading/writing array format. --- pydarnio/borealis/base_format.py | 4 +- pydarnio/borealis/borealis_array.py | 2 +- pydarnio/borealis/borealis_formats.py | 70 ++++++++++++------------- pydarnio/borealis/borealis_utilities.py | 9 ++-- 4 files changed, 44 insertions(+), 41 deletions(-) diff --git a/pydarnio/borealis/base_format.py b/pydarnio/borealis/base_format.py index c00bc91..5f17b02 100644 --- a/pydarnio/borealis/base_format.py +++ b/pydarnio/borealis/base_format.py @@ -1102,7 +1102,7 @@ class methods used inside this method should be specific datatype = cls.single_element_types()[field] else: # field in array_dtypes datatype = cls.array_dtypes()[field] - if datatype == np.unicode_: + if datatype == str: # unicode type needs to be explicitly set to have # multiple chars (256) datatype='|U256' @@ -1110,7 +1110,7 @@ class methods used inside this method should be specific # Some indices may not be filled due to dimensions that are maximum values (num_sequences, etc. can change # between records), so they are initialized with a known value first. # Initialize floating-point values to NaN, and integer values to -1. 
- if datatype is np.int64 or datatype is np.uint32: + if datatype is np.int64 or datatype is np.uint32 or datatype is np.uint8: empty_array[:] = -1 else: empty_array[:] = np.NaN diff --git a/pydarnio/borealis/borealis_array.py b/pydarnio/borealis/borealis_array.py index 0ba4045..d955c17 100644 --- a/pydarnio/borealis/borealis_array.py +++ b/pydarnio/borealis/borealis_array.py @@ -119,7 +119,7 @@ def __init__(self, filename: str, borealis_filetype: str): with h5py.File(self.filename, 'r') as f: full_version = f.attrs['borealis_git_hash'].decode('utf-8').split('-')[0] version = '.'.join(full_version.split('.')[:2]) # vX.Y, ignore patch revision - except ValueError as err: + except KeyError as err: raise borealis_exceptions.BorealisStructureError( ' {} Could not find the borealis_git_hash required to ' 'determine read version (file may be site style) {}' diff --git a/pydarnio/borealis/borealis_formats.py b/pydarnio/borealis/borealis_formats.py index a247a85..ccc720e 100644 --- a/pydarnio/borealis/borealis_formats.py +++ b/pydarnio/borealis/borealis_formats.py @@ -262,19 +262,19 @@ def single_element_types(cls): return { # Identifies the version of Borealis that made this data. Necessary # for all versions. - "borealis_git_hash": np.unicode_, + "borealis_git_hash": str, # Number used to identify experiment. "experiment_id": np.int64, # Name of the experiment file. - "experiment_name": np.unicode_, + "experiment_name": str, # Comment about the whole experiment - "experiment_comment": np.unicode_, + "experiment_comment": str, # Additional text comment that describes the slice. - "slice_comment": np.unicode_, + "slice_comment": str, # Number of slices in the experiment at this integration time. "num_slices": np.int64, # Three letter radar identifier. - "station": np.unicode_, + "station": str, # Number of sampling periods in the integration time. "num_sequences": np.int64, # range gate separation (equivalent distance between samples), km. @@ -286,7 +286,7 @@ def single_element_types(cls): # Sampling rate of the samples being written to file in Hz. "rx_sample_rate": np.float64, # Designates if the record is the first in a scan. - "scan_start_marker": np.bool_, + "scan_start_marker": np.uint8, # Integration time in seconds. "int_time": np.float32, # Length of the pulse in microseconds. @@ -302,7 +302,7 @@ def single_element_types(cls): "freq": np.uint32, # str denoting C data type of the samples included in the data # array, such as 'complex float'. - "samples_data_type": np.unicode_, + "samples_data_type": str, # data normalization factor determined by the filter scaling in the # decimation scheme. "data_normalization_factor": np.float64, @@ -672,25 +672,25 @@ def single_element_types(cls): return { # Identifies the version of Borealis that made this data. Necessary # for all versions. - "borealis_git_hash": np.unicode_, + "borealis_git_hash": str, # Number used to identify experiment. "experiment_id": np.int64, # Name of the experiment file. - "experiment_name": np.unicode_, + "experiment_name": str, # Comment about the whole experiment - "experiment_comment": np.unicode_, + "experiment_comment": str, # Additional text comment that describes the slice. - "slice_comment": np.unicode_, + "slice_comment": str, # Number of slices in the experiment at this integration time. "num_slices": np.int64, # Three letter radar identifier. - "station": np.unicode_, + "station": str, # Number of sampling periods in the integration time. 
"num_sequences": np.int64, # Sampling rate of the samples being written to file in Hz. "rx_sample_rate": np.float64, # Designates if the record is the first in a scan. - "scan_start_marker": np.bool_, + "scan_start_marker": np.uint8, # Integration time in seconds. "int_time": np.float32, # Length of the pulse in microseconds. @@ -706,7 +706,7 @@ def single_element_types(cls): "freq": np.uint32, # str denoting C data type of the samples included in the data # array, such as 'complex float'. - "samples_data_type": np.unicode_, + "samples_data_type": str, # Number of samples in the sampling period. "num_samps": np.uint32, # range gate separation (equivalent distance between samples), km @@ -1074,25 +1074,25 @@ def single_element_types(cls): return { # Identifies the version of Borealis that made this data. Necessary # for all versions. - "borealis_git_hash": np.unicode_, + "borealis_git_hash": str, # Number used to identify experiment. "experiment_id": np.int64, # Name of the experiment file. - "experiment_name": np.unicode_, + "experiment_name": str, # Comment about the whole experiment - "experiment_comment": np.unicode_, + "experiment_comment": str, # Additional text comment that describes the slice. - "slice_comment": np.unicode_, + "slice_comment": str, # Number of slices in the experiment at this integration time. "num_slices": np.int64, # Three letter radar identifier. - "station": np.unicode_, + "station": str, # Number of sampling periods in the integration time. "num_sequences": np.int64, # Sampling rate of the samples being written to file in Hz. "rx_sample_rate": np.float64, # Designates if the record is the first in a scan. - "scan_start_marker": np.bool_, + "scan_start_marker": np.uint8, # Integration time in seconds. "int_time": np.float32, # Length of the pulse in microseconds. @@ -1108,7 +1108,7 @@ def single_element_types(cls): "freq": np.uint32, # str denoting C data type of the samples included in the data # array, such as 'complex float'. - "samples_data_type": np.unicode_, + "samples_data_type": str, # Number of samples in the sampling period. "num_samps": np.uint32, # data normalization factor determined by the filter scaling in the @@ -1402,23 +1402,23 @@ def single_element_types(cls): return { # Identifies the version of Borealis that made this data. Necessary # for all versions. - "borealis_git_hash": np.unicode_, + "borealis_git_hash": str, # Number used to identify experiment. "experiment_id": np.int64, # Name of the experiment file. - "experiment_name": np.unicode_, + "experiment_name": str, # Comment about the whole experiment - "experiment_comment": np.unicode_, + "experiment_comment": str, # Number of slices in the experiment at this integration time. "num_slices": np.int64, # Three letter radar identifier. - "station": np.unicode_, + "station": str, # Number of sampling periods in the integration time. "num_sequences": np.int64, # Sampling rate of the samples being written to file in Hz. "rx_sample_rate": np.float64, # Designates if the record is the first in a scan. - "scan_start_marker": np.bool_, + "scan_start_marker": np.uint8, # Integration time in seconds. "int_time": np.float32, # Number of main array antennas. @@ -1427,7 +1427,7 @@ def single_element_types(cls): "intf_antenna_count": np.uint32, # str denoting C data type of the samples included in the data # array, such as 'complex float'. 
- "samples_data_type": np.unicode_, + "samples_data_type": str, # The center frequency of this data in kHz "rx_center_freq": np.float64, # Number of samples in the sampling period. @@ -1513,12 +1513,12 @@ def single_element_types(cls): # the slice id of the file and dataset. "slice_id": np.uint32, # the interfacing of this slice to other slices. - "slice_interfacing": np.unicode_, + "slice_interfacing": str, # A string describing the type of scheduling time at the time of # this dataset. - "scheduling_mode": np.unicode_, + "scheduling_mode": str, # A string describing the averaging method, ex. mean, median - "averaging_method": np.unicode_, + "averaging_method": str, # number of blanked samples in the sequence. "num_blanked_samples": np.uint32 }) @@ -1663,10 +1663,10 @@ def single_element_types(cls): # the slice id of the file and dataset. "slice_id": np.uint32, # the interfacing of this slice to other slices. - "slice_interfacing": np.unicode_, + "slice_interfacing": str, # A string describing the type of scheduling time at the time of # this dataset. - "scheduling_mode": np.unicode_, + "scheduling_mode": str, # number of blanked samples in the sequence. "num_blanked_samples": np.uint32 }) @@ -1806,10 +1806,10 @@ def single_element_types(cls): # the slice id of the file and dataset. "slice_id": np.uint32, # the interfacing of this slice to other slices. - "slice_interfacing": np.unicode_, + "slice_interfacing": str, # A string describing the type of scheduling time at the time of # this dataset. - "scheduling_mode": np.unicode_, + "scheduling_mode": str, # number of blanked samples in the sequence. "num_blanked_samples": np.uint32 }) @@ -1955,7 +1955,7 @@ def single_element_types(cls): single_element_types.update({ # A string describing the type of scheduling time at the time of # this dataset. 
- "scheduling_mode": np.unicode_ + "scheduling_mode": str }) return single_element_types diff --git a/pydarnio/borealis/borealis_utilities.py b/pydarnio/borealis/borealis_utilities.py index 3ac1e52..b6abc16 100644 --- a/pydarnio/borealis/borealis_utilities.py +++ b/pydarnio/borealis/borealis_utilities.py @@ -262,7 +262,8 @@ def record_incorrect_types_check(filename: str, attributes_type_dict: dict, incorrect_types_check = {param: str(attributes_type_dict[param]) for param in attributes_type_dict.keys() if type(record[param]) != - attributes_type_dict[param]} + attributes_type_dict[param] and + record[param].shape is not None} incorrect_types_check.update({param: 'np.ndarray of ' + str(datasets_type_dict[param]) @@ -321,7 +322,8 @@ def array_incorrect_types_check(filename: str, attributes_type_dict: dict, incorrect_types_check = {param: str(attributes_type_dict[param]) for param in attributes_type_dict.keys() if type(file_data[param]) != - attributes_type_dict[param]} + attributes_type_dict[param] and + file_data[param].shape is not None} datasets_type_dict_keys = sorted(list(datasets_type_dict.keys())) np_array_types = [isinstance(file_data[param], np.ndarray) for param in @@ -342,7 +344,8 @@ def array_incorrect_types_check(filename: str, attributes_type_dict: dict, str(datasets_type_dict[param]) for param in datasets_type_dict.keys() if file_data[param].dtype.type != - datasets_type_dict[param]}) + datasets_type_dict[param] and + file_data[param].dtype.type != np.str_}) if len(incorrect_types_check) > 0: raise borealis_exceptions.\ BorealisDataFormatTypeError(filename, From 6fafb57898ce6be5c06d1163c75023a06eecb720 Mon Sep 17 00:00:00 2001 From: Remington Rohel Date: Tue, 21 Mar 2023 19:34:05 +0000 Subject: [PATCH 07/18] Fixed bugs converting from site to array structures. 
* Bools stored as uint8 * Reading in arrays of strings requires some care --- pydarnio/borealis/borealis_array.py | 12 +++++++++++- pydarnio/borealis/borealis_formats.py | 8 ++++---- pydarnio/borealis/borealis_restructure.py | 22 ++++++++++++++++------ pydarnio/borealis/borealis_utilities.py | 10 ++++------ 4 files changed, 35 insertions(+), 17 deletions(-) diff --git a/pydarnio/borealis/borealis_array.py b/pydarnio/borealis/borealis_array.py index d955c17..8b3719e 100644 --- a/pydarnio/borealis/borealis_array.py +++ b/pydarnio/borealis/borealis_array.py @@ -40,6 +40,7 @@ import deepdish as dd import h5py import logging +import numpy as np from typing import List @@ -525,4 +526,13 @@ def _write_borealis_arrays(self, attribute_types: dict, attribute_types, dataset_types, unshared_fields) with h5py.File(self.filename, 'w') as f: - f.create_dataset(self.arrays, compression=self.compression) + for k, v in self.arrays.items(): + if k in attribute_types: + f.attrs[k] = v + elif v.dtype.type == np.str_: + itemsize = v.dtype.itemsize // 4 # every character is 4 bytes + dset = f.create_dataset(k, data=v.view(dtype=(np.uint8)), compression=self.compression) + dset.attrs['strtype'] = b'unicode' + dset.attrs['itemsize'] = itemsize + else: + f.create_dataset(k, data=v, compression=self.compression) diff --git a/pydarnio/borealis/borealis_formats.py b/pydarnio/borealis/borealis_formats.py index ccc720e..e664756 100644 --- a/pydarnio/borealis/borealis_formats.py +++ b/pydarnio/borealis/borealis_formats.py @@ -2044,7 +2044,7 @@ def single_element_types(cls): "lp_status_word": np.uint32, # Boolean indicating if the GPS was locked during the entire # integration period - "gps_locked": np.bool_, + "gps_locked": np.uint8, # The max time diffe between GPS and system time during the # integration period. In seconds. Negative if GPS time ahead. "gps_to_system_time_diff": np.float64, @@ -2152,7 +2152,7 @@ def single_element_types(cls): "lp_status_word": np.uint32, # Boolean indicating if the GPS was locked during the entire # integration period - "gps_locked": np.bool_, + "gps_locked": np.uint8, # The max time diffe between GPS and system time during the # integration period. In seconds. Negative if GPS time ahead. "gps_to_system_time_diff": np.float64, @@ -2267,7 +2267,7 @@ def single_element_types(cls): "lp_status_word": np.uint32, # Boolean indicating if the GPS was locked during the entire # integration period - "gps_locked": np.bool_, + "gps_locked": np.uint8, # The max time diffe between GPS and system time during the # integration period. In seconds. Negative if GPS time ahead. "gps_to_system_time_diff": np.float64, @@ -2374,7 +2374,7 @@ def single_element_types(cls): "lp_status_word": np.uint32, # Boolean indicating if the GPS was locked during the entire # integration period - "gps_locked": np.bool_, + "gps_locked": np.uint8, # The max time diffe between GPS and system time during the # integration period. In seconds. Negative if GPS time ahead. 
"gps_to_system_time_diff": np.float64, diff --git a/pydarnio/borealis/borealis_restructure.py b/pydarnio/borealis/borealis_restructure.py index 54204e2..fb583f8 100755 --- a/pydarnio/borealis/borealis_restructure.py +++ b/pydarnio/borealis/borealis_restructure.py @@ -362,8 +362,9 @@ def _site_to_array_restructure(self): else: raise TypeError(f'Field {field} has unrecognized data: {value}') elif field in self.format.array_string_fields(): - # h5py reads numpy string arrays as contiguous unsigned ints, so we need deepdish here - new_data_dict[field] = f[record_name][field] + dset = f[record_name][field] + itemsize = dset.attrs['itemsize'] + new_data_dict[field] = dset[:].view(dtype=(np.unicode_, itemsize)) else: raise TypeError(f'Field {field} unrecognized') @@ -377,7 +378,7 @@ def _site_to_array_restructure(self): # Initialize array now with correct data type. dtype = self.format.single_element_types()[field] new_data_dict[field] = np.empty(num_records, dtype=dtype) - if dtype is np.int64 or dtype is np.uint32: + if dtype is np.int64 or dtype is np.uint32 or dtype is np.uint8: new_data_dict[field][:] = -1 else: new_data_dict[field][:] = np.NaN @@ -399,7 +400,7 @@ def _site_to_array_restructure(self): datatype = self.format.single_element_types()[field] else: # field in array_dtypes datatype = self.format.array_dtypes()[field] - if datatype == np.unicode_: + if datatype == str: # unicode type needs to be explicitly set to # have multiple chars (256) datatype = '|U256' @@ -409,7 +410,7 @@ def _site_to_array_restructure(self): # change between records), so they are initialized # with a known value first. Initialize floating- # point values to NaN, and integer values to -1. - if datatype is np.int64 or datatype is np.uint32: + if datatype is np.int64 or datatype is np.uint32 or datatype is np.uint8: empty_array[:] = -1 else: empty_array[:] = np.NaN @@ -440,7 +441,16 @@ def _site_to_array_restructure(self): attribute_types, dataset_types, unshared_fields) with h5py.File(self.outfile_name, 'w') as f: - f.create_dataset(new_data_dict, compression=self.compression) + for k, v in new_data_dict.items(): + if k in attribute_types: + f.attrs[k] = v + elif v.dtype.type == np.str_: + itemsize = v.dtype.itemsize // 4 # every character is 4 bytes + dset = f.create_dataset(k, data=v.view(dtype=(np.uint8)), compression=self.compression) + dset.attrs['strtype'] = b'unicode' + dset.attrs['itemsize'] = itemsize + else: + f.create_dataset(k, data=v, compression=self.compression) except TypeError as err: raise borealis_exceptions.BorealisRestructureError( diff --git a/pydarnio/borealis/borealis_utilities.py b/pydarnio/borealis/borealis_utilities.py index b6abc16..bc2e636 100644 --- a/pydarnio/borealis/borealis_utilities.py +++ b/pydarnio/borealis/borealis_utilities.py @@ -563,22 +563,20 @@ def get_borealis_version(filename: str, record_names, structure: str): if structure == 'array': try: with h5py.File(self.filename, 'r') as f: - records = sorted(list(f.keys())) - borealis_git_hash = records.attrs['borealis_git_hash']\ - .decode('utf-8') - except ValueError as err: + borealis_git_hash = f.attrs['borealis_git_hash'].decode('utf-8') + except KeyError as err: raise borealis_exceptions.BorealisStructureError( ' {} Could not find the borealis_git_hash required to ' 'determine file version. Data file may be corrupted. 
{}' ''.format(filename, err)) from err elif structure == 'site': try: - with h5py.File(self.filename, 'r') as f: + with h5py.File(filename, 'r') as f: records = sorted(list(f.keys())) first_rec = f[records[0]] borealis_git_hash = first_rec.attrs['borealis_git_hash']\ .decode('utf-8') - except ValueError as err: + except KeyError as err: raise borealis_exceptions.BorealisStructureError( ' {} Could not find the borealis_git_hash required to ' 'determine file version. Data file may be corrupted. {}' From 9ee37fb51e2c3a080ec8de573418dc121470df38 Mon Sep 17 00:00:00 2001 From: Remington Rohel Date: Tue, 21 Mar 2023 20:40:46 +0000 Subject: [PATCH 08/18] Fixed all the rest of the bugs. * Can now read/write array structured files. * Can also restructure freely between site and array files, both with BorealisRestructure and BorealisRead plus .records or .arrays --- pydarnio/borealis/base_format.py | 53 +++++++++ pydarnio/borealis/borealis_array.py | 50 +------- pydarnio/borealis/borealis_restructure.py | 138 +++++++++++----------- pydarnio/borealis/borealis_site.py | 41 +++---- pydarnio/borealis/borealis_utilities.py | 2 +- 5 files changed, 146 insertions(+), 138 deletions(-) diff --git a/pydarnio/borealis/base_format.py b/pydarnio/borealis/base_format.py index 5f17b02..b5d23bb 100644 --- a/pydarnio/borealis/base_format.py +++ b/pydarnio/borealis/base_format.py @@ -1286,6 +1286,59 @@ class methods used inside this method should be specific return records + @classmethod + def _read_borealis_arrays(cls, filename: str) -> OrderedDict: + """ + Base function for reading in a Borealis array file. + + Parameters + ---------- + filename: str + Name of the file to load arrays from + + Returns + ------- + OrderedDict + a dict of arrays loaded from an hdf5 Borealis array file + + Raises + ------ + OSError: file does not exist + + Notes + ----- + The results will differ based on the format class, as many of the + class methods used inside this method should be specific + to the format and updated in the child class. + """ + arrays = OrderedDict() + with h5py.File(filename, 'r') as f: + + # Get the datasets (vector fields) + array_names = sorted(list(f.keys())) + for array_name in array_names: + dset = f[array_name] + if 'strtype' in dset.attrs: # string type, requires some handling + itemsize = dset.attrs['itemsize'] + data = dset[:].view(dtype=(np.unicode_, itemsize)) + else: + data = dset[:] # non-string, can simply load + arrays[array_name] = data + + # Get the attributes (scalar fields) + attribute_dict = {k: v for k, v in f.attrs.items()} + attribute_dict.pop('CLASS') # Inherent to HDF5 file + attribute_dict.pop('TITLE') # Inherent to HDF5 file + attribute_dict.pop('VERSION') # Inherent to HDF5 file + attribute_dict.pop('DEEPDISH_IO_VERSION') # Inherent to HDF5 file + attribute_dict.pop('PYTABLES_FORMAT_VERSION') # Inherent to HDF5 file + for k, v in attribute_dict.items(): + if isinstance(v, bytes): + attribute_dict[k] = v.tobytes().decode('utf-8') + arrays.update(attribute_dict) + + return arrays + # STATIC METHODS COMMON ACROSS FORMATS # i.e. 
common methods that can be used by multiple formats in restructuring # (generally these will be used in the unshared fields dims for arrays) diff --git a/pydarnio/borealis/borealis_array.py b/pydarnio/borealis/borealis_array.py index 8b3719e..5f193ae 100644 --- a/pydarnio/borealis/borealis_array.py +++ b/pydarnio/borealis/borealis_array.py @@ -244,51 +244,13 @@ def read_file(self) -> dict: dataset_types = self.format.array_array_dtypes() unshared_fields = self.format.unshared_fields() - self._read_borealis_arrays(attribute_types, dataset_types, - unshared_fields) - return self._arrays - - def _read_borealis_arrays(self, attribute_types: dict, - dataset_types: dict, - unshared_fields: List[str]): - """ - Read the entire file while checking all data fields. - - Parameters - ---------- - attribute_types: dict - Dictionary with the required types for the attributes in the file. - dataset_types: dict - Dictionary with the require dtypes for the numpy arrays in the - file. - unshared_fields: List[str] - List of fields that are not shared between the records and - therefore should be an array with first dimension = number of - records - - Raises - ------ - BorealisFieldMissingError - when a field is missing from the Borealis - file - BorealisExtraFieldError - when an extra field is present in the - Borealis file - BorealisDataFormatTypeError - when a field has the incorrect - field type for the Borealis file - BorealisNumberOfRecordsError - when the number of records cannot - be discerned from the arrays + arrays = self.format._read_borealis_arrays(self.filename) + BorealisUtilities.check_arrays(self.filename, arrays, + attribute_types, dataset_types, + unshared_fields) + self._arrays = arrays - See Also - -------- - BorealisUtilities - """ - attr_types = self.format.site_single_element_types() - dataset_types = self.format.site_array_dtypes() - records = self.format._read_borealis_records - with h5py.File(self.filename, 'r') as arrays: - BorealisUtilities.check_arrays(self.filename, arrays, - attribute_types, dataset_types, - unshared_fields) - self._arrays = arrays + return self._arrays class BorealisArrayWrite(): diff --git a/pydarnio/borealis/borealis_restructure.py b/pydarnio/borealis/borealis_restructure.py index fb583f8..53a6303 100755 --- a/pydarnio/borealis/borealis_restructure.py +++ b/pydarnio/borealis/borealis_restructure.py @@ -34,7 +34,6 @@ import os import subprocess as sp import warnings -from pathlib import Path import h5py import logging import numpy as np @@ -207,60 +206,64 @@ def _array_to_site_restructure(self): attribute_types = self.format.site_single_element_types() dataset_types = self.format.array_dtypes() try: - shared_fields_dict = dict() - # shared fields are common across records, so this is done once - with hdf5.File(self.infile_name, 'r') as f: + with h5py.File(self.infile_name, 'r') as f: + + # shared fields are common across records, so this is done once + shared_fields_dict = dict() for field in self.format.shared_fields(): - shared_fields_dict[field] = f[field] + if field in attribute_types: + data = f.attrs[field] + if isinstance(data, bytes): + data = str(data) + elif field in self.format.array_string_fields(): + dset = f[field] + itemsize = dset.attrs['itemsize'] + data = dset[:].view(dtype=(np.unicode_, itemsize)) + else: + data = f[field][:] + shared_fields_dict[field] = data - unshared_single_elements = dict() - # These are fields which have one element per record, so the - # arrays are small enough to be loaded completely into memory - with 
hdf5.File(self.infile_name, 'r') as f: + # These are fields which have one element per record, so the + # arrays are small enough to be loaded completely into memory + unshared_single_elements = dict() for field in self.format.unshared_fields(): if field in self.format.single_element_types(): - unshared_single_elements[field] = f[field] + unshared_single_elements[field] = f[field][:] - with h5py.File(self.infile_name, 'r') as f: - records = sorted(list(f.keys())) - first_rec = f[records[0]] - sqn_timestamps_array = first_rec.attrs['sqn_timestamps']\ - .decode('utf-8') - for record_num, seq_timestamp in enumerate(sqn_timestamps_array): - # format dictionary key in the same way it is done - # in datawrite on site - seq_datetime = datetime.utcfromtimestamp(seq_timestamp[0]) - epoch = datetime.utcfromtimestamp(0) - key = str(int((seq_datetime - epoch).total_seconds() * 1000)) - - # Make this fresh every time, to reduce memory footprint - record_dict = dict() - - # Copy over the shared fields - for k, v in shared_fields_dict.items(): - record_dict[k] = v - - # populate site specific fields using given functions - # that take both the arrays data and the record number - with h5py.File(self.infile_name, 'r') as f: + sqn_timestamps_array = f['sqn_timestamps'][:] + + for record_num, seq_timestamp in enumerate(sqn_timestamps_array): + # format dictionary key in the same way it is done + # in datawrite on site + seq_datetime = datetime.utcfromtimestamp(seq_timestamp[0]) + epoch = datetime.utcfromtimestamp(0) + key = str(int((seq_datetime - epoch).total_seconds() * 1000)) + + # Make this fresh every time, to reduce memory footprint + record_dict = dict() + + # Copy over the shared fields + for k, v in shared_fields_dict.items(): + record_dict[k] = v + + # populate site specific fields using given functions + # that take both the arrays data and the record number for field in self.format.site_specific_fields(): record_dict[field] = \ self.format.site_specific_fields_generate( )[field](f, record_num) - for field in self.format.unshared_fields(): - if field in self.format.single_element_types(): - datatype = self.format.single_element_types()[field] - # field is not an array, single element per record. - # unshared_field_dims_site should give empty list. - record_dict[field] = \ - datatype(unshared_single_elements[field][ - record_num]) - else: # field in array_dtypes - # need to get the dims correct, - # not always equal to the max - field_flag = False - with h5py.File(self.infile_name, 'r') as f: + for field in self.format.unshared_fields(): + if field in self.format.single_element_types(): + datatype = self.format.single_element_types()[field] + # field is not an array, single element per record. + # unshared_field_dims_site should give empty list. 
+ record_dict[field] = \ + datatype(unshared_single_elements[field][ + record_num]) + else: # field in array_dtypes + # need to get the dims correct, not always equal to the max + field_flag = False site_dims = [dimension_function(f, record_num) for dimension_function in self.format.unshared_fields_dims_site( @@ -281,17 +284,16 @@ def _array_to_site_restructure(self): index_slice = tuple(index_slice) # If there was an incorrect dimension (-1 in dims), then use deepdish to extract the field if field_flag: - with h5py.File(self.infile_name) as f: - record_dict[field] = f[field][index_slice] + record_dict[field] = f[field][index_slice] else: record_dict[field] = f[field][index_slice] - # Wrap in another dict to use the format method - record_dict = OrderedDict({key: record_dict}) - record_dict = self.format.flatten_site_arrays(record_dict) + # Wrap in another dict to use the format method + record_dict = OrderedDict({key: record_dict}) + record_dict = self.format.flatten_site_arrays(record_dict) - # Write the single record to file - self._write_borealis_record(record_dict, key, attribute_types, - dataset_types) + # Write the single record to file + self._write_borealis_record(record_dict, key, attribute_types, + dataset_types) except Exception as err: raise borealis_exceptions.BorealisRestructureError( 'Records for {}: Error restructuring {} from array to site ' @@ -488,23 +490,19 @@ def _write_borealis_record(self, record: dict, record_name: str, -------- BorealisUtilities """ - Path(self.outfile_name).touch() BorealisUtilities.check_records(self.infile_name, record, attribute_types, dataset_types) - # use external h5copy utility to move new record into 2hr file. - - warnings.filterwarnings("ignore") - # Must use temporary file to append to a file; writing entire - # dictionary at once also doesn't work so this is required. 
- tmp_filename = self.outfile_name + '.tmp' - Path(tmp_filename).touch() - - with h5py.File(tmp_filename, 'w') as f: - f.create_dataset(record[record_name], compression=self.compression) - - cp_cmd = 'h5copy -i {newfile} -o {full_file} -s / -d {dtstr}' - cmd = cp_cmd.format(newfile=tmp_filename, full_file=self.outfile_name, - dtstr=record_name) - sp.run(cmd.split()) - os.remove(tmp_filename) + with h5py.File(self.outfile_name, 'a') as f: + for group_name, rec in record.items(): + group = f.create_group(group_name) + for k, v in rec.items(): + if k in attribute_types: + group.attrs[k] = v + elif v.dtype.type == np.str_: + itemsize = v.dtype.itemsize // 4 # every character is 4 bytes + dset = group.create_dataset(k, data=v.view(dtype=(np.uint8)), compression=self.compression) + dset.attrs['strtype'] = b'unicode' + dset.attrs['itemsize'] = itemsize + else: + group.create_dataset(k, data=v, compression=self.compression) diff --git a/pydarnio/borealis/borealis_site.py b/pydarnio/borealis/borealis_site.py index 6470ad2..2ce4ddd 100644 --- a/pydarnio/borealis/borealis_site.py +++ b/pydarnio/borealis/borealis_site.py @@ -40,9 +40,9 @@ import os import subprocess as sp import warnings +import numpy as np from collections import OrderedDict -from pathlib2 import Path from typing import Union from pydarnio import borealis_exceptions, borealis_formats @@ -125,11 +125,10 @@ def __init__(self, filename: str, borealis_filetype: str): try: with h5py.File(self.filename, 'r') as f: - records = sorted(list(f.keys())) - first_rec = f[records[0]] + first_rec = f[self._record_names[0]] full_version = first_rec.attrs['borealis_git_hash'].decode('utf-8').split('-')[0] version = '.'.join(full_version.split('.')[:2]) # vX.Y, ignore patch revision - except (IndexError, ValueError) as err: + except (IndexError, KeyError) as err: # if this is an array style file, it will raise # IndexError on the array. raise borealis_exceptions.BorealisStructureError( @@ -491,9 +490,9 @@ def _write_borealis_records(self, attribute_types: dict, Parameters ---------- - attributes_type_dict: dict + attributes_type: dict Dictionary with the required types for the attributes in the file. - datasets_type_dict: dict + datasets_type: dict Dictionary with the require dtypes for the numpy arrays in the file. @@ -510,23 +509,19 @@ def _write_borealis_records(self, attribute_types: dict, -------- BorealisUtilities """ - Path(self.filename).touch() BorealisUtilities.check_records(self.filename, self.records, attribute_types, dataset_types) - # use external h5copy utility to move new record into 2hr file. - - warnings.filterwarnings("ignore") - # Must use temporary file to append to a file; writing entire - # dictionary at once also doesn't work so this is required. 
- tmp_filename = self.filename + '.tmp' - Path(tmp_filename).touch() - for group_name, group_dict in self.records.items(): - with h5py.File(tmp_filename, 'w') as f: - f.create_dataset(str(group_name), data=group_dict, - compression=self.compression) - cp_cmd = 'h5copy -i {newfile} -o {full_file} -s {dtstr} -d {dtstr}' - cmd = cp_cmd.format(newfile=tmp_filename, full_file=self.filename, - dtstr='/'+str(group_name)) - sp.call(cmd.split()) - os.remove(tmp_filename) + with h5py.File(self.filename, 'w') as f: + for group_name, group_dict in self.records.items(): + group = f.create_group(str(group_name)) + for k, v in group_dict.items(): + if k in attribute_types.keys(): + group.attrs[k] = v + elif v.dtype.type == np.str_: + itemsize = v.dtype.itemsize // 4 # every character is 4 bytes + dset = group.create_dataset(k, data=v.view(dtype=(np.uint8)), compression=self.compression) + dset.attrs['strtype'] = b'unicode' + dset.attrs['itemsize'] = itemsize + else: + group.create_dataset(k, data=v, compression=self.compression) diff --git a/pydarnio/borealis/borealis_utilities.py b/pydarnio/borealis/borealis_utilities.py index bc2e636..bc7e8fc 100644 --- a/pydarnio/borealis/borealis_utilities.py +++ b/pydarnio/borealis/borealis_utilities.py @@ -562,7 +562,7 @@ def get_borealis_version(filename: str, record_names, structure: str): """ if structure == 'array': try: - with h5py.File(self.filename, 'r') as f: + with h5py.File(filename, 'r') as f: borealis_git_hash = f.attrs['borealis_git_hash'].decode('utf-8') except KeyError as err: raise borealis_exceptions.BorealisStructureError( From 8645560d703d7b5730250a9d3a2fd8984ae34e32 Mon Sep 17 00:00:00 2001 From: Remington Rohel Date: Wed, 22 Mar 2023 15:44:50 +0000 Subject: [PATCH 09/18] Moved writing of records and arrays into base_format.py * Changed the method signature a bit for conciseness * Both methods open HDF5 file with 'a' permission (read/write if exists, create otherwise). This will raise an exception if the file already has groups/datasets with the same name, which I think is worthy of erroring on. * Changed borealis_restructure.py to also use the format writing methods. --- pydarnio/borealis/base_format.py | 77 ++++++++++++++++++++++- pydarnio/borealis/borealis_array.py | 66 ++++--------------- pydarnio/borealis/borealis_restructure.py | 75 +++------------------- pydarnio/borealis/borealis_site.py | 57 ++++------------- 4 files changed, 107 insertions(+), 168 deletions(-) diff --git a/pydarnio/borealis/base_format.py b/pydarnio/borealis/base_format.py index b5d23bb..d131510 100644 --- a/pydarnio/borealis/base_format.py +++ b/pydarnio/borealis/base_format.py @@ -1230,7 +1230,7 @@ class methods used inside this method should be specific return timestamp_dict @classmethod - def _read_borealis_records(cls, filename: str) -> OrderedDict: + def read_records(cls, filename: str) -> OrderedDict: """ Base function for reading in a Borealis site file. @@ -1287,7 +1287,7 @@ class methods used inside this method should be specific return records @classmethod - def _read_borealis_arrays(cls, filename: str) -> OrderedDict: + def read_arrays(cls, filename: str) -> OrderedDict: """ Base function for reading in a Borealis array file. @@ -1339,6 +1339,79 @@ class methods used inside this method should be specific return arrays + @classmethod + def write_records(cls, filename: str, records: OrderedDict, attribute_types: dict, + dataset_types: dict, compression: str): + """ + Write the file in site style after checking records. 
From 8645560d703d7b5730250a9d3a2fd8984ae34e32 Mon Sep 17 00:00:00 2001
From: Remington Rohel
Date: Wed, 22 Mar 2023 15:44:50 +0000
Subject: [PATCH 09/18] Moved writing of records and arrays into base_format.py

* Changed the method signature a bit for conciseness
* Both methods open the HDF5 file with 'a' permission (read/write if exists,
  create otherwise). This will raise an exception if the file already has
  groups/datasets with the same name, which I think is worth erroring on.
* Changed borealis_restructure.py to also use the format writing methods.
---
 pydarnio/borealis/base_format.py          | 77 ++++++++++++++++++++++-
 pydarnio/borealis/borealis_array.py       | 66 ++++-----------------
 pydarnio/borealis/borealis_restructure.py | 75 +++-------------------
 pydarnio/borealis/borealis_site.py        | 57 ++++---------------
 4 files changed, 107 insertions(+), 168 deletions(-)

diff --git a/pydarnio/borealis/base_format.py b/pydarnio/borealis/base_format.py
index b5d23bb..d131510 100644
--- a/pydarnio/borealis/base_format.py
+++ b/pydarnio/borealis/base_format.py
@@ -1230,7 +1230,7 @@ class methods used inside this method should be specific
         return timestamp_dict
 
     @classmethod
-    def _read_borealis_records(cls, filename: str) -> OrderedDict:
+    def read_records(cls, filename: str) -> OrderedDict:
         """
         Base function for reading in a Borealis site file.
 
@@ -1287,7 +1287,7 @@ class methods used inside this method should be specific
         return records
 
     @classmethod
-    def _read_borealis_arrays(cls, filename: str) -> OrderedDict:
+    def read_arrays(cls, filename: str) -> OrderedDict:
         """
         Base function for reading in a Borealis array file.
 
@@ -1339,6 +1339,79 @@ class methods used inside this method should be specific
 
         return arrays
 
+    @classmethod
+    def write_records(cls, filename: str, records: OrderedDict, attribute_types: dict,
+                      dataset_types: dict, compression: str):
+        """
+        Write the file in site style after checking records.
+
+        Several Borealis field checks are done to insure the integrity of the
+        file.
+
+        Parameters
+        ----------
+        filename: str
+            Name of the file to write to.
+        records: OrderedDict
+            Dictionary containing site-formatted fields to write to file.
+        attribute_types: dict
+            Dictionary with the required types for the attributes in the file.
+        dataset_types: dict
+            Dictionary with the required dtypes for the numpy arrays in the
+            file.
+        compression: str
+            Type of compression to use for the HDF5 file.
+        """
+        with h5py.File(filename, 'a') as f:
+            for group_name, group_dict in records.items():
+                group = f.create_group(str(group_name))
+                for k, v in group_dict.items():
+                    if k in attribute_types.keys():
+                        group.attrs[k] = v
+                    elif v.dtype.type == np.str_:
+                        itemsize = v.dtype.itemsize // 4  # every character is 4 bytes
+                        dset = group.create_dataset(k, data=v.view(dtype=(np.uint8)), compression=compression)
+                        dset.attrs['strtype'] = b'unicode'
+                        dset.attrs['itemsize'] = itemsize
+                    else:
+                        group.create_dataset(k, data=v, compression=compression)
+
+    @classmethod
+    def write_arrays(cls, filename: str, arrays: OrderedDict, attribute_types: dict,
+                     dataset_types: dict, unshared_fields: List[str], compression: str):
+        """
+        Write arrays to file while checking all data fields.
+
+        Parameters
+        ----------
+        filename: str
+            Name of the file to write to.
+        arrays: OrderedDict
+            Dictionary containing array-formatted fields to write to file.
+        attribute_types: dict
+            Dictionary with the required types for the attributes in the file.
+        dataset_types: dict
+            Dictionary with the required dtypes for the numpy arrays in the
+            file.
+        unshared_fields: List[str]
+            List of fields that are not shared between the records and
+            therefore should be an array with first dimension = number of
+            records
+        compression: str
+            Type of compression to use for the HDF5 file.
+        """
+        with h5py.File(filename, 'a') as f:
+            for k, v in arrays.items():
+                if k in attribute_types:
+                    f.attrs[k] = v
+                elif v.dtype.type == np.str_:
+                    itemsize = v.dtype.itemsize // 4  # every character is 4 bytes
+                    dset = f.create_dataset(k, data=v.view(dtype=(np.uint8)), compression=compression)
+                    dset.attrs['strtype'] = b'unicode'
+                    dset.attrs['itemsize'] = itemsize
+                else:
+                    f.create_dataset(k, data=v, compression=compression)
+
     # STATIC METHODS COMMON ACROSS FORMATS
     # i.e. common methods that can be used by multiple formats in restructuring
     # (generally these will be used in the unshared fields dims for arrays)
diff --git a/pydarnio/borealis/borealis_array.py b/pydarnio/borealis/borealis_array.py
index 5f193ae..e6665b6 100644
--- a/pydarnio/borealis/borealis_array.py
+++ b/pydarnio/borealis/borealis_array.py
@@ -244,7 +244,7 @@ def read_file(self) -> dict:
         dataset_types = self.format.array_array_dtypes()
         unshared_fields = self.format.unshared_fields()
 
-        arrays = self.format._read_borealis_arrays(self.filename)
+        arrays = self.format.read_arrays(self.filename)
         BorealisUtilities.check_arrays(self.filename, arrays,
                                        attribute_types, dataset_types,
                                        unshared_fields)
@@ -432,7 +432,14 @@ def write_file(self) -> str:
 
         Raises
         ------
-        BorealisFileTypeError
+        BorealisFieldMissingError - when a field is missing from the Borealis
+            file
+        BorealisExtraFieldError - when an extra field is present in the
+            Borealis file
+        BorealisDataFormatTypeError - when a field has the incorrect
+            field type for the Borealis file
+        BorealisNumberOfRecordsError - when the number of records cannot
+            be discerned from the arrays
 
         See Also
         --------
@@ -446,55 +453,8 @@ def write_file(self) -> str:
         attribute_types = self.format.array_single_element_types()
         dataset_types = self.format.array_array_dtypes()
         unshared_fields = self.format.unshared_fields()
-
-        self._write_borealis_arrays(attribute_types, dataset_types,
-                                    unshared_fields)
+        BorealisUtilities.check_arrays(self.filename, self.arrays, attribute_types,
+                                       dataset_types, unshared_fields)
+        self.format.write_arrays(self.filename, self.arrays, attribute_types,
+                                 dataset_types, unshared_fields, self.compression)
         return self.filename
-
-    def _write_borealis_arrays(self, attribute_types: dict,
-                               dataset_types: dict,
-                               unshared_fields: List[str]):
-        """
-        Write the entire file while checking all data fields.
-
-        Parameters
-        ----------
-        attribute_types: dict
-            Dictionary with the required types for the attributes in the file.
-        dataset_types: dict
-            Dictionary with the require dtypes for the numpy arrays in the
-            file.
-        unshared_fields: List[str]
-            List of fields that are not shared between the records and
-            therefore should be an array with first dimension = number of
-            records
-
-        Raises
-        ------
-        BorealisFieldMissingError - when a field is missing from the Borealis
-            file
-        BorealisExtraFieldError - when an extra field is present in the
-            Borealis file
-        BorealisDataFormatTypeError - when a field has the incorrect
-            field type for the Borealis file
-        BorealisNumberOfRecordsError - when the number of records cannot
-            be discerned from the arrays
-
-        See Also
-        --------
-        BorealisUtilities
-        """
-        BorealisUtilities.check_arrays(self.filename, self.arrays,
-                                       attribute_types, dataset_types,
-                                       unshared_fields)
-        with h5py.File(self.filename, 'w') as f:
-            for k, v in self.arrays.items():
-                if k in attribute_types:
-                    f.attrs[k] = v
-                elif v.dtype.type == np.str_:
-                    itemsize = v.dtype.itemsize // 4  # every character is 4 bytes
-                    dset = f.create_dataset(k, data=v.view(dtype=(np.uint8)), compression=self.compression)
-                    dset.attrs['strtype'] = b'unicode'
-                    dset.attrs['itemsize'] = itemsize
-                else:
-                    f.create_dataset(k, data=v, compression=self.compression)
diff --git a/pydarnio/borealis/borealis_restructure.py b/pydarnio/borealis/borealis_restructure.py
index 53a6303..7514bfe 100755
--- a/pydarnio/borealis/borealis_restructure.py
+++ b/pydarnio/borealis/borealis_restructure.py
@@ -282,18 +282,16 @@ def _array_to_site_restructure(self):
                         index_slice = [slice(0, i) for i in site_dims if i != -1]
                         index_slice.insert(0, record_num)
                         index_slice = tuple(index_slice)
-                        # If there was an incorrect dimension (-1 in dims), then use deepdish to extract the field
-                        if field_flag:
-                            record_dict[field] = f[field][index_slice]
-                        else:
-                            record_dict[field] = f[field][index_slice]
+                        record_dict[field] = f[field][index_slice]
+                # Wrap in another dict to use the format method
                 record_dict = OrderedDict({key: record_dict})
                 record_dict = self.format.flatten_site_arrays(record_dict)
+                BorealisUtilities.check_records(self.infile_name, record_dict, attribute_types, dataset_types)
                 # Write the single record to file
-                self._write_borealis_record(record_dict, key, attribute_types,
-                                            dataset_types)
+                self.format.write_records(self.outfile_name, record_dict, attribute_types, dataset_types,
+                                          self.compression)
             except Exception as err:
                 raise borealis_exceptions.BorealisRestructureError(
                     'Records for {}: Error restructuring {} from array to site '
                     'style: {}'.format(self.infile_name, self.format.__name__, err)
                 ) from err
@@ -439,70 +437,13 @@ def _site_to_array_restructure(self):
             attribute_types = self.format.array_single_element_types()
             dataset_types = self.format.array_array_dtypes()
             unshared_fields = self.format.unshared_fields()
-            BorealisUtilities.check_arrays(self.infile_name, new_data_dict,
-                                           attribute_types, dataset_types,
+            BorealisUtilities.check_arrays(self.infile_name, new_data_dict, attribute_types, dataset_types,
                                            unshared_fields)
-            with h5py.File(self.outfile_name, 'w') as f:
-                for k, v in new_data_dict.items():
-                    if k in attribute_types:
-                        f.attrs[k] = v
-                    elif v.dtype.type == np.str_:
-                        itemsize = v.dtype.itemsize // 4  # every character is 4 bytes
-                        dset = f.create_dataset(k, data=v.view(dtype=(np.uint8)), compression=self.compression)
-                        dset.attrs['strtype'] = b'unicode'
-                        dset.attrs['itemsize'] = itemsize
-                    else:
-                        f.create_dataset(k, data=v, compression=self.compression)
+            self.format.write_arrays(self.outfile_name, new_data_dict, attribute_types, dataset_types, unshared_fields,
+                                     self.compression)
         except TypeError as err:
             raise borealis_exceptions.BorealisRestructureError(
                 'Records for {}: Error restructuring {} from site to array '
                 'style: {}'.format(self.infile_name, self.format.__name__, err)
             ) from err
-
-    def _write_borealis_record(self, record: dict, record_name: str,
-                               attribute_types: dict, dataset_types: dict):
-        """
-        Add a record to the output file in site style after checking the record.
-
-        Several Borealis field checks are done to insure the integrity of the
-        record.
-
-        Parameters
-        ----------
-        record: dict
-            Dictionary containing the site-structured record.
-        record_name: str
-            Group name of the record for the HDF5 hierarchy.
-        attribute_types: dict
-            Dictionary with the required types for the attributes in the file.
-        dataset_types: dict
-            Dictionary with the required dtypes for the numpy arrays in the
-            file.
-
-        Raises
-        ------
-        BorealisFieldMissingError
-        BorealisExtraFieldError
-        BorealisDataFormatTypeError
-
-        See Also
-        --------
-        BorealisUtilities
-        """
-        BorealisUtilities.check_records(self.infile_name, record,
-                                        attribute_types, dataset_types)
-
-        with h5py.File(self.outfile_name, 'a') as f:
-            for group_name, rec in record.items():
-                group = f.create_group(group_name)
-                for k, v in rec.items():
-                    if k in attribute_types:
-                        group.attrs[k] = v
-                    elif v.dtype.type == np.str_:
-                        itemsize = v.dtype.itemsize // 4  # every character is 4 bytes
-                        dset = group.create_dataset(k, data=v.view(dtype=(np.uint8)), compression=self.compression)
-                        dset.attrs['strtype'] = b'unicode'
-                        dset.attrs['itemsize'] = itemsize
-                    else:
-                        group.create_dataset(k, data=v, compression=self.compression)
diff --git a/pydarnio/borealis/borealis_site.py b/pydarnio/borealis/borealis_site.py
index 2ce4ddd..473ae4a 100644
--- a/pydarnio/borealis/borealis_site.py
+++ b/pydarnio/borealis/borealis_site.py
@@ -267,7 +267,7 @@ def read_file(self) -> dict:
         attribute_types = self.format.site_single_element_types()
         dataset_types = self.format.site_array_dtypes()
 
-        records = self.format._read_borealis_records(self.filename)
+        records = self.format.read_records(self.filename)
         BorealisUtilities.check_records(self.filename, records,
                                         attribute_types, dataset_types)
 
@@ -467,34 +467,8 @@ def write_file(self) -> str:
 
         Returns
         -------
-        filename
+        filename: str
             The filename written to.
-        """
-        pyDARNio_log.info("Writing Borealis {} {} file: {}"
-                          "".format(self.software_version,
-                                    self.borealis_filetype, self.filename))
-
-        attribute_types = self.format.site_single_element_types()
-        dataset_types = self.format.site_array_dtypes()
-
-        self._write_borealis_records(attribute_types, dataset_types)
-        return self.filename
-
-    def _write_borealis_records(self, attribute_types: dict,
-                                dataset_types: dict):
-        """
-        Write the file in site style after checking records.
-
-        Several Borealis field checks are done to insure the integrity of the
-        file.
-
-        Parameters
-        ----------
-        attributes_type: dict
-            Dictionary with the required types for the attributes in the file.
-        datasets_type: dict
-            Dictionary with the require dtypes for the numpy arrays in the
-            file.
Raises ------ @@ -504,24 +478,15 @@ def _write_borealis_records(self, attribute_types: dict, Borealis file/stream type BorealisDataFormatTypeError - when a field has the incorrect field type for the Borealis file/stream type - - See Also - -------- - BorealisUtilities """ + pyDARNio_log.info("Writing Borealis {} {} file: {}" + "".format(self.software_version, + self.borealis_filetype, self.filename)) + + attribute_types = self.format.site_single_element_types() + dataset_types = self.format.site_array_dtypes() BorealisUtilities.check_records(self.filename, self.records, attribute_types, dataset_types) - - with h5py.File(self.filename, 'w') as f: - for group_name, group_dict in self.records.items(): - group = f.create_group(str(group_name)) - for k, v in group_dict.items(): - if k in attribute_types.keys(): - group.attrs[k] = v - elif v.dtype.type == np.str_: - itemsize = v.dtype.itemsize // 4 # every character is 4 bytes - dset = group.create_dataset(k, data=v.view(dtype=(np.uint8)), compression=self.compression) - dset.attrs['strtype'] = b'unicode' - dset.attrs['itemsize'] = itemsize - else: - group.create_dataset(k, data=v, compression=self.compression) + self.format.write_records(self.filename, self.records, attribute_types, + dataset_types, self.compression) + return self.filename From d4fb227b49610507ba5170701dd7485944558a04 Mon Sep 17 00:00:00 2001 From: Remington Rohel Date: Wed, 22 Mar 2023 16:28:35 +0000 Subject: [PATCH 10/18] Handle empty attributes * experiment_comment and slice_comment fields are sometimes empty, so we need to handle them in order to convert to DMAP --- pydarnio/borealis/base_format.py | 40 +++++++++++++++++++++----------- 1 file changed, 26 insertions(+), 14 deletions(-) diff --git a/pydarnio/borealis/base_format.py b/pydarnio/borealis/base_format.py index d131510..494496f 100644 --- a/pydarnio/borealis/base_format.py +++ b/pydarnio/borealis/base_format.py @@ -1273,13 +1273,20 @@ class methods used inside this method should be specific rec_dict[dset_name] = data # Get the attributes (scalar fields) - attribute_dict = {k: v for k, v in group.attrs.items()} - attribute_dict.pop('CLASS') # Inherent to HDF5 file - attribute_dict.pop('TITLE') # Inherent to HDF5 file - attribute_dict.pop('VERSION') # Inherent to HDF5 file - for k, v in attribute_dict.items(): - if isinstance(v, bytes): + attribute_dict = {} + for k, v in group.attrs.items(): + if k in ['CLASS', 'TITLE', 'VERSION']: + continue + elif isinstance(v, bytes): attribute_dict[k] = v.tobytes().decode('utf-8') + elif isinstance(v, h5py.Empty): + dtype = v.dtype.type + data = dtype() + if isinstance(data, bytes): + data = data.decode('utf-8') + attribute_dict[k] = data + else: + attribute_dict[k] = v rec_dict.update(attribute_dict) records[rec_key] = rec_dict @@ -1326,15 +1333,20 @@ class methods used inside this method should be specific arrays[array_name] = data # Get the attributes (scalar fields) - attribute_dict = {k: v for k, v in f.attrs.items()} - attribute_dict.pop('CLASS') # Inherent to HDF5 file - attribute_dict.pop('TITLE') # Inherent to HDF5 file - attribute_dict.pop('VERSION') # Inherent to HDF5 file - attribute_dict.pop('DEEPDISH_IO_VERSION') # Inherent to HDF5 file - attribute_dict.pop('PYTABLES_FORMAT_VERSION') # Inherent to HDF5 file - for k, v in attribute_dict.items(): - if isinstance(v, bytes): + attribute_dict = {} + for k, v in f.attrs.items(): + if k in ['CLASS', 'TITLE', 'VERSION', 'DEEPDISH_IO_VERSION', 'PYTABLES_FORMAT_VERSION']: + continue + elif isinstance(v, bytes): 
From d4fb227b49610507ba5170701dd7485944558a04 Mon Sep 17 00:00:00 2001
From: Remington Rohel
Date: Wed, 22 Mar 2023 16:28:35 +0000
Subject: [PATCH 10/18] Handle empty attributes

* experiment_comment and slice_comment fields are sometimes empty, so we
  need to handle them in order to convert to DMAP
---
 pydarnio/borealis/base_format.py | 40 +++++++++++++++++++++-----------
 1 file changed, 26 insertions(+), 14 deletions(-)

diff --git a/pydarnio/borealis/base_format.py b/pydarnio/borealis/base_format.py
index d131510..494496f 100644
--- a/pydarnio/borealis/base_format.py
+++ b/pydarnio/borealis/base_format.py
@@ -1273,13 +1273,20 @@ class methods used inside this method should be specific
                     rec_dict[dset_name] = data
 
                 # Get the attributes (scalar fields)
-                attribute_dict = {k: v for k, v in group.attrs.items()}
-                attribute_dict.pop('CLASS')    # Inherent to HDF5 file
-                attribute_dict.pop('TITLE')    # Inherent to HDF5 file
-                attribute_dict.pop('VERSION')  # Inherent to HDF5 file
-                for k, v in attribute_dict.items():
-                    if isinstance(v, bytes):
+                attribute_dict = {}
+                for k, v in group.attrs.items():
+                    if k in ['CLASS', 'TITLE', 'VERSION']:
+                        continue
+                    elif isinstance(v, bytes):
                         attribute_dict[k] = v.tobytes().decode('utf-8')
+                    elif isinstance(v, h5py.Empty):
+                        dtype = v.dtype.type
+                        data = dtype()
+                        if isinstance(data, bytes):
+                            data = data.decode('utf-8')
+                        attribute_dict[k] = data
+                    else:
+                        attribute_dict[k] = v
                 rec_dict.update(attribute_dict)
 
                 records[rec_key] = rec_dict
@@ -1326,15 +1333,20 @@ class methods used inside this method should be specific
             arrays[array_name] = data
 
         # Get the attributes (scalar fields)
-        attribute_dict = {k: v for k, v in f.attrs.items()}
-        attribute_dict.pop('CLASS')                    # Inherent to HDF5 file
-        attribute_dict.pop('TITLE')                    # Inherent to HDF5 file
-        attribute_dict.pop('VERSION')                  # Inherent to HDF5 file
-        attribute_dict.pop('DEEPDISH_IO_VERSION')      # Inherent to HDF5 file
-        attribute_dict.pop('PYTABLES_FORMAT_VERSION')  # Inherent to HDF5 file
-        for k, v in attribute_dict.items():
-            if isinstance(v, bytes):
+        attribute_dict = {}
+        for k, v in f.attrs.items():
+            if k in ['CLASS', 'TITLE', 'VERSION', 'DEEPDISH_IO_VERSION', 'PYTABLES_FORMAT_VERSION']:
+                continue
+            elif isinstance(v, bytes):
                 attribute_dict[k] = v.tobytes().decode('utf-8')
+            elif isinstance(v, h5py.Empty):
+                dtype = v.dtype.type
+                data = dtype()
+                if isinstance(data, bytes):
+                    data = data.decode('utf-8')
+                attribute_dict[k] = data
+            else:
+                attribute_dict[k] = v
         arrays.update(attribute_dict)
 
         return arrays
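
h5py hands back an empty HDF5 attribute as an h5py.Empty instance rather than a value, which is why the reader above substitutes the dtype's default. A standalone sketch of that substitution (file name invented; experiment_comment is one of the fields the commit message calls out):

    import h5py
    import numpy as np

    with h5py.File('empty_attrs.hdf5', 'w') as f:
        f.attrs['experiment_comment'] = h5py.Empty(np.dtype('S1'))  # attribute with no value

    with h5py.File('empty_attrs.hdf5', 'r') as f:
        v = f.attrs['experiment_comment']
        if isinstance(v, h5py.Empty):
            data = v.dtype.type()            # default value for the dtype, b'' here
            if isinstance(data, bytes):
                data = data.decode('utf-8')  # -> '' for string attributes
        print(repr(data))                    # ''
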
From ad699f1a38a9a3092cfac927e949a3f15dc8efc4 Mon Sep 17 00:00:00 2001
From: Remington Rohel
Date: Wed, 22 Mar 2023 19:26:56 +0000
Subject: [PATCH 11/18] Fixed conversion to DMAP.

* Had to deal more carefully with empty string attributes, and saving
  string attributes to file.
---
 pydarnio/borealis/base_format.py          | 10 ++++++++--
 pydarnio/borealis/borealis_restructure.py |  2 +-
 2 files changed, 9 insertions(+), 3 deletions(-)

diff --git a/pydarnio/borealis/base_format.py b/pydarnio/borealis/base_format.py
index 494496f..1042b77 100644
--- a/pydarnio/borealis/base_format.py
+++ b/pydarnio/borealis/base_format.py
@@ -1379,7 +1379,10 @@ def write_records(cls, filename: str, records: OrderedDict, attribute_types: dic
                 group = f.create_group(str(group_name))
                 for k, v in group_dict.items():
                     if k in attribute_types.keys():
-                        group.attrs[k] = v
+                        if isinstance(v, str):
+                            group.attrs[k] = np.bytes_(v)
+                        else:
+                            group.attrs[k] = v
                     elif v.dtype.type == np.str_:
                         itemsize = v.dtype.itemsize // 4  # every character is 4 bytes
                         dset = group.create_dataset(k, data=v.view(dtype=(np.uint8)), compression=compression)
@@ -1415,7 +1418,10 @@ def write_arrays(cls, filename: str, arrays: OrderedDict, attribute_types: dict,
         with h5py.File(filename, 'a') as f:
             for k, v in arrays.items():
                 if k in attribute_types:
-                    f.attrs[k] = v
+                    if isinstance(v, str):
+                        f.attrs[k] = np.bytes_(v)
+                    else:
+                        f.attrs[k] = v
                 elif v.dtype.type == np.str_:
                     itemsize = v.dtype.itemsize // 4  # every character is 4 bytes
                     dset = f.create_dataset(k, data=v.view(dtype=(np.uint8)), compression=compression)
diff --git a/pydarnio/borealis/borealis_restructure.py b/pydarnio/borealis/borealis_restructure.py
index 7514bfe..a3cfc3c 100755
--- a/pydarnio/borealis/borealis_restructure.py
+++ b/pydarnio/borealis/borealis_restructure.py
@@ -214,7 +214,7 @@ def _array_to_site_restructure(self):
                     if field in attribute_types:
                         data = f.attrs[field]
                         if isinstance(data, bytes):
-                            data = str(data)
+                            data = data.decode('utf-8')
                     elif field in self.format.array_string_fields():
                         dset = f[field]
                         itemsize = dset.attrs['itemsize']

From 8762d226d3a02b1b7232faa65fda895b0398d737 Mon Sep 17 00:00:00 2001
From: RemingtonRohel <77300402+RemingtonRohel@users.noreply.github.com>
Date: Thu, 23 Mar 2023 14:13:58 +0000
Subject: [PATCH 12/18] Remove deepdish dependency from setup.py

Co-authored-by: Adam Lozinsky
---
 setup.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/setup.py b/setup.py
index 1af436b..7f1901d 100644
--- a/setup.py
+++ b/setup.py
@@ -49,7 +49,7 @@
     author="SuperDARN",
     include_package_data=True,
     setup_requires=['pyyaml', 'numpy',
-                    'h5py>=3.3.0', 'deepdish', 'pathlib2'],
+                    'h5py>=3.3.0', 'pathlib2'],
     # pyyaml library install
     install_requires=['pyyaml', 'numpy',
                       'h5py>=3.3.0', 'deepdish', 'pathlib2']

From 3787aa9ea406a428b5d2fed62702f3f3e97a8973 Mon Sep 17 00:00:00 2001
From: RemingtonRohel <77300402+RemingtonRohel@users.noreply.github.com>
Date: Thu, 23 Mar 2023 14:14:37 +0000
Subject: [PATCH 13/18] Minor typo in docstring

Co-authored-by: Theodore Kolkman <90067549+tjk584@users.noreply.github.com>
---
 pydarnio/borealis/base_format.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pydarnio/borealis/base_format.py b/pydarnio/borealis/base_format.py
index 1042b77..8f4a2a5 100644
--- a/pydarnio/borealis/base_format.py
+++ b/pydarnio/borealis/base_format.py
@@ -1357,7 +1357,7 @@ def write_records(cls, filename: str, records: OrderedDict, attribute_types: dic
         """
         Write the file in site style after checking records.
 
-        Several Borealis field checks are done to insure the integrity of the
+        Several Borealis field checks are done to ensure the integrity of the
         file.
 
         Parameters

From 7f0ddd2f233221992df25cd07b79c4b9bc407953 Mon Sep 17 00:00:00 2001
From: RemingtonRohel <77300402+RemingtonRohel@users.noreply.github.com>
Date: Thu, 23 Mar 2023 14:15:26 +0000
Subject: [PATCH 14/18] Remove deepdish import

Co-authored-by: Theodore Kolkman <90067549+tjk584@users.noreply.github.com>
---
 pydarnio/borealis/borealis_array.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/pydarnio/borealis/borealis_array.py b/pydarnio/borealis/borealis_array.py
index e6665b6..8dbcf9c 100644
--- a/pydarnio/borealis/borealis_array.py
+++ b/pydarnio/borealis/borealis_array.py
@@ -37,7 +37,6 @@
 For more information on Borealis data files and how they convert to SDarn
 files, see: https://borealis.readthedocs.io/en/latest/
 """
-import deepdish as dd
 import h5py
 import logging
 import numpy as np

From f6df969417b4036204fed08546a06ae752a8cdb2 Mon Sep 17 00:00:00 2001
From: RemingtonRohel <77300402+RemingtonRohel@users.noreply.github.com>
Date: Thu, 23 Mar 2023 15:54:28 +0000
Subject: [PATCH 15/18] Brevity in type checking

Co-authored-by: Adam Lozinsky
---
 pydarnio/borealis/base_format.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pydarnio/borealis/base_format.py b/pydarnio/borealis/base_format.py
index 8f4a2a5..4244edb 100644
--- a/pydarnio/borealis/base_format.py
+++ b/pydarnio/borealis/base_format.py
@@ -1110,7 +1110,7 @@ class methods used inside this method should be specific
                 # Some indices may not be filled due to dimensions that are maximum values (num_sequences, etc. can change
                 # between records), so they are initialized with a known value first.
                 # Initialize floating-point values to NaN, and integer values to -1.
-                if datatype is np.int64 or datatype is np.uint32 or datatype is np.uint8:
+                if datatype in [np.int64, np.uint32, np.uint8]:
                     empty_array[:] = -1
                 else:
                     empty_array[:] = np.NaN

From eab4a6c8281465d5c181e752220b149c206372ab Mon Sep 17 00:00:00 2001
From: RemingtonRohel <77300402+RemingtonRohel@users.noreply.github.com>
Date: Thu, 23 Mar 2023 15:54:41 +0000
Subject: [PATCH 16/18] Brevity in type checking

Co-authored-by: Adam Lozinsky
---
 pydarnio/borealis/borealis_restructure.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pydarnio/borealis/borealis_restructure.py b/pydarnio/borealis/borealis_restructure.py
index a3cfc3c..67f49aa 100755
--- a/pydarnio/borealis/borealis_restructure.py
+++ b/pydarnio/borealis/borealis_restructure.py
@@ -410,7 +410,7 @@ def _site_to_array_restructure(self):
                         # change between records), so they are initialized
                         # with a known value first. Initialize floating-
                        # point values to NaN, and integer values to -1.
-                        if datatype is np.int64 or datatype is np.uint32 or datatype is np.uint8:
+                        if datatype in [np.int64, np.uint32, np.uint8]:
                             empty_array[:] = -1
                         else:
                             empty_array[:] = np.NaN

From 562c8aaa7a38bef5b3a2b46f23429383ee1000f1 Mon Sep 17 00:00:00 2001
From: RemingtonRohel <77300402+RemingtonRohel@users.noreply.github.com>
Date: Thu, 23 Mar 2023 15:54:56 +0000
Subject: [PATCH 17/18] Brevity in type checking

Co-authored-by: Adam Lozinsky
---
 pydarnio/borealis/borealis_restructure.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pydarnio/borealis/borealis_restructure.py b/pydarnio/borealis/borealis_restructure.py
index 67f49aa..536ea61 100755
--- a/pydarnio/borealis/borealis_restructure.py
+++ b/pydarnio/borealis/borealis_restructure.py
@@ -378,7 +378,7 @@ def _site_to_array_restructure(self):
                     # Initialize array now with correct data type.
                     dtype = self.format.single_element_types()[field]
                     new_data_dict[field] = np.empty(num_records, dtype=dtype)
-                    if dtype is np.int64 or dtype is np.uint32 or dtype is np.uint8:
+                    if dtype in [np.int64, np.uint32, np.uint8]:
                         new_data_dict[field][:] = -1
                     else:
                         new_data_dict[field][:] = np.NaN

From 8a7418f01fc0dbbcdfbd52380848070bd6e22c39 Mon Sep 17 00:00:00 2001
From: Remington Rohel
Date: Thu, 23 Mar 2023 17:02:53 +0000
Subject: [PATCH 18/18] Pop other deepdish / pytables fields when reading site
 files.

---
 pydarnio/borealis/base_format.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pydarnio/borealis/base_format.py b/pydarnio/borealis/base_format.py
index 4244edb..eba9bff 100644
--- a/pydarnio/borealis/base_format.py
+++ b/pydarnio/borealis/base_format.py
@@ -1275,7 +1275,7 @@ class methods used inside this method should be specific
                 # Get the attributes (scalar fields)
                 attribute_dict = {}
                 for k, v in group.attrs.items():
-                    if k in ['CLASS', 'TITLE', 'VERSION']:
+                    if k in ['CLASS', 'TITLE', 'VERSION', 'DEEPDISH_IO_VERSION', 'PYTABLES_FORMAT_VERSION']:
                         continue
                     elif isinstance(v, bytes):
                         attribute_dict[k] = v.tobytes().decode('utf-8')
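
Taken together, the three "Brevity in type checking" commits condense the same sentinel-fill pattern in three places: integer fields mark unfilled entries with -1 and floating-point fields with NaN. A standalone sketch of the pattern under the NumPy 1.x this series targets (the helper name is invented):

    import numpy as np

    def sentinel_filled(num_records, dtype):
        """Return a 1-D array whose entries all read as 'not yet filled'."""
        arr = np.empty(num_records, dtype=dtype)
        if dtype in [np.int64, np.uint32, np.uint8]:
            arr[:] = -1        # for the unsigned dtypes this wraps to the max value
        else:
            arr[:] = np.NaN
        return arr

    print(sentinel_filled(3, np.int64))    # [-1 -1 -1]
    print(sentinel_filled(3, np.float64))  # [nan nan nan]
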