From 0790ddb9f88cc87d87bc4af6fb6ffd0fe38c81d6 Mon Sep 17 00:00:00 2001 From: Remington Rohel Date: Fri, 17 Mar 2023 18:15:39 +0000 Subject: [PATCH 01/18] First crack at switching to h5py for reading in Borealis site files. * Haven't added any code for dealing with data_descriptors or correlation_descriptors fields (they are finnicky) * Updated h5py dependency to need >= v3.3.0 * Have not tested whatsoever --- pydarnio/borealis/base_format.py | 53 ++++++++++++++++++++++++++++++ pydarnio/borealis/borealis_site.py | 46 ++++++++------------------ setup.py | 4 +-- 3 files changed, 69 insertions(+), 34 deletions(-) diff --git a/pydarnio/borealis/base_format.py b/pydarnio/borealis/base_format.py index ca8e97d..297c943 100644 --- a/pydarnio/borealis/base_format.py +++ b/pydarnio/borealis/base_format.py @@ -1229,6 +1229,59 @@ class methods used inside this method should be specific return timestamp_dict + @classmethod + def _read_borealis_records(cls, filename: str) -> OrderedDict: + """ + Base function for reading in a Borealis site file. + + Parameters + ---------- + filename: str + Name of the file to load records from + + Returns + ------- + OrderedDict + a dict of timestamped records loaded from an hdf5 Borealis site file + + Raises + ------ + OSError: file does not exist + + Notes + ----- + The results will differ based on the format class, as many of the + class methods used inside this method should be specific + to the format and updated in the child class. + """ + records = OrderedDict() + with h5py.File(filename, 'r') as f: + record_keys = sorted(list(f.keys())) + for rec_key in record_keys: + rec_dict = {} + group = f[rec_key] + + # Get the datasets (vector fields) + datasets = list(group.keys()) + for dset_name in datasets: + dset = group[dset_name][:] + # TODO: Handle data_descriptors, correlation_descriptors fields (they are gross) + rec_dict[dset_name] = dset + + # Get the attributes (scalar fields) + attribute_dict = {k: v for k, v in group.attrs.items()} + attribute_dict.pop('CLASS') # Inherent to HDF5 file + attribute_dict.pop('TITLE') # Inherent to HDF5 file + attribute_dict.pop('VERSION') # Inherent to HDF5 file + for k, v in attribute_dict.items(): + if isinstance(v, bytes): + attribute_dict[k] = v.tobytes().decode('utf-8') + rec_dict.update(attribute_dict) + + records[rec_key] = rec_dict + + return records + # STATIC METHODS COMMON ACROSS FORMATS # i.e. common methods that can be used by multiple formats in restructuring # (generally these will be used in the unshared fields dims for arrays) diff --git a/pydarnio/borealis/borealis_site.py b/pydarnio/borealis/borealis_site.py index 0c5b2b3..b20d02d 100644 --- a/pydarnio/borealis/borealis_site.py +++ b/pydarnio/borealis/borealis_site.py @@ -125,10 +125,11 @@ def __init__(self, filename: str, borealis_filetype: str): # 'vX.X' try: - version = dd.io.load(self.filename, - group='/'+self._record_names[0] - )['borealis_git_hash'].split('-')[0] - version = '.'.join(version.split('.')[:2]) # vX.Y, ignore patch revision + with h5py.File(self.filename, 'r') as f: + records = sorted(list(f.keys())) + first_rec = f[records[0]] + full_version = first_rec.attrs['borealis_git_hash'].decode('utf-8').split('-')[0] + version = '.'.join(version.split('.')[:2]) # vX.Y, ignore patch revision except (IndexError, ValueError) as err: # if this is an array style file, it will raise # IndexError on the array. @@ -247,36 +248,9 @@ def read_file(self) -> dict: records: OrderedDict{dict} records of Borealis rawacf data. 
Keys are first sequence timestamp (in ms since epoch). - """ - pyDARNio_log.info("Reading Borealis {} {} file: {}" - "".format(self.software_version, - self.borealis_filetype, self.filename)) - - attribute_types = self.format.site_single_element_types() - dataset_types = self.format.site_array_dtypes() - - self._read_borealis_records(attribute_types, dataset_types) - return self._records - - def _read_borealis_records(self, attribute_types: dict, - dataset_types: dict): - """ - Read the entire file while checking all data fields. - - Several Borealis field checks are done to insure the integrity of the - file. - - Parameters - ---------- - attribute_types: dict - Dictionary with the required types for the attributes in the file. - dataset_types: dict - Dictionary with the require dtypes for the numpy arrays in the - file. Raises ------ - OSError: file does not exist BorealisFieldMissingError - when a field is missing from the Borealis file/stream type BorealisExtraFieldError - when an extra field is present in the @@ -288,11 +262,19 @@ def _read_borealis_records(self, attribute_types: dict, -------- BorealisUtilities """ - records = dd.io.load(self.filename) + pyDARNio_log.info("Reading Borealis {} {} file: {}" + "".format(self.software_version, + self.borealis_filetype, self.filename)) + + attribute_types = self.format.site_single_element_types() + dataset_types = self.format.site_array_dtypes() + + records = self.format._read_borealis_records(self.filename) BorealisUtilities.check_records(self.filename, records, attribute_types, dataset_types) self._records = OrderedDict(sorted(records.items())) + return self._records class BorealisSiteWrite(): diff --git a/setup.py b/setup.py index 7034341..1af436b 100644 --- a/setup.py +++ b/setup.py @@ -49,8 +49,8 @@ author="SuperDARN", include_package_data=True, setup_requires=['pyyaml', 'numpy', - 'h5py', 'deepdish', 'pathlib2'], + 'h5py>=3.3.0', 'deepdish', 'pathlib2'], # pyyaml library install install_requires=['pyyaml', 'numpy', - 'h5py', 'deepdish', 'pathlib2'] + 'h5py>=3.3.0', 'deepdish', 'pathlib2'] ) From 7c53e99eef0134a763ff29708e7b5fe43d303d75 Mon Sep 17 00:00:00 2001 From: Remington Rohel Date: Mon, 20 Mar 2023 15:34:25 +0000 Subject: [PATCH 02/18] Added logic for unpacking deepdish strings from HDF5 files. --- pydarnio/borealis/base_format.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/pydarnio/borealis/base_format.py b/pydarnio/borealis/base_format.py index 297c943..f1ec6c8 100644 --- a/pydarnio/borealis/base_format.py +++ b/pydarnio/borealis/base_format.py @@ -1264,9 +1264,13 @@ class methods used inside this method should be specific # Get the datasets (vector fields) datasets = list(group.keys()) for dset_name in datasets: - dset = group[dset_name][:] - # TODO: Handle data_descriptors, correlation_descriptors fields (they are gross) - rec_dict[dset_name] = dset + dset = group[dset_name] + if 'strtype' in dset.attrs: # string type, requires some handling + itemsize = dset.attrs['itemsize'] + data = dset[:].view(dtype=(np.unicode_, itemsize)) + else: + data = dset[:] # non-string, can simply load + rec_dict[dset_name] = data # Get the attributes (scalar fields) attribute_dict = {k: v for k, v in group.attrs.items()} From 0ef1553401dead4907522c40edfac534377daf29 Mon Sep 17 00:00:00 2001 From: Remington Rohel Date: Mon, 20 Mar 2023 15:54:19 +0000 Subject: [PATCH 03/18] Fixed a bug with reading in the version. 
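
The bug: the first patch stored the parsed hash in 'full_version' but then
split the still-undefined local 'version', so every site-file read raised
UnboundLocalError during version detection. For illustration, a minimal
standalone sketch of the corrected logic (the 'borealis_git_hash' record
attribute and the site-file layout are as assumed throughout this series):

    import h5py

    def site_file_version(filename: str) -> str:
        """Return the Borealis version of a site file as 'vX.Y'."""
        with h5py.File(filename, 'r') as f:
            # Site files hold one top-level group per record; each record
            # carries a 'borealis_git_hash' attribute such as 'v0.6.1-<sha>'.
            first_rec = f[sorted(f.keys())[0]]
            git_hash = first_rec.attrs['borealis_git_hash']
            if isinstance(git_hash, bytes):  # h5py may return fixed-length
                git_hash = git_hash.decode('utf-8')  # strings as bytes
            full_version = git_hash.split('-')[0]         # e.g. 'v0.6.1'
            return '.'.join(full_version.split('.')[:2])  # 'v0.6', drop patch
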
--- pydarnio/borealis/borealis_site.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pydarnio/borealis/borealis_site.py b/pydarnio/borealis/borealis_site.py index b20d02d..cc9faa8 100644 --- a/pydarnio/borealis/borealis_site.py +++ b/pydarnio/borealis/borealis_site.py @@ -129,7 +129,7 @@ def __init__(self, filename: str, borealis_filetype: str): records = sorted(list(f.keys())) first_rec = f[records[0]] full_version = first_rec.attrs['borealis_git_hash'].decode('utf-8').split('-')[0] - version = '.'.join(version.split('.')[:2]) # vX.Y, ignore patch revision + version = '.'.join(full_version.split('.')[:2]) # vX.Y, ignore patch revision except (IndexError, ValueError) as err: # if this is an array style file, it will raise # IndexError on the array. From c0df902c0962716e8104895963f9ba8f3c144519 Mon Sep 17 00:00:00 2001 From: carleyjmartin Date: Mon, 20 Mar 2023 16:20:35 -0600 Subject: [PATCH 04/18] all instances of dd changed to h5py, not tested --- .gitignore | 1 + pydarnio/borealis/borealis_array.py | 26 +++++++++------ pydarnio/borealis/borealis_convert.py | 1 - pydarnio/borealis/borealis_restructure.py | 39 ++++++++++++----------- pydarnio/borealis/borealis_site.py | 6 ++-- pydarnio/borealis/borealis_utilities.py | 15 +++++---- 6 files changed, 50 insertions(+), 38 deletions(-) diff --git a/.gitignore b/.gitignore index b6e4761..ed783b6 100644 --- a/.gitignore +++ b/.gitignore @@ -2,6 +2,7 @@ __pycache__/ *.py[cod] *$py.class +*.DS_Store # C extensions *.so diff --git a/pydarnio/borealis/borealis_array.py b/pydarnio/borealis/borealis_array.py index 4ff6afd..0875406 100644 --- a/pydarnio/borealis/borealis_array.py +++ b/pydarnio/borealis/borealis_array.py @@ -37,7 +37,7 @@ For more information on Borealis data files and how they convert to SDarn files, see: https://borealis.readthedocs.io/en/latest/ """ -import deepdish as dd +import h5py import logging from typing import List @@ -115,9 +115,11 @@ def __init__(self, filename: str, borealis_filetype: str): # get the version of the file - split by the dash, first part should be # 'vX.X' try: - version = dd.io.load(self.filename, - group='/borealis_git_hash').split('-')[0] - version = '.'.join(version.split('.')[:2]) # vX.Y, ignore patch revision + with h5py.File(self.filename, 'r') as f: + records = sorted(list(f.keys())) + first_rec = f[records[0]] + full_version = first_rec.attrs['borealis_git_hash'].decode('utf-8').split('-')[0] + version = '.'.join(full_version.split('.')[:2]) # vX.Y, ignore patch revision except ValueError as err: raise borealis_exceptions.BorealisStructureError( ' {} Could not find the borealis_git_hash required to ' @@ -279,11 +281,14 @@ def _read_borealis_arrays(self, attribute_types: dict, -------- BorealisUtilities """ - arrays = dd.io.load(self.filename) - BorealisUtilities.check_arrays(self.filename, arrays, - attribute_types, dataset_types, - unshared_fields) - self._arrays = arrays + attr_types = self.format.site_single_element_types() + dataset_types = self.format.site_array_dtypes() + records = self.format._read_borealis_records + while h5py.File(self.filename, 'r') as arrays: + BorealisUtilities.check_arrays(self.filename, arrays, + attribute_types, dataset_types, + unshared_fields) + self._arrays = arrays class BorealisArrayWrite(): @@ -520,4 +525,5 @@ def _write_borealis_arrays(self, attribute_types: dict, BorealisUtilities.check_arrays(self.filename, self.arrays, attribute_types, dataset_types, unshared_fields) - dd.io.save(self.filename, self.arrays, compression=self.compression) + 
with h5py.File(self.filename, 'w') as f: + f.create_dataset(self.arrays, compression=self.compression) diff --git a/pydarnio/borealis/borealis_convert.py b/pydarnio/borealis/borealis_convert.py index 785b129..3e8eda3 100644 --- a/pydarnio/borealis/borealis_convert.py +++ b/pydarnio/borealis/borealis_convert.py @@ -42,7 +42,6 @@ """ import logging import numpy as np -import deepdish as dd from datetime import datetime from typing import Union diff --git a/pydarnio/borealis/borealis_restructure.py b/pydarnio/borealis/borealis_restructure.py index 228d608..aed2938 100755 --- a/pydarnio/borealis/borealis_restructure.py +++ b/pydarnio/borealis/borealis_restructure.py @@ -36,7 +36,6 @@ import warnings from pathlib import Path import h5py -import deepdish as dd import logging import numpy as np from datetime import datetime @@ -210,20 +209,23 @@ def _array_to_site_restructure(self): try: shared_fields_dict = dict() # shared fields are common across records, so this is done once - for field in self.format.shared_fields(): - field_data = dd.io.load(self.infile_name, '/{}'.format(field)) - shared_fields_dict[field] = field_data + with hdf5.File(self.infile_name, 'r') as f: + for field in self.format.shared_fields(): + shared_fields_dict[field] = f[field] unshared_single_elements = dict() # These are fields which have one element per record, so the # arrays are small enough to be loaded completely into memory - for field in self.format.unshared_fields(): - if field in self.format.single_element_types(): - unshared_single_elements[field] = dd.io.load( - self.infile_name, '/{}'.format(field)) + with hdf5.File(self.infile_name, 'r') as f: + for field in self.format.unshared_fields(): + if field in self.format.single_element_types(): + unshared_single_elements[field] = f[field] - sqn_timestamps_array = dd.io.load(self.infile_name, - '/sqn_timestamps') + with h5py.File(self.infile_name, 'r') as f: + records = sorted(list(f.keys())) + first_rec = f[records[0]] + sqn_timestamps_array = first_rec.attrs['sqn_timestamps'] + .decode('utf-8') for record_num, seq_timestamp in enumerate(sqn_timestamps_array): # format dictionary key in the same way it is done # in datawrite on site @@ -279,7 +281,8 @@ def _array_to_site_restructure(self): index_slice = tuple(index_slice) # If there was an incorrect dimension (-1 in dims), then use deepdish to extract the field if field_flag: - record_dict[field] = dd.io.load(self.infile_name, f'/{field}')[index_slice] + with h5py.File(self.infile_name) as f: + record_dict[field] = f[field][index_slice] else: record_dict[field] = f[field][index_slice] # Wrap in another dict to use the format method @@ -333,7 +336,7 @@ def _site_to_array_restructure(self): rec_dict.update({k: record.attrs[k] for k in rec_attrs}) # Bitwise fields also need to be handled separately for field in self.format.bool_types(): - rec_dict[field] = dd.io.load(self.infile_name, f'/{record_name}/{field}') + rec_dict[field] = f[record_name][field] # some fields are linear in site style and need to be reshaped. 
# Pass in record nested in a dictionary, as @@ -360,7 +363,7 @@ def _site_to_array_restructure(self): raise TypeError(f'Field {field} has unrecognized data: {value}') elif field in self.format.array_string_fields(): # h5py reads numpy string arrays as contiguous unsigned ints, so we need deepdish here - new_data_dict[field] = dd.io.load(self.infile_name, f'/{record_name}/{field}') + new_data_dict[field] = f[record_name][field] else: raise TypeError(f'Field {field} unrecognized') @@ -436,8 +439,8 @@ def _site_to_array_restructure(self): BorealisUtilities.check_arrays(self.infile_name, new_data_dict, attribute_types, dataset_types, unshared_fields) - dd.io.save(self.outfile_name, new_data_dict, - compression=self.compression) + while h5py.File(self.outfile_name, 'w') as f: + f.create_dataset(new_data_dict, compression=self.compression) except TypeError as err: raise borealis_exceptions.BorealisRestructureError( @@ -487,9 +490,9 @@ def _write_borealis_record(self, record: dict, record_name: str, tmp_filename = self.outfile_name + '.tmp' Path(tmp_filename).touch() - dd.io.save(tmp_filename, record[record_name], - compression=self.compression) - f = dd.io.load(tmp_filename, '/') + while h5py.File(tmp_filename, 'w') as f: + f.create_dataset(record[record_name], compression=self.compression) + cp_cmd = 'h5copy -i {newfile} -o {full_file} -s / -d {dtstr}' cmd = cp_cmd.format(newfile=tmp_filename, full_file=self.outfile_name, dtstr=record_name) diff --git a/pydarnio/borealis/borealis_site.py b/pydarnio/borealis/borealis_site.py index cc9faa8..6470ad2 100644 --- a/pydarnio/borealis/borealis_site.py +++ b/pydarnio/borealis/borealis_site.py @@ -35,7 +35,6 @@ Add compression to bzip2 """ -import deepdish as dd import h5py import logging import os @@ -523,8 +522,9 @@ def _write_borealis_records(self, attribute_types: dict, tmp_filename = self.filename + '.tmp' Path(tmp_filename).touch() for group_name, group_dict in self.records.items(): - dd.io.save(tmp_filename, {str(group_name): group_dict}, - compression=self.compression) + with h5py.File(tmp_filename, 'w') as f: + f.create_dataset(str(group_name), data=group_dict, + compression=self.compression) cp_cmd = 'h5copy -i {newfile} -o {full_file} -s {dtstr} -d {dtstr}' cmd = cp_cmd.format(newfile=tmp_filename, full_file=self.filename, dtstr='/'+str(group_name)) diff --git a/pydarnio/borealis/borealis_utilities.py b/pydarnio/borealis/borealis_utilities.py index 10929c2..0e5856c 100644 --- a/pydarnio/borealis/borealis_utilities.py +++ b/pydarnio/borealis/borealis_utilities.py @@ -28,7 +28,6 @@ """ import logging -import deepdish as dd import h5py import numpy as np import sys @@ -560,8 +559,10 @@ def get_borealis_version(filename: str, record_names, structure: str): """ if structure == 'array': try: - borealis_git_hash = dd.io.load(filename, - group='/borealis_git_hash') + with h5py.File(self.filename, 'r') as f: + records = sorted(list(f.keys())) + borealis_git_hash = records.attrs['borealis_git_hash'] + .decode('utf-8') except ValueError as err: raise borealis_exceptions.BorealisStructureError( ' {} Could not find the borealis_git_hash required to ' @@ -569,9 +570,11 @@ def get_borealis_version(filename: str, record_names, structure: str): ''.format(filename, err)) from err elif structure == 'site': try: - borealis_git_hash = \ - dd.io.load(filename, group='/{}/borealis_git_hash' - ''.format(record_names[0])) + with h5py.File(self.filename, 'r') as f: + records = sorted(list(f.keys())) + first_rec = f[records[0]] + borealis_git_hash = 
first_rec.attrs['borealis_git_hash'] + .decode('utf-8') except ValueError as err: raise borealis_exceptions.BorealisStructureError( ' {} Could not find the borealis_git_hash required to ' From dce1d6c4b1b55d69d50a98f1221cbe4dad19061d Mon Sep 17 00:00:00 2001 From: carleyjmartin Date: Tue, 21 Mar 2023 10:30:13 -0600 Subject: [PATCH 05/18] git hash in borealis_array working --- pydarnio/borealis/base_format.py | 2 +- pydarnio/borealis/borealis_array.py | 7 +++---- pydarnio/borealis/borealis_restructure.py | 8 ++++---- pydarnio/borealis/borealis_utilities.py | 4 ++-- 4 files changed, 10 insertions(+), 11 deletions(-) diff --git a/pydarnio/borealis/base_format.py b/pydarnio/borealis/base_format.py index f1ec6c8..c00bc91 100644 --- a/pydarnio/borealis/base_format.py +++ b/pydarnio/borealis/base_format.py @@ -37,12 +37,12 @@ """ import copy +import h5py import numpy as np from collections import OrderedDict from datetime import datetime from typing import Callable, List -import h5py from pydarnio import borealis_exceptions diff --git a/pydarnio/borealis/borealis_array.py b/pydarnio/borealis/borealis_array.py index 0875406..0ba4045 100644 --- a/pydarnio/borealis/borealis_array.py +++ b/pydarnio/borealis/borealis_array.py @@ -37,6 +37,7 @@ For more information on Borealis data files and how they convert to SDarn files, see: https://borealis.readthedocs.io/en/latest/ """ +import deepdish as dd import h5py import logging @@ -116,9 +117,7 @@ def __init__(self, filename: str, borealis_filetype: str): # 'vX.X' try: with h5py.File(self.filename, 'r') as f: - records = sorted(list(f.keys())) - first_rec = f[records[0]] - full_version = first_rec.attrs['borealis_git_hash'].decode('utf-8').split('-')[0] + full_version = f.attrs['borealis_git_hash'].decode('utf-8').split('-')[0] version = '.'.join(full_version.split('.')[:2]) # vX.Y, ignore patch revision except ValueError as err: raise borealis_exceptions.BorealisStructureError( @@ -284,7 +283,7 @@ def _read_borealis_arrays(self, attribute_types: dict, attr_types = self.format.site_single_element_types() dataset_types = self.format.site_array_dtypes() records = self.format._read_borealis_records - while h5py.File(self.filename, 'r') as arrays: + with h5py.File(self.filename, 'r') as arrays: BorealisUtilities.check_arrays(self.filename, arrays, attribute_types, dataset_types, unshared_fields) diff --git a/pydarnio/borealis/borealis_restructure.py b/pydarnio/borealis/borealis_restructure.py index aed2938..54204e2 100755 --- a/pydarnio/borealis/borealis_restructure.py +++ b/pydarnio/borealis/borealis_restructure.py @@ -219,12 +219,12 @@ def _array_to_site_restructure(self): with hdf5.File(self.infile_name, 'r') as f: for field in self.format.unshared_fields(): if field in self.format.single_element_types(): - unshared_single_elements[field] = f[field] + unshared_single_elements[field] = f[field] with h5py.File(self.infile_name, 'r') as f: records = sorted(list(f.keys())) first_rec = f[records[0]] - sqn_timestamps_array = first_rec.attrs['sqn_timestamps'] + sqn_timestamps_array = first_rec.attrs['sqn_timestamps']\ .decode('utf-8') for record_num, seq_timestamp in enumerate(sqn_timestamps_array): # format dictionary key in the same way it is done @@ -439,7 +439,7 @@ def _site_to_array_restructure(self): BorealisUtilities.check_arrays(self.infile_name, new_data_dict, attribute_types, dataset_types, unshared_fields) - while h5py.File(self.outfile_name, 'w') as f: + with h5py.File(self.outfile_name, 'w') as f: f.create_dataset(new_data_dict, 
compression=self.compression) except TypeError as err: @@ -490,7 +490,7 @@ def _write_borealis_record(self, record: dict, record_name: str, tmp_filename = self.outfile_name + '.tmp' Path(tmp_filename).touch() - while h5py.File(tmp_filename, 'w') as f: + with h5py.File(tmp_filename, 'w') as f: f.create_dataset(record[record_name], compression=self.compression) cp_cmd = 'h5copy -i {newfile} -o {full_file} -s / -d {dtstr}' diff --git a/pydarnio/borealis/borealis_utilities.py b/pydarnio/borealis/borealis_utilities.py index 0e5856c..3ac1e52 100644 --- a/pydarnio/borealis/borealis_utilities.py +++ b/pydarnio/borealis/borealis_utilities.py @@ -561,7 +561,7 @@ def get_borealis_version(filename: str, record_names, structure: str): try: with h5py.File(self.filename, 'r') as f: records = sorted(list(f.keys())) - borealis_git_hash = records.attrs['borealis_git_hash'] + borealis_git_hash = records.attrs['borealis_git_hash']\ .decode('utf-8') except ValueError as err: raise borealis_exceptions.BorealisStructureError( @@ -573,7 +573,7 @@ def get_borealis_version(filename: str, record_names, structure: str): with h5py.File(self.filename, 'r') as f: records = sorted(list(f.keys())) first_rec = f[records[0]] - borealis_git_hash = first_rec.attrs['borealis_git_hash'] + borealis_git_hash = first_rec.attrs['borealis_git_hash']\ .decode('utf-8') except ValueError as err: raise borealis_exceptions.BorealisStructureError( From b70c9ed5a77ee904feb52802718c78e562315aea Mon Sep 17 00:00:00 2001 From: Remington Rohel Date: Tue, 21 Mar 2023 17:45:23 +0000 Subject: [PATCH 06/18] Modifying expected types based on reading in with h5py. * Can now read in site file and convert to array format. * Still need to fix reading/writing array format. --- pydarnio/borealis/base_format.py | 4 +- pydarnio/borealis/borealis_array.py | 2 +- pydarnio/borealis/borealis_formats.py | 70 ++++++++++++------------- pydarnio/borealis/borealis_utilities.py | 9 ++-- 4 files changed, 44 insertions(+), 41 deletions(-) diff --git a/pydarnio/borealis/base_format.py b/pydarnio/borealis/base_format.py index c00bc91..5f17b02 100644 --- a/pydarnio/borealis/base_format.py +++ b/pydarnio/borealis/base_format.py @@ -1102,7 +1102,7 @@ class methods used inside this method should be specific datatype = cls.single_element_types()[field] else: # field in array_dtypes datatype = cls.array_dtypes()[field] - if datatype == np.unicode_: + if datatype == str: # unicode type needs to be explicitly set to have # multiple chars (256) datatype='|U256' @@ -1110,7 +1110,7 @@ class methods used inside this method should be specific # Some indices may not be filled due to dimensions that are maximum values (num_sequences, etc. can change # between records), so they are initialized with a known value first. # Initialize floating-point values to NaN, and integer values to -1. 
- if datatype is np.int64 or datatype is np.uint32: + if datatype is np.int64 or datatype is np.uint32 or datatype is np.uint8: empty_array[:] = -1 else: empty_array[:] = np.NaN diff --git a/pydarnio/borealis/borealis_array.py b/pydarnio/borealis/borealis_array.py index 0ba4045..d955c17 100644 --- a/pydarnio/borealis/borealis_array.py +++ b/pydarnio/borealis/borealis_array.py @@ -119,7 +119,7 @@ def __init__(self, filename: str, borealis_filetype: str): with h5py.File(self.filename, 'r') as f: full_version = f.attrs['borealis_git_hash'].decode('utf-8').split('-')[0] version = '.'.join(full_version.split('.')[:2]) # vX.Y, ignore patch revision - except ValueError as err: + except KeyError as err: raise borealis_exceptions.BorealisStructureError( ' {} Could not find the borealis_git_hash required to ' 'determine read version (file may be site style) {}' diff --git a/pydarnio/borealis/borealis_formats.py b/pydarnio/borealis/borealis_formats.py index a247a85..ccc720e 100644 --- a/pydarnio/borealis/borealis_formats.py +++ b/pydarnio/borealis/borealis_formats.py @@ -262,19 +262,19 @@ def single_element_types(cls): return { # Identifies the version of Borealis that made this data. Necessary # for all versions. - "borealis_git_hash": np.unicode_, + "borealis_git_hash": str, # Number used to identify experiment. "experiment_id": np.int64, # Name of the experiment file. - "experiment_name": np.unicode_, + "experiment_name": str, # Comment about the whole experiment - "experiment_comment": np.unicode_, + "experiment_comment": str, # Additional text comment that describes the slice. - "slice_comment": np.unicode_, + "slice_comment": str, # Number of slices in the experiment at this integration time. "num_slices": np.int64, # Three letter radar identifier. - "station": np.unicode_, + "station": str, # Number of sampling periods in the integration time. "num_sequences": np.int64, # range gate separation (equivalent distance between samples), km. @@ -286,7 +286,7 @@ def single_element_types(cls): # Sampling rate of the samples being written to file in Hz. "rx_sample_rate": np.float64, # Designates if the record is the first in a scan. - "scan_start_marker": np.bool_, + "scan_start_marker": np.uint8, # Integration time in seconds. "int_time": np.float32, # Length of the pulse in microseconds. @@ -302,7 +302,7 @@ def single_element_types(cls): "freq": np.uint32, # str denoting C data type of the samples included in the data # array, such as 'complex float'. - "samples_data_type": np.unicode_, + "samples_data_type": str, # data normalization factor determined by the filter scaling in the # decimation scheme. "data_normalization_factor": np.float64, @@ -672,25 +672,25 @@ def single_element_types(cls): return { # Identifies the version of Borealis that made this data. Necessary # for all versions. - "borealis_git_hash": np.unicode_, + "borealis_git_hash": str, # Number used to identify experiment. "experiment_id": np.int64, # Name of the experiment file. - "experiment_name": np.unicode_, + "experiment_name": str, # Comment about the whole experiment - "experiment_comment": np.unicode_, + "experiment_comment": str, # Additional text comment that describes the slice. - "slice_comment": np.unicode_, + "slice_comment": str, # Number of slices in the experiment at this integration time. "num_slices": np.int64, # Three letter radar identifier. - "station": np.unicode_, + "station": str, # Number of sampling periods in the integration time. 
"num_sequences": np.int64, # Sampling rate of the samples being written to file in Hz. "rx_sample_rate": np.float64, # Designates if the record is the first in a scan. - "scan_start_marker": np.bool_, + "scan_start_marker": np.uint8, # Integration time in seconds. "int_time": np.float32, # Length of the pulse in microseconds. @@ -706,7 +706,7 @@ def single_element_types(cls): "freq": np.uint32, # str denoting C data type of the samples included in the data # array, such as 'complex float'. - "samples_data_type": np.unicode_, + "samples_data_type": str, # Number of samples in the sampling period. "num_samps": np.uint32, # range gate separation (equivalent distance between samples), km @@ -1074,25 +1074,25 @@ def single_element_types(cls): return { # Identifies the version of Borealis that made this data. Necessary # for all versions. - "borealis_git_hash": np.unicode_, + "borealis_git_hash": str, # Number used to identify experiment. "experiment_id": np.int64, # Name of the experiment file. - "experiment_name": np.unicode_, + "experiment_name": str, # Comment about the whole experiment - "experiment_comment": np.unicode_, + "experiment_comment": str, # Additional text comment that describes the slice. - "slice_comment": np.unicode_, + "slice_comment": str, # Number of slices in the experiment at this integration time. "num_slices": np.int64, # Three letter radar identifier. - "station": np.unicode_, + "station": str, # Number of sampling periods in the integration time. "num_sequences": np.int64, # Sampling rate of the samples being written to file in Hz. "rx_sample_rate": np.float64, # Designates if the record is the first in a scan. - "scan_start_marker": np.bool_, + "scan_start_marker": np.uint8, # Integration time in seconds. "int_time": np.float32, # Length of the pulse in microseconds. @@ -1108,7 +1108,7 @@ def single_element_types(cls): "freq": np.uint32, # str denoting C data type of the samples included in the data # array, such as 'complex float'. - "samples_data_type": np.unicode_, + "samples_data_type": str, # Number of samples in the sampling period. "num_samps": np.uint32, # data normalization factor determined by the filter scaling in the @@ -1402,23 +1402,23 @@ def single_element_types(cls): return { # Identifies the version of Borealis that made this data. Necessary # for all versions. - "borealis_git_hash": np.unicode_, + "borealis_git_hash": str, # Number used to identify experiment. "experiment_id": np.int64, # Name of the experiment file. - "experiment_name": np.unicode_, + "experiment_name": str, # Comment about the whole experiment - "experiment_comment": np.unicode_, + "experiment_comment": str, # Number of slices in the experiment at this integration time. "num_slices": np.int64, # Three letter radar identifier. - "station": np.unicode_, + "station": str, # Number of sampling periods in the integration time. "num_sequences": np.int64, # Sampling rate of the samples being written to file in Hz. "rx_sample_rate": np.float64, # Designates if the record is the first in a scan. - "scan_start_marker": np.bool_, + "scan_start_marker": np.uint8, # Integration time in seconds. "int_time": np.float32, # Number of main array antennas. @@ -1427,7 +1427,7 @@ def single_element_types(cls): "intf_antenna_count": np.uint32, # str denoting C data type of the samples included in the data # array, such as 'complex float'. 
- "samples_data_type": np.unicode_, + "samples_data_type": str, # The center frequency of this data in kHz "rx_center_freq": np.float64, # Number of samples in the sampling period. @@ -1513,12 +1513,12 @@ def single_element_types(cls): # the slice id of the file and dataset. "slice_id": np.uint32, # the interfacing of this slice to other slices. - "slice_interfacing": np.unicode_, + "slice_interfacing": str, # A string describing the type of scheduling time at the time of # this dataset. - "scheduling_mode": np.unicode_, + "scheduling_mode": str, # A string describing the averaging method, ex. mean, median - "averaging_method": np.unicode_, + "averaging_method": str, # number of blanked samples in the sequence. "num_blanked_samples": np.uint32 }) @@ -1663,10 +1663,10 @@ def single_element_types(cls): # the slice id of the file and dataset. "slice_id": np.uint32, # the interfacing of this slice to other slices. - "slice_interfacing": np.unicode_, + "slice_interfacing": str, # A string describing the type of scheduling time at the time of # this dataset. - "scheduling_mode": np.unicode_, + "scheduling_mode": str, # number of blanked samples in the sequence. "num_blanked_samples": np.uint32 }) @@ -1806,10 +1806,10 @@ def single_element_types(cls): # the slice id of the file and dataset. "slice_id": np.uint32, # the interfacing of this slice to other slices. - "slice_interfacing": np.unicode_, + "slice_interfacing": str, # A string describing the type of scheduling time at the time of # this dataset. - "scheduling_mode": np.unicode_, + "scheduling_mode": str, # number of blanked samples in the sequence. "num_blanked_samples": np.uint32 }) @@ -1955,7 +1955,7 @@ def single_element_types(cls): single_element_types.update({ # A string describing the type of scheduling time at the time of # this dataset. 
- "scheduling_mode": np.unicode_ + "scheduling_mode": str }) return single_element_types diff --git a/pydarnio/borealis/borealis_utilities.py b/pydarnio/borealis/borealis_utilities.py index 3ac1e52..b6abc16 100644 --- a/pydarnio/borealis/borealis_utilities.py +++ b/pydarnio/borealis/borealis_utilities.py @@ -262,7 +262,8 @@ def record_incorrect_types_check(filename: str, attributes_type_dict: dict, incorrect_types_check = {param: str(attributes_type_dict[param]) for param in attributes_type_dict.keys() if type(record[param]) != - attributes_type_dict[param]} + attributes_type_dict[param] and + record[param].shape is not None} incorrect_types_check.update({param: 'np.ndarray of ' + str(datasets_type_dict[param]) @@ -321,7 +322,8 @@ def array_incorrect_types_check(filename: str, attributes_type_dict: dict, incorrect_types_check = {param: str(attributes_type_dict[param]) for param in attributes_type_dict.keys() if type(file_data[param]) != - attributes_type_dict[param]} + attributes_type_dict[param] and + file_data[param].shape is not None} datasets_type_dict_keys = sorted(list(datasets_type_dict.keys())) np_array_types = [isinstance(file_data[param], np.ndarray) for param in @@ -342,7 +344,8 @@ def array_incorrect_types_check(filename: str, attributes_type_dict: dict, str(datasets_type_dict[param]) for param in datasets_type_dict.keys() if file_data[param].dtype.type != - datasets_type_dict[param]}) + datasets_type_dict[param] and + file_data[param].dtype.type != np.str_}) if len(incorrect_types_check) > 0: raise borealis_exceptions.\ BorealisDataFormatTypeError(filename, From 6fafb57898ce6be5c06d1163c75023a06eecb720 Mon Sep 17 00:00:00 2001 From: Remington Rohel Date: Tue, 21 Mar 2023 19:34:05 +0000 Subject: [PATCH 07/18] Fixed bugs converting from site to array structures. 
* Bools stored as uint8 * Reading in arrays of strings requires some care --- pydarnio/borealis/borealis_array.py | 12 +++++++++++- pydarnio/borealis/borealis_formats.py | 8 ++++---- pydarnio/borealis/borealis_restructure.py | 22 ++++++++++++++++------ pydarnio/borealis/borealis_utilities.py | 10 ++++------ 4 files changed, 35 insertions(+), 17 deletions(-) diff --git a/pydarnio/borealis/borealis_array.py b/pydarnio/borealis/borealis_array.py index d955c17..8b3719e 100644 --- a/pydarnio/borealis/borealis_array.py +++ b/pydarnio/borealis/borealis_array.py @@ -40,6 +40,7 @@ import deepdish as dd import h5py import logging +import numpy as np from typing import List @@ -525,4 +526,13 @@ def _write_borealis_arrays(self, attribute_types: dict, attribute_types, dataset_types, unshared_fields) with h5py.File(self.filename, 'w') as f: - f.create_dataset(self.arrays, compression=self.compression) + for k, v in self.arrays.items(): + if k in attribute_types: + f.attrs[k] = v + elif v.dtype.type == np.str_: + itemsize = v.dtype.itemsize // 4 # every character is 4 bytes + dset = f.create_dataset(k, data=v.view(dtype=(np.uint8)), compression=self.compression) + dset.attrs['strtype'] = b'unicode' + dset.attrs['itemsize'] = itemsize + else: + f.create_dataset(k, data=v, compression=self.compression) diff --git a/pydarnio/borealis/borealis_formats.py b/pydarnio/borealis/borealis_formats.py index ccc720e..e664756 100644 --- a/pydarnio/borealis/borealis_formats.py +++ b/pydarnio/borealis/borealis_formats.py @@ -2044,7 +2044,7 @@ def single_element_types(cls): "lp_status_word": np.uint32, # Boolean indicating if the GPS was locked during the entire # integration period - "gps_locked": np.bool_, + "gps_locked": np.uint8, # The max time diffe between GPS and system time during the # integration period. In seconds. Negative if GPS time ahead. "gps_to_system_time_diff": np.float64, @@ -2152,7 +2152,7 @@ def single_element_types(cls): "lp_status_word": np.uint32, # Boolean indicating if the GPS was locked during the entire # integration period - "gps_locked": np.bool_, + "gps_locked": np.uint8, # The max time diffe between GPS and system time during the # integration period. In seconds. Negative if GPS time ahead. "gps_to_system_time_diff": np.float64, @@ -2267,7 +2267,7 @@ def single_element_types(cls): "lp_status_word": np.uint32, # Boolean indicating if the GPS was locked during the entire # integration period - "gps_locked": np.bool_, + "gps_locked": np.uint8, # The max time diffe between GPS and system time during the # integration period. In seconds. Negative if GPS time ahead. "gps_to_system_time_diff": np.float64, @@ -2374,7 +2374,7 @@ def single_element_types(cls): "lp_status_word": np.uint32, # Boolean indicating if the GPS was locked during the entire # integration period - "gps_locked": np.bool_, + "gps_locked": np.uint8, # The max time diffe between GPS and system time during the # integration period. In seconds. Negative if GPS time ahead. 
"gps_to_system_time_diff": np.float64, diff --git a/pydarnio/borealis/borealis_restructure.py b/pydarnio/borealis/borealis_restructure.py index 54204e2..fb583f8 100755 --- a/pydarnio/borealis/borealis_restructure.py +++ b/pydarnio/borealis/borealis_restructure.py @@ -362,8 +362,9 @@ def _site_to_array_restructure(self): else: raise TypeError(f'Field {field} has unrecognized data: {value}') elif field in self.format.array_string_fields(): - # h5py reads numpy string arrays as contiguous unsigned ints, so we need deepdish here - new_data_dict[field] = f[record_name][field] + dset = f[record_name][field] + itemsize = dset.attrs['itemsize'] + new_data_dict[field] = dset[:].view(dtype=(np.unicode_, itemsize)) else: raise TypeError(f'Field {field} unrecognized') @@ -377,7 +378,7 @@ def _site_to_array_restructure(self): # Initialize array now with correct data type. dtype = self.format.single_element_types()[field] new_data_dict[field] = np.empty(num_records, dtype=dtype) - if dtype is np.int64 or dtype is np.uint32: + if dtype is np.int64 or dtype is np.uint32 or dtype is np.uint8: new_data_dict[field][:] = -1 else: new_data_dict[field][:] = np.NaN @@ -399,7 +400,7 @@ def _site_to_array_restructure(self): datatype = self.format.single_element_types()[field] else: # field in array_dtypes datatype = self.format.array_dtypes()[field] - if datatype == np.unicode_: + if datatype == str: # unicode type needs to be explicitly set to # have multiple chars (256) datatype = '|U256' @@ -409,7 +410,7 @@ def _site_to_array_restructure(self): # change between records), so they are initialized # with a known value first. Initialize floating- # point values to NaN, and integer values to -1. - if datatype is np.int64 or datatype is np.uint32: + if datatype is np.int64 or datatype is np.uint32 or datatype is np.uint8: empty_array[:] = -1 else: empty_array[:] = np.NaN @@ -440,7 +441,16 @@ def _site_to_array_restructure(self): attribute_types, dataset_types, unshared_fields) with h5py.File(self.outfile_name, 'w') as f: - f.create_dataset(new_data_dict, compression=self.compression) + for k, v in new_data_dict.items(): + if k in attribute_types: + f.attrs[k] = v + elif v.dtype.type == np.str_: + itemsize = v.dtype.itemsize // 4 # every character is 4 bytes + dset = f.create_dataset(k, data=v.view(dtype=(np.uint8)), compression=self.compression) + dset.attrs['strtype'] = b'unicode' + dset.attrs['itemsize'] = itemsize + else: + f.create_dataset(k, data=v, compression=self.compression) except TypeError as err: raise borealis_exceptions.BorealisRestructureError( diff --git a/pydarnio/borealis/borealis_utilities.py b/pydarnio/borealis/borealis_utilities.py index b6abc16..bc2e636 100644 --- a/pydarnio/borealis/borealis_utilities.py +++ b/pydarnio/borealis/borealis_utilities.py @@ -563,22 +563,20 @@ def get_borealis_version(filename: str, record_names, structure: str): if structure == 'array': try: with h5py.File(self.filename, 'r') as f: - records = sorted(list(f.keys())) - borealis_git_hash = records.attrs['borealis_git_hash']\ - .decode('utf-8') - except ValueError as err: + borealis_git_hash = f.attrs['borealis_git_hash'].decode('utf-8') + except KeyError as err: raise borealis_exceptions.BorealisStructureError( ' {} Could not find the borealis_git_hash required to ' 'determine file version. Data file may be corrupted. 
{}' ''.format(filename, err)) from err elif structure == 'site': try: - with h5py.File(self.filename, 'r') as f: + with h5py.File(filename, 'r') as f: records = sorted(list(f.keys())) first_rec = f[records[0]] borealis_git_hash = first_rec.attrs['borealis_git_hash']\ .decode('utf-8') - except ValueError as err: + except KeyError as err: raise borealis_exceptions.BorealisStructureError( ' {} Could not find the borealis_git_hash required to ' 'determine file version. Data file may be corrupted. {}' From 9ee37fb51e2c3a080ec8de573418dc121470df38 Mon Sep 17 00:00:00 2001 From: Remington Rohel Date: Tue, 21 Mar 2023 20:40:46 +0000 Subject: [PATCH 08/18] Fixed all the rest of the bugs. * Can now read/write array structured files. * Can also restructure freely between site and array files, both with BorealisRestructure and BorealisRead plus .records or .arrays --- pydarnio/borealis/base_format.py | 53 +++++++++ pydarnio/borealis/borealis_array.py | 50 +------- pydarnio/borealis/borealis_restructure.py | 138 +++++++++++----------- pydarnio/borealis/borealis_site.py | 41 +++---- pydarnio/borealis/borealis_utilities.py | 2 +- 5 files changed, 146 insertions(+), 138 deletions(-) diff --git a/pydarnio/borealis/base_format.py b/pydarnio/borealis/base_format.py index 5f17b02..b5d23bb 100644 --- a/pydarnio/borealis/base_format.py +++ b/pydarnio/borealis/base_format.py @@ -1286,6 +1286,59 @@ class methods used inside this method should be specific return records + @classmethod + def _read_borealis_arrays(cls, filename: str) -> OrderedDict: + """ + Base function for reading in a Borealis array file. + + Parameters + ---------- + filename: str + Name of the file to load arrays from + + Returns + ------- + OrderedDict + a dict of arrays loaded from an hdf5 Borealis array file + + Raises + ------ + OSError: file does not exist + + Notes + ----- + The results will differ based on the format class, as many of the + class methods used inside this method should be specific + to the format and updated in the child class. + """ + arrays = OrderedDict() + with h5py.File(filename, 'r') as f: + + # Get the datasets (vector fields) + array_names = sorted(list(f.keys())) + for array_name in array_names: + dset = f[array_name] + if 'strtype' in dset.attrs: # string type, requires some handling + itemsize = dset.attrs['itemsize'] + data = dset[:].view(dtype=(np.unicode_, itemsize)) + else: + data = dset[:] # non-string, can simply load + arrays[array_name] = data + + # Get the attributes (scalar fields) + attribute_dict = {k: v for k, v in f.attrs.items()} + attribute_dict.pop('CLASS') # Inherent to HDF5 file + attribute_dict.pop('TITLE') # Inherent to HDF5 file + attribute_dict.pop('VERSION') # Inherent to HDF5 file + attribute_dict.pop('DEEPDISH_IO_VERSION') # Inherent to HDF5 file + attribute_dict.pop('PYTABLES_FORMAT_VERSION') # Inherent to HDF5 file + for k, v in attribute_dict.items(): + if isinstance(v, bytes): + attribute_dict[k] = v.tobytes().decode('utf-8') + arrays.update(attribute_dict) + + return arrays + # STATIC METHODS COMMON ACROSS FORMATS # i.e. 
common methods that can be used by multiple formats in restructuring # (generally these will be used in the unshared fields dims for arrays) diff --git a/pydarnio/borealis/borealis_array.py b/pydarnio/borealis/borealis_array.py index 8b3719e..5f193ae 100644 --- a/pydarnio/borealis/borealis_array.py +++ b/pydarnio/borealis/borealis_array.py @@ -244,51 +244,13 @@ def read_file(self) -> dict: dataset_types = self.format.array_array_dtypes() unshared_fields = self.format.unshared_fields() - self._read_borealis_arrays(attribute_types, dataset_types, - unshared_fields) - return self._arrays - - def _read_borealis_arrays(self, attribute_types: dict, - dataset_types: dict, - unshared_fields: List[str]): - """ - Read the entire file while checking all data fields. - - Parameters - ---------- - attribute_types: dict - Dictionary with the required types for the attributes in the file. - dataset_types: dict - Dictionary with the require dtypes for the numpy arrays in the - file. - unshared_fields: List[str] - List of fields that are not shared between the records and - therefore should be an array with first dimension = number of - records - - Raises - ------ - BorealisFieldMissingError - when a field is missing from the Borealis - file - BorealisExtraFieldError - when an extra field is present in the - Borealis file - BorealisDataFormatTypeError - when a field has the incorrect - field type for the Borealis file - BorealisNumberOfRecordsError - when the number of records cannot - be discerned from the arrays + arrays = self.format._read_borealis_arrays(self.filename) + BorealisUtilities.check_arrays(self.filename, arrays, + attribute_types, dataset_types, + unshared_fields) + self._arrays = arrays - See Also - -------- - BorealisUtilities - """ - attr_types = self.format.site_single_element_types() - dataset_types = self.format.site_array_dtypes() - records = self.format._read_borealis_records - with h5py.File(self.filename, 'r') as arrays: - BorealisUtilities.check_arrays(self.filename, arrays, - attribute_types, dataset_types, - unshared_fields) - self._arrays = arrays + return self._arrays class BorealisArrayWrite(): diff --git a/pydarnio/borealis/borealis_restructure.py b/pydarnio/borealis/borealis_restructure.py index fb583f8..53a6303 100755 --- a/pydarnio/borealis/borealis_restructure.py +++ b/pydarnio/borealis/borealis_restructure.py @@ -34,7 +34,6 @@ import os import subprocess as sp import warnings -from pathlib import Path import h5py import logging import numpy as np @@ -207,60 +206,64 @@ def _array_to_site_restructure(self): attribute_types = self.format.site_single_element_types() dataset_types = self.format.array_dtypes() try: - shared_fields_dict = dict() - # shared fields are common across records, so this is done once - with hdf5.File(self.infile_name, 'r') as f: + with h5py.File(self.infile_name, 'r') as f: + + # shared fields are common across records, so this is done once + shared_fields_dict = dict() for field in self.format.shared_fields(): - shared_fields_dict[field] = f[field] + if field in attribute_types: + data = f.attrs[field] + if isinstance(data, bytes): + data = str(data) + elif field in self.format.array_string_fields(): + dset = f[field] + itemsize = dset.attrs['itemsize'] + data = dset[:].view(dtype=(np.unicode_, itemsize)) + else: + data = f[field][:] + shared_fields_dict[field] = data - unshared_single_elements = dict() - # These are fields which have one element per record, so the - # arrays are small enough to be loaded completely into memory - with 
hdf5.File(self.infile_name, 'r') as f: + # These are fields which have one element per record, so the + # arrays are small enough to be loaded completely into memory + unshared_single_elements = dict() for field in self.format.unshared_fields(): if field in self.format.single_element_types(): - unshared_single_elements[field] = f[field] + unshared_single_elements[field] = f[field][:] - with h5py.File(self.infile_name, 'r') as f: - records = sorted(list(f.keys())) - first_rec = f[records[0]] - sqn_timestamps_array = first_rec.attrs['sqn_timestamps']\ - .decode('utf-8') - for record_num, seq_timestamp in enumerate(sqn_timestamps_array): - # format dictionary key in the same way it is done - # in datawrite on site - seq_datetime = datetime.utcfromtimestamp(seq_timestamp[0]) - epoch = datetime.utcfromtimestamp(0) - key = str(int((seq_datetime - epoch).total_seconds() * 1000)) - - # Make this fresh every time, to reduce memory footprint - record_dict = dict() - - # Copy over the shared fields - for k, v in shared_fields_dict.items(): - record_dict[k] = v - - # populate site specific fields using given functions - # that take both the arrays data and the record number - with h5py.File(self.infile_name, 'r') as f: + sqn_timestamps_array = f['sqn_timestamps'][:] + + for record_num, seq_timestamp in enumerate(sqn_timestamps_array): + # format dictionary key in the same way it is done + # in datawrite on site + seq_datetime = datetime.utcfromtimestamp(seq_timestamp[0]) + epoch = datetime.utcfromtimestamp(0) + key = str(int((seq_datetime - epoch).total_seconds() * 1000)) + + # Make this fresh every time, to reduce memory footprint + record_dict = dict() + + # Copy over the shared fields + for k, v in shared_fields_dict.items(): + record_dict[k] = v + + # populate site specific fields using given functions + # that take both the arrays data and the record number for field in self.format.site_specific_fields(): record_dict[field] = \ self.format.site_specific_fields_generate( )[field](f, record_num) - for field in self.format.unshared_fields(): - if field in self.format.single_element_types(): - datatype = self.format.single_element_types()[field] - # field is not an array, single element per record. - # unshared_field_dims_site should give empty list. - record_dict[field] = \ - datatype(unshared_single_elements[field][ - record_num]) - else: # field in array_dtypes - # need to get the dims correct, - # not always equal to the max - field_flag = False - with h5py.File(self.infile_name, 'r') as f: + for field in self.format.unshared_fields(): + if field in self.format.single_element_types(): + datatype = self.format.single_element_types()[field] + # field is not an array, single element per record. + # unshared_field_dims_site should give empty list. 
+ record_dict[field] = \ + datatype(unshared_single_elements[field][ + record_num]) + else: # field in array_dtypes + # need to get the dims correct, not always equal to the max + field_flag = False site_dims = [dimension_function(f, record_num) for dimension_function in self.format.unshared_fields_dims_site( @@ -281,17 +284,16 @@ def _array_to_site_restructure(self): index_slice = tuple(index_slice) # If there was an incorrect dimension (-1 in dims), then use deepdish to extract the field if field_flag: - with h5py.File(self.infile_name) as f: - record_dict[field] = f[field][index_slice] + record_dict[field] = f[field][index_slice] else: record_dict[field] = f[field][index_slice] - # Wrap in another dict to use the format method - record_dict = OrderedDict({key: record_dict}) - record_dict = self.format.flatten_site_arrays(record_dict) + # Wrap in another dict to use the format method + record_dict = OrderedDict({key: record_dict}) + record_dict = self.format.flatten_site_arrays(record_dict) - # Write the single record to file - self._write_borealis_record(record_dict, key, attribute_types, - dataset_types) + # Write the single record to file + self._write_borealis_record(record_dict, key, attribute_types, + dataset_types) except Exception as err: raise borealis_exceptions.BorealisRestructureError( 'Records for {}: Error restructuring {} from array to site ' @@ -488,23 +490,19 @@ def _write_borealis_record(self, record: dict, record_name: str, -------- BorealisUtilities """ - Path(self.outfile_name).touch() BorealisUtilities.check_records(self.infile_name, record, attribute_types, dataset_types) - # use external h5copy utility to move new record into 2hr file. - - warnings.filterwarnings("ignore") - # Must use temporary file to append to a file; writing entire - # dictionary at once also doesn't work so this is required. 
- tmp_filename = self.outfile_name + '.tmp' - Path(tmp_filename).touch() - - with h5py.File(tmp_filename, 'w') as f: - f.create_dataset(record[record_name], compression=self.compression) - - cp_cmd = 'h5copy -i {newfile} -o {full_file} -s / -d {dtstr}' - cmd = cp_cmd.format(newfile=tmp_filename, full_file=self.outfile_name, - dtstr=record_name) - sp.run(cmd.split()) - os.remove(tmp_filename) + with h5py.File(self.outfile_name, 'a') as f: + for group_name, rec in record.items(): + group = f.create_group(group_name) + for k, v in rec.items(): + if k in attribute_types: + group.attrs[k] = v + elif v.dtype.type == np.str_: + itemsize = v.dtype.itemsize // 4 # every character is 4 bytes + dset = group.create_dataset(k, data=v.view(dtype=(np.uint8)), compression=self.compression) + dset.attrs['strtype'] = b'unicode' + dset.attrs['itemsize'] = itemsize + else: + group.create_dataset(k, data=v, compression=self.compression) diff --git a/pydarnio/borealis/borealis_site.py b/pydarnio/borealis/borealis_site.py index 6470ad2..2ce4ddd 100644 --- a/pydarnio/borealis/borealis_site.py +++ b/pydarnio/borealis/borealis_site.py @@ -40,9 +40,9 @@ import os import subprocess as sp import warnings +import numpy as np from collections import OrderedDict -from pathlib2 import Path from typing import Union from pydarnio import borealis_exceptions, borealis_formats @@ -125,11 +125,10 @@ def __init__(self, filename: str, borealis_filetype: str): try: with h5py.File(self.filename, 'r') as f: - records = sorted(list(f.keys())) - first_rec = f[records[0]] + first_rec = f[self._record_names[0]] full_version = first_rec.attrs['borealis_git_hash'].decode('utf-8').split('-')[0] version = '.'.join(full_version.split('.')[:2]) # vX.Y, ignore patch revision - except (IndexError, ValueError) as err: + except (IndexError, KeyError) as err: # if this is an array style file, it will raise # IndexError on the array. raise borealis_exceptions.BorealisStructureError( @@ -491,9 +490,9 @@ def _write_borealis_records(self, attribute_types: dict, Parameters ---------- - attributes_type_dict: dict + attributes_type: dict Dictionary with the required types for the attributes in the file. - datasets_type_dict: dict + datasets_type: dict Dictionary with the require dtypes for the numpy arrays in the file. @@ -510,23 +509,19 @@ def _write_borealis_records(self, attribute_types: dict, -------- BorealisUtilities """ - Path(self.filename).touch() BorealisUtilities.check_records(self.filename, self.records, attribute_types, dataset_types) - # use external h5copy utility to move new record into 2hr file. - - warnings.filterwarnings("ignore") - # Must use temporary file to append to a file; writing entire - # dictionary at once also doesn't work so this is required. 
- tmp_filename = self.filename + '.tmp' - Path(tmp_filename).touch() - for group_name, group_dict in self.records.items(): - with h5py.File(tmp_filename, 'w') as f: - f.create_dataset(str(group_name), data=group_dict, - compression=self.compression) - cp_cmd = 'h5copy -i {newfile} -o {full_file} -s {dtstr} -d {dtstr}' - cmd = cp_cmd.format(newfile=tmp_filename, full_file=self.filename, - dtstr='/'+str(group_name)) - sp.call(cmd.split()) - os.remove(tmp_filename) + with h5py.File(self.filename, 'w') as f: + for group_name, group_dict in self.records.items(): + group = f.create_group(str(group_name)) + for k, v in group_dict.items(): + if k in attribute_types.keys(): + group.attrs[k] = v + elif v.dtype.type == np.str_: + itemsize = v.dtype.itemsize // 4 # every character is 4 bytes + dset = group.create_dataset(k, data=v.view(dtype=(np.uint8)), compression=self.compression) + dset.attrs['strtype'] = b'unicode' + dset.attrs['itemsize'] = itemsize + else: + group.create_dataset(k, data=v, compression=self.compression) diff --git a/pydarnio/borealis/borealis_utilities.py b/pydarnio/borealis/borealis_utilities.py index bc2e636..bc7e8fc 100644 --- a/pydarnio/borealis/borealis_utilities.py +++ b/pydarnio/borealis/borealis_utilities.py @@ -562,7 +562,7 @@ def get_borealis_version(filename: str, record_names, structure: str): """ if structure == 'array': try: - with h5py.File(self.filename, 'r') as f: + with h5py.File(filename, 'r') as f: borealis_git_hash = f.attrs['borealis_git_hash'].decode('utf-8') except KeyError as err: raise borealis_exceptions.BorealisStructureError( From 8645560d703d7b5730250a9d3a2fd8984ae34e32 Mon Sep 17 00:00:00 2001 From: Remington Rohel Date: Wed, 22 Mar 2023 15:44:50 +0000 Subject: [PATCH 09/18] Moved writing of records and arrays into base_format.py * Changed the method signature a bit for conciseness * Both methods open HDF5 file with 'a' permission (read/write if exists, create otherwise). This will raise an exception if the file already has groups/datasets with the same name, which I think is worthy of erroring on. * Changed borealis_restructure.py to also use the format writing methods. --- pydarnio/borealis/base_format.py | 77 ++++++++++++++++++++++- pydarnio/borealis/borealis_array.py | 66 ++++--------------- pydarnio/borealis/borealis_restructure.py | 75 +++------------------- pydarnio/borealis/borealis_site.py | 57 ++++------------- 4 files changed, 107 insertions(+), 168 deletions(-) diff --git a/pydarnio/borealis/base_format.py b/pydarnio/borealis/base_format.py index b5d23bb..d131510 100644 --- a/pydarnio/borealis/base_format.py +++ b/pydarnio/borealis/base_format.py @@ -1230,7 +1230,7 @@ class methods used inside this method should be specific return timestamp_dict @classmethod - def _read_borealis_records(cls, filename: str) -> OrderedDict: + def read_records(cls, filename: str) -> OrderedDict: """ Base function for reading in a Borealis site file. @@ -1287,7 +1287,7 @@ class methods used inside this method should be specific return records @classmethod - def _read_borealis_arrays(cls, filename: str) -> OrderedDict: + def read_arrays(cls, filename: str) -> OrderedDict: """ Base function for reading in a Borealis array file. @@ -1339,6 +1339,79 @@ class methods used inside this method should be specific return arrays + @classmethod + def write_records(cls, filename: str, records: OrderedDict, attribute_types: dict, + dataset_types: dict, compression: str): + """ + Write the file in site style after checking records. 
From 8645560d703d7b5730250a9d3a2fd8984ae34e32 Mon Sep 17 00:00:00 2001
From: Remington Rohel
Date: Wed, 22 Mar 2023 15:44:50 +0000
Subject: [PATCH 09/18] Moved writing of records and arrays into base_format.py

* Changed the method signature a bit for conciseness
* Both methods open the HDF5 file with 'a' permission (read/write if exists,
  create otherwise). This will raise an exception if the file already has
  groups/datasets with the same name, which I think is worth erroring on.
* Changed borealis_restructure.py to also use the format writing methods.
---
 pydarnio/borealis/base_format.py          | 77 ++++++++++++++++++++++-
 pydarnio/borealis/borealis_array.py       | 66 ++++-----------------
 pydarnio/borealis/borealis_restructure.py | 75 +++-------------------
 pydarnio/borealis/borealis_site.py        | 57 ++++---------------
 4 files changed, 107 insertions(+), 168 deletions(-)

diff --git a/pydarnio/borealis/base_format.py b/pydarnio/borealis/base_format.py
index b5d23bb..d131510 100644
--- a/pydarnio/borealis/base_format.py
+++ b/pydarnio/borealis/base_format.py
@@ -1230,7 +1230,7 @@ class methods used inside this method should be specific
         return timestamp_dict
 
     @classmethod
-    def _read_borealis_records(cls, filename: str) -> OrderedDict:
+    def read_records(cls, filename: str) -> OrderedDict:
         """
         Base function for reading in a Borealis site file.
 
@@ -1287,7 +1287,7 @@ class methods used inside this method should be specific
         return records
 
     @classmethod
-    def _read_borealis_arrays(cls, filename: str) -> OrderedDict:
+    def read_arrays(cls, filename: str) -> OrderedDict:
         """
         Base function for reading in a Borealis array file.
 
@@ -1339,6 +1339,79 @@ class methods used inside this method should be specific
 
         return arrays
 
+    @classmethod
+    def write_records(cls, filename: str, records: OrderedDict, attribute_types: dict,
+                      dataset_types: dict, compression: str):
+        """
+        Write the file in site style after checking records.
+
+        Several Borealis field checks are done to insure the integrity of the
+        file.
+
+        Parameters
+        ----------
+        filename: str
+            Name of the file to write to.
+        records: OrderedDict
+            Dictionary containing site-formatted fields to write to file.
+        attribute_types: dict
+            Dictionary with the required types for the attributes in the file.
+        dataset_types: dict
+            Dictionary with the required dtypes for the numpy arrays in the
+            file.
+        compression: str
+            Type of compression to use for the HDF5 file.
+        """
+        with h5py.File(filename, 'a') as f:
+            for group_name, group_dict in records.items():
+                group = f.create_group(str(group_name))
+                for k, v in group_dict.items():
+                    if k in attribute_types.keys():
+                        group.attrs[k] = v
+                    elif v.dtype.type == np.str_:
+                        itemsize = v.dtype.itemsize // 4  # every character is 4 bytes
+                        dset = group.create_dataset(k, data=v.view(dtype=(np.uint8)), compression=compression)
+                        dset.attrs['strtype'] = b'unicode'
+                        dset.attrs['itemsize'] = itemsize
+                    else:
+                        group.create_dataset(k, data=v, compression=compression)
+
+    @classmethod
+    def write_arrays(cls, filename: str, arrays: OrderedDict, attribute_types: dict,
+                     dataset_types: dict, unshared_fields: List[str], compression: str):
+        """
+        Write arrays to file while checking all data fields.
+
+        Parameters
+        ----------
+        filename: str
+            Name of the file to write to.
+        arrays: OrderedDict
+            Dictionary containing array-formatted fields to write to file.
+        attribute_types: dict
+            Dictionary with the required types for the attributes in the file.
+        dataset_types: dict
+            Dictionary with the required dtypes for the numpy arrays in the
+            file.
+        unshared_fields: List[str]
+            List of fields that are not shared between the records and
+            therefore should be an array with first dimension = number of
+            records
+        compression: str
+            Type of compression to use for the HDF5 file.
+        """
+        with h5py.File(filename, 'a') as f:
+            for k, v in arrays.items():
+                if k in attribute_types:
+                    f.attrs[k] = v
+                elif v.dtype.type == np.str_:
+                    itemsize = v.dtype.itemsize // 4  # every character is 4 bytes
+                    dset = f.create_dataset(k, data=v.view(dtype=(np.uint8)), compression=compression)
+                    dset.attrs['strtype'] = b'unicode'
+                    dset.attrs['itemsize'] = itemsize
+                else:
+                    f.create_dataset(k, data=v, compression=compression)
+
     # STATIC METHODS COMMON ACROSS FORMATS
     # i.e. common methods that can be used by multiple formats in restructuring
     # (generally these will be used in the unshared fields dims for arrays)
diff --git a/pydarnio/borealis/borealis_array.py b/pydarnio/borealis/borealis_array.py
index 5f193ae..e6665b6 100644
--- a/pydarnio/borealis/borealis_array.py
+++ b/pydarnio/borealis/borealis_array.py
@@ -244,7 +244,7 @@ def read_file(self) -> dict:
         dataset_types = self.format.array_array_dtypes()
         unshared_fields = self.format.unshared_fields()
 
-        arrays = self.format._read_borealis_arrays(self.filename)
+        arrays = self.format.read_arrays(self.filename)
         BorealisUtilities.check_arrays(self.filename, arrays,
                                        attribute_types, dataset_types,
                                        unshared_fields)
@@ -432,7 +432,14 @@ def write_file(self) -> str:
 
         Raises
         ------
-        BorealisFileTypeError
+        BorealisFieldMissingError - when a field is missing from the Borealis
+            file
+        BorealisExtraFieldError - when an extra field is present in the
+            Borealis file
+        BorealisDataFormatTypeError - when a field has the incorrect
+            field type for the Borealis file
+        BorealisNumberOfRecordsError - when the number of records cannot
+            be discerned from the arrays
 
         See Also
         --------
@@ -446,55 +453,8 @@ def write_file(self) -> str:
         attribute_types = self.format.array_single_element_types()
         dataset_types = self.format.array_array_dtypes()
         unshared_fields = self.format.unshared_fields()
-
-        self._write_borealis_arrays(attribute_types, dataset_types,
-                                    unshared_fields)
+        BorealisUtilities.check_arrays(self.filename, self.arrays, attribute_types,
+                                       dataset_types, unshared_fields)
+        self.format.write_arrays(self.filename, self.arrays, attribute_types,
+                                 dataset_types, unshared_fields, self.compression)
         return self.filename
-
-    def _write_borealis_arrays(self, attribute_types: dict,
-                               dataset_types: dict,
-                               unshared_fields: List[str]):
-        """
-        Write the entire file while checking all data fields.
-
-        Parameters
-        ----------
-        attribute_types: dict
-            Dictionary with the required types for the attributes in the file.
-        dataset_types: dict
-            Dictionary with the require dtypes for the numpy arrays in the
-            file.
-        unshared_fields: List[str]
-            List of fields that are not shared between the records and
-            therefore should be an array with first dimension = number of
-            records
-
-        Raises
-        ------
-        BorealisFieldMissingError - when a field is missing from the Borealis
-            file
-        BorealisExtraFieldError - when an extra field is present in the
-            Borealis file
-        BorealisDataFormatTypeError - when a field has the incorrect
-            field type for the Borealis file
-        BorealisNumberOfRecordsError - when the number of records cannot
-            be discerned from the arrays
-
-        See Also
-        --------
-        BorealisUtilities
-        """
-        BorealisUtilities.check_arrays(self.filename, self.arrays,
-                                       attribute_types, dataset_types,
-                                       unshared_fields)
-        with h5py.File(self.filename, 'w') as f:
-            for k, v in self.arrays.items():
-                if k in attribute_types:
-                    f.attrs[k] = v
-                elif v.dtype.type == np.str_:
-                    itemsize = v.dtype.itemsize // 4  # every character is 4 bytes
-                    dset = f.create_dataset(k, data=v.view(dtype=(np.uint8)), compression=self.compression)
-                    dset.attrs['strtype'] = b'unicode'
-                    dset.attrs['itemsize'] = itemsize
-                else:
-                    f.create_dataset(k, data=v, compression=self.compression)
diff --git a/pydarnio/borealis/borealis_restructure.py b/pydarnio/borealis/borealis_restructure.py
index 53a6303..7514bfe 100755
--- a/pydarnio/borealis/borealis_restructure.py
+++ b/pydarnio/borealis/borealis_restructure.py
@@ -282,18 +282,16 @@ def _array_to_site_restructure(self):
                         index_slice = [slice(0, i) for i in site_dims if i != -1]
                         index_slice.insert(0, record_num)
                         index_slice = tuple(index_slice)
-                        # If there was an incorrect dimension (-1 in dims), then use deepdish to extract the field
-                        if field_flag:
-                            record_dict[field] = f[field][index_slice]
-                        else:
-                            record_dict[field] = f[field][index_slice]
+                        record_dict[field] = f[field][index_slice]
+                # Wrap in another dict to use the format method
                 record_dict = OrderedDict({key: record_dict})
                 record_dict = self.format.flatten_site_arrays(record_dict)
+                BorealisUtilities.check_records(self.infile_name, record_dict, attribute_types, dataset_types)
                 # Write the single record to file
-                self._write_borealis_record(record_dict, key, attribute_types,
-                                            dataset_types)
+                self.format.write_records(self.outfile_name, record_dict, attribute_types, dataset_types,
+                                          self.compression)
             except Exception as err:
                 raise borealis_exceptions.BorealisRestructureError(
                     'Records for {}: Error restructuring {} from array to site '
                     'style: {}'.format(self.infile_name, self.format.__name__, err)
                 ) from err
@@ -439,70 +437,13 @@ def _site_to_array_restructure(self):
             attribute_types = self.format.array_single_element_types()
             dataset_types = self.format.array_array_dtypes()
             unshared_fields = self.format.unshared_fields()
-            BorealisUtilities.check_arrays(self.infile_name, new_data_dict,
-                                           attribute_types, dataset_types,
+            BorealisUtilities.check_arrays(self.infile_name, new_data_dict, attribute_types, dataset_types,
                                            unshared_fields)
-            with h5py.File(self.outfile_name, 'w') as f:
-                for k, v in new_data_dict.items():
-                    if k in attribute_types:
-                        f.attrs[k] = v
-                    elif v.dtype.type == np.str_:
-                        itemsize = v.dtype.itemsize // 4  # every character is 4 bytes
-                        dset = f.create_dataset(k, data=v.view(dtype=(np.uint8)), compression=self.compression)
-                        dset.attrs['strtype'] = b'unicode'
-                        dset.attrs['itemsize'] = itemsize
-                    else:
-                        f.create_dataset(k, data=v, compression=self.compression)
+            self.format.write_arrays(self.outfile_name, new_data_dict, attribute_types, dataset_types, unshared_fields,
+                                     self.compression)
         except TypeError as err:
             raise borealis_exceptions.BorealisRestructureError(
                 'Records for {}: Error restructuring {} from site to array '
                 'style: {}'.format(self.infile_name, self.format.__name__, err)
             ) from err
-
-    def _write_borealis_record(self, record: dict, record_name: str,
-                               attribute_types: dict, dataset_types: dict):
-        """
-        Add a record to the output file in site style after checking the record.
-
-        Several Borealis field checks are done to insure the integrity of the
-        record.
-
-        Parameters
-        ----------
-        record: dict
-            Dictionary containing the site-structured record.
-        record_name: str
-            Group name of the record for the HDF5 hierarchy.
-        attribute_types: dict
-            Dictionary with the required types for the attributes in the file.
-        dataset_types: dict
-            Dictionary with the required dtypes for the numpy arrays in the
-            file.
-
-        Raises
-        ------
-        BorealisFieldMissingError
-        BorealisExtraFieldError
-        BorealisDataFormatTypeError
-
-        See Also
-        --------
-        BorealisUtilities
-        """
-        BorealisUtilities.check_records(self.infile_name, record,
-                                        attribute_types, dataset_types)
-
-        with h5py.File(self.outfile_name, 'a') as f:
-            for group_name, rec in record.items():
-                group = f.create_group(group_name)
-                for k, v in rec.items():
-                    if k in attribute_types:
-                        group.attrs[k] = v
-                    elif v.dtype.type == np.str_:
-                        itemsize = v.dtype.itemsize // 4  # every character is 4 bytes
-                        dset = group.create_dataset(k, data=v.view(dtype=(np.uint8)), compression=self.compression)
-                        dset.attrs['strtype'] = b'unicode'
-                        dset.attrs['itemsize'] = itemsize
-                    else:
-                        group.create_dataset(k, data=v, compression=self.compression)
diff --git a/pydarnio/borealis/borealis_site.py b/pydarnio/borealis/borealis_site.py
index 2ce4ddd..473ae4a 100644
--- a/pydarnio/borealis/borealis_site.py
+++ b/pydarnio/borealis/borealis_site.py
@@ -267,7 +267,7 @@ def read_file(self) -> dict:
         attribute_types = self.format.site_single_element_types()
         dataset_types = self.format.site_array_dtypes()
 
-        records = self.format._read_borealis_records(self.filename)
+        records = self.format.read_records(self.filename)
         BorealisUtilities.check_records(self.filename, records,
                                         attribute_types, dataset_types)
 
@@ -467,34 +467,8 @@ def write_file(self) -> str:
 
         Returns
         -------
-        filename
+        filename: str
             The filename written to.
-        """
-        pyDARNio_log.info("Writing Borealis {} {} file: {}"
-                          "".format(self.software_version,
-                                    self.borealis_filetype, self.filename))
-
-        attribute_types = self.format.site_single_element_types()
-        dataset_types = self.format.site_array_dtypes()
-
-        self._write_borealis_records(attribute_types, dataset_types)
-        return self.filename
-
-    def _write_borealis_records(self, attribute_types: dict,
-                                dataset_types: dict):
-        """
-        Write the file in site style after checking records.
-
-        Several Borealis field checks are done to insure the integrity of the
-        file.
-
-        Parameters
-        ----------
-        attributes_type: dict
-            Dictionary with the required types for the attributes in the file.
-        datasets_type: dict
-            Dictionary with the require dtypes for the numpy arrays in the
-            file.
Raises ------ @@ -504,24 +478,15 @@ def _write_borealis_records(self, attribute_types: dict, Borealis file/stream type BorealisDataFormatTypeError - when a field has the incorrect field type for the Borealis file/stream type - - See Also - -------- - BorealisUtilities """ + pyDARNio_log.info("Writing Borealis {} {} file: {}" + "".format(self.software_version, + self.borealis_filetype, self.filename)) + + attribute_types = self.format.site_single_element_types() + dataset_types = self.format.site_array_dtypes() BorealisUtilities.check_records(self.filename, self.records, attribute_types, dataset_types) - - with h5py.File(self.filename, 'w') as f: - for group_name, group_dict in self.records.items(): - group = f.create_group(str(group_name)) - for k, v in group_dict.items(): - if k in attribute_types.keys(): - group.attrs[k] = v - elif v.dtype.type == np.str_: - itemsize = v.dtype.itemsize // 4 # every character is 4 bytes - dset = group.create_dataset(k, data=v.view(dtype=(np.uint8)), compression=self.compression) - dset.attrs['strtype'] = b'unicode' - dset.attrs['itemsize'] = itemsize - else: - group.create_dataset(k, data=v, compression=self.compression) + self.format.write_records(self.filename, self.records, attribute_types, + dataset_types, self.compression) + return self.filename From d4fb227b49610507ba5170701dd7485944558a04 Mon Sep 17 00:00:00 2001 From: Remington Rohel Date: Wed, 22 Mar 2023 16:28:35 +0000 Subject: [PATCH 10/18] Handle empty attributes * experiment_comment and slice_comment fields are sometimes empty, so we need to handle them in order to convert to DMAP --- pydarnio/borealis/base_format.py | 40 +++++++++++++++++++++----------- 1 file changed, 26 insertions(+), 14 deletions(-) diff --git a/pydarnio/borealis/base_format.py b/pydarnio/borealis/base_format.py index d131510..494496f 100644 --- a/pydarnio/borealis/base_format.py +++ b/pydarnio/borealis/base_format.py @@ -1273,13 +1273,20 @@ class methods used inside this method should be specific rec_dict[dset_name] = data # Get the attributes (scalar fields) - attribute_dict = {k: v for k, v in group.attrs.items()} - attribute_dict.pop('CLASS') # Inherent to HDF5 file - attribute_dict.pop('TITLE') # Inherent to HDF5 file - attribute_dict.pop('VERSION') # Inherent to HDF5 file - for k, v in attribute_dict.items(): - if isinstance(v, bytes): + attribute_dict = {} + for k, v in group.attrs.items(): + if k in ['CLASS', 'TITLE', 'VERSION']: + continue + elif isinstance(v, bytes): attribute_dict[k] = v.tobytes().decode('utf-8') + elif isinstance(v, h5py.Empty): + dtype = v.dtype.type + data = dtype() + if isinstance(data, bytes): + data = data.decode('utf-8') + attribute_dict[k] = data + else: + attribute_dict[k] = v rec_dict.update(attribute_dict) records[rec_key] = rec_dict @@ -1326,15 +1333,20 @@ class methods used inside this method should be specific arrays[array_name] = data # Get the attributes (scalar fields) - attribute_dict = {k: v for k, v in f.attrs.items()} - attribute_dict.pop('CLASS') # Inherent to HDF5 file - attribute_dict.pop('TITLE') # Inherent to HDF5 file - attribute_dict.pop('VERSION') # Inherent to HDF5 file - attribute_dict.pop('DEEPDISH_IO_VERSION') # Inherent to HDF5 file - attribute_dict.pop('PYTABLES_FORMAT_VERSION') # Inherent to HDF5 file - for k, v in attribute_dict.items(): - if isinstance(v, bytes): + attribute_dict = {} + for k, v in f.attrs.items(): + if k in ['CLASS', 'TITLE', 'VERSION', 'DEEPDISH_IO_VERSION', 'PYTABLES_FORMAT_VERSION']: + continue + elif isinstance(v, bytes): 
From d4fb227b49610507ba5170701dd7485944558a04 Mon Sep 17 00:00:00 2001
From: Remington Rohel
Date: Wed, 22 Mar 2023 16:28:35 +0000
Subject: [PATCH 10/18] Handle empty attributes

* experiment_comment and slice_comment fields are sometimes empty, so we
  need to handle them in order to convert to DMAP
---
 pydarnio/borealis/base_format.py | 40 +++++++++++++++++++++-----------
 1 file changed, 26 insertions(+), 14 deletions(-)

diff --git a/pydarnio/borealis/base_format.py b/pydarnio/borealis/base_format.py
index d131510..494496f 100644
--- a/pydarnio/borealis/base_format.py
+++ b/pydarnio/borealis/base_format.py
@@ -1273,13 +1273,20 @@ class methods used inside this method should be specific
                     rec_dict[dset_name] = data
 
                 # Get the attributes (scalar fields)
-                attribute_dict = {k: v for k, v in group.attrs.items()}
-                attribute_dict.pop('CLASS')    # Inherent to HDF5 file
-                attribute_dict.pop('TITLE')    # Inherent to HDF5 file
-                attribute_dict.pop('VERSION')  # Inherent to HDF5 file
-                for k, v in attribute_dict.items():
-                    if isinstance(v, bytes):
+                attribute_dict = {}
+                for k, v in group.attrs.items():
+                    if k in ['CLASS', 'TITLE', 'VERSION']:
+                        continue
+                    elif isinstance(v, bytes):
                         attribute_dict[k] = v.tobytes().decode('utf-8')
+                    elif isinstance(v, h5py.Empty):
+                        dtype = v.dtype.type
+                        data = dtype()
+                        if isinstance(data, bytes):
+                            data = data.decode('utf-8')
+                        attribute_dict[k] = data
+                    else:
+                        attribute_dict[k] = v
                 rec_dict.update(attribute_dict)
 
                 records[rec_key] = rec_dict
@@ -1326,15 +1333,20 @@ class methods used inside this method should be specific
             arrays[array_name] = data
 
         # Get the attributes (scalar fields)
-        attribute_dict = {k: v for k, v in f.attrs.items()}
-        attribute_dict.pop('CLASS')                    # Inherent to HDF5 file
-        attribute_dict.pop('TITLE')                    # Inherent to HDF5 file
-        attribute_dict.pop('VERSION')                  # Inherent to HDF5 file
-        attribute_dict.pop('DEEPDISH_IO_VERSION')      # Inherent to HDF5 file
-        attribute_dict.pop('PYTABLES_FORMAT_VERSION')  # Inherent to HDF5 file
-        for k, v in attribute_dict.items():
-            if isinstance(v, bytes):
+        attribute_dict = {}
+        for k, v in f.attrs.items():
+            if k in ['CLASS', 'TITLE', 'VERSION', 'DEEPDISH_IO_VERSION', 'PYTABLES_FORMAT_VERSION']:
+                continue
+            elif isinstance(v, bytes):
                 attribute_dict[k] = v.tobytes().decode('utf-8')
+            elif isinstance(v, h5py.Empty):
+                dtype = v.dtype.type
+                data = dtype()
+                if isinstance(data, bytes):
+                    data = data.decode('utf-8')
+                attribute_dict[k] = data
+            else:
+                attribute_dict[k] = v
         arrays.update(attribute_dict)
 
         return arrays
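
h5py hands back an empty HDF5 attribute as an h5py.Empty instance rather than a value, which is why the reader above substitutes the dtype's default. A standalone sketch of that substitution (file name invented; experiment_comment is one of the fields the commit message calls out):

    import h5py
    import numpy as np

    with h5py.File('empty_attrs.hdf5', 'w') as f:
        f.attrs['experiment_comment'] = h5py.Empty(np.dtype('S1'))  # attribute with no value

    with h5py.File('empty_attrs.hdf5', 'r') as f:
        v = f.attrs['experiment_comment']
        if isinstance(v, h5py.Empty):
            data = v.dtype.type()            # default value for the dtype, b'' here
            if isinstance(data, bytes):
                data = data.decode('utf-8')  # -> '' for string attributes
        print(repr(data))                    # ''
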
From ad699f1a38a9a3092cfac927e949a3f15dc8efc4 Mon Sep 17 00:00:00 2001
From: Remington Rohel
Date: Wed, 22 Mar 2023 19:26:56 +0000
Subject: [PATCH 11/18] Fixed conversion to DMAP.

* Had to deal more carefully with empty string attributes, and saving
  string attributes to file.
---
 pydarnio/borealis/base_format.py          | 10 ++++++++--
 pydarnio/borealis/borealis_restructure.py |  2 +-
 2 files changed, 9 insertions(+), 3 deletions(-)

diff --git a/pydarnio/borealis/base_format.py b/pydarnio/borealis/base_format.py
index 494496f..1042b77 100644
--- a/pydarnio/borealis/base_format.py
+++ b/pydarnio/borealis/base_format.py
@@ -1379,7 +1379,10 @@ def write_records(cls, filename: str, records: OrderedDict, attribute_types: dic
                 group = f.create_group(str(group_name))
                 for k, v in group_dict.items():
                     if k in attribute_types.keys():
-                        group.attrs[k] = v
+                        if isinstance(v, str):
+                            group.attrs[k] = np.bytes_(v)
+                        else:
+                            group.attrs[k] = v
                     elif v.dtype.type == np.str_:
                         itemsize = v.dtype.itemsize // 4  # every character is 4 bytes
                         dset = group.create_dataset(k, data=v.view(dtype=(np.uint8)), compression=compression)
@@ -1415,7 +1418,10 @@ def write_arrays(cls, filename: str, arrays: OrderedDict, attribute_types: dict,
         with h5py.File(filename, 'a') as f:
             for k, v in arrays.items():
                 if k in attribute_types:
-                    f.attrs[k] = v
+                    if isinstance(v, str):
+                        f.attrs[k] = np.bytes_(v)
+                    else:
+                        f.attrs[k] = v
                 elif v.dtype.type == np.str_:
                     itemsize = v.dtype.itemsize // 4  # every character is 4 bytes
                     dset = f.create_dataset(k, data=v.view(dtype=(np.uint8)), compression=compression)
diff --git a/pydarnio/borealis/borealis_restructure.py b/pydarnio/borealis/borealis_restructure.py
index 7514bfe..a3cfc3c 100755
--- a/pydarnio/borealis/borealis_restructure.py
+++ b/pydarnio/borealis/borealis_restructure.py
@@ -214,7 +214,7 @@ def _array_to_site_restructure(self):
                     if field in attribute_types:
                         data = f.attrs[field]
                         if isinstance(data, bytes):
-                            data = str(data)
+                            data = data.decode('utf-8')
                     elif field in self.format.array_string_fields():
                         dset = f[field]
                         itemsize = dset.attrs['itemsize']

From 8762d226d3a02b1b7232faa65fda895b0398d737 Mon Sep 17 00:00:00 2001
From: RemingtonRohel <77300402+RemingtonRohel@users.noreply.github.com>
Date: Thu, 23 Mar 2023 14:13:58 +0000
Subject: [PATCH 12/18] Remove deepdish dependency from setup.py

Co-authored-by: Adam Lozinsky
---
 setup.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/setup.py b/setup.py
index 1af436b..7f1901d 100644
--- a/setup.py
+++ b/setup.py
@@ -49,7 +49,7 @@
     author="SuperDARN",
     include_package_data=True,
     setup_requires=['pyyaml', 'numpy',
-                    'h5py>=3.3.0', 'deepdish', 'pathlib2'],
+                    'h5py>=3.3.0', 'pathlib2'],
     # pyyaml library install
     install_requires=['pyyaml', 'numpy',
                       'h5py>=3.3.0', 'deepdish', 'pathlib2']

From 3787aa9ea406a428b5d2fed62702f3f3e97a8973 Mon Sep 17 00:00:00 2001
From: RemingtonRohel <77300402+RemingtonRohel@users.noreply.github.com>
Date: Thu, 23 Mar 2023 14:14:37 +0000
Subject: [PATCH 13/18] Minor typo in docstring

Co-authored-by: Theodore Kolkman <90067549+tjk584@users.noreply.github.com>
---
 pydarnio/borealis/base_format.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pydarnio/borealis/base_format.py b/pydarnio/borealis/base_format.py
index 1042b77..8f4a2a5 100644
--- a/pydarnio/borealis/base_format.py
+++ b/pydarnio/borealis/base_format.py
@@ -1357,7 +1357,7 @@ def write_records(cls, filename: str, records: OrderedDict, attribute_types: dic
         """
         Write the file in site style after checking records.
 
-        Several Borealis field checks are done to insure the integrity of the
+        Several Borealis field checks are done to ensure the integrity of the
         file.
 
         Parameters

From 7f0ddd2f233221992df25cd07b79c4b9bc407953 Mon Sep 17 00:00:00 2001
From: RemingtonRohel <77300402+RemingtonRohel@users.noreply.github.com>
Date: Thu, 23 Mar 2023 14:15:26 +0000
Subject: [PATCH 14/18] Remove deepdish import

Co-authored-by: Theodore Kolkman <90067549+tjk584@users.noreply.github.com>
---
 pydarnio/borealis/borealis_array.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/pydarnio/borealis/borealis_array.py b/pydarnio/borealis/borealis_array.py
index e6665b6..8dbcf9c 100644
--- a/pydarnio/borealis/borealis_array.py
+++ b/pydarnio/borealis/borealis_array.py
@@ -37,7 +37,6 @@
 For more information on Borealis data files and how they convert to SDarn
 files, see: https://borealis.readthedocs.io/en/latest/
 """
-import deepdish as dd
 import h5py
 import logging
 import numpy as np

From f6df969417b4036204fed08546a06ae752a8cdb2 Mon Sep 17 00:00:00 2001
From: RemingtonRohel <77300402+RemingtonRohel@users.noreply.github.com>
Date: Thu, 23 Mar 2023 15:54:28 +0000
Subject: [PATCH 15/18] Brevity in type checking

Co-authored-by: Adam Lozinsky
---
 pydarnio/borealis/base_format.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pydarnio/borealis/base_format.py b/pydarnio/borealis/base_format.py
index 8f4a2a5..4244edb 100644
--- a/pydarnio/borealis/base_format.py
+++ b/pydarnio/borealis/base_format.py
@@ -1110,7 +1110,7 @@ class methods used inside this method should be specific
                 # Some indices may not be filled due to dimensions that are maximum values (num_sequences, etc. can change
                 # between records), so they are initialized with a known value first.
                 # Initialize floating-point values to NaN, and integer values to -1.
-                if datatype is np.int64 or datatype is np.uint32 or datatype is np.uint8:
+                if datatype in [np.int64, np.uint32, np.uint8]:
                     empty_array[:] = -1
                 else:
                     empty_array[:] = np.NaN

From eab4a6c8281465d5c181e752220b149c206372ab Mon Sep 17 00:00:00 2001
From: RemingtonRohel <77300402+RemingtonRohel@users.noreply.github.com>
Date: Thu, 23 Mar 2023 15:54:41 +0000
Subject: [PATCH 16/18] Brevity in type checking

Co-authored-by: Adam Lozinsky
---
 pydarnio/borealis/borealis_restructure.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pydarnio/borealis/borealis_restructure.py b/pydarnio/borealis/borealis_restructure.py
index a3cfc3c..67f49aa 100755
--- a/pydarnio/borealis/borealis_restructure.py
+++ b/pydarnio/borealis/borealis_restructure.py
@@ -410,7 +410,7 @@ def _site_to_array_restructure(self):
                         # change between records), so they are initialized
                         # with a known value first. Initialize floating-
                        # point values to NaN, and integer values to -1.
-                        if datatype is np.int64 or datatype is np.uint32 or datatype is np.uint8:
+                        if datatype in [np.int64, np.uint32, np.uint8]:
                             empty_array[:] = -1
                         else:
                             empty_array[:] = np.NaN

From 562c8aaa7a38bef5b3a2b46f23429383ee1000f1 Mon Sep 17 00:00:00 2001
From: RemingtonRohel <77300402+RemingtonRohel@users.noreply.github.com>
Date: Thu, 23 Mar 2023 15:54:56 +0000
Subject: [PATCH 17/18] Brevity in type checking

Co-authored-by: Adam Lozinsky
---
 pydarnio/borealis/borealis_restructure.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pydarnio/borealis/borealis_restructure.py b/pydarnio/borealis/borealis_restructure.py
index 67f49aa..536ea61 100755
--- a/pydarnio/borealis/borealis_restructure.py
+++ b/pydarnio/borealis/borealis_restructure.py
@@ -378,7 +378,7 @@ def _site_to_array_restructure(self):
                     # Initialize array now with correct data type.
                     dtype = self.format.single_element_types()[field]
                     new_data_dict[field] = np.empty(num_records, dtype=dtype)
-                    if dtype is np.int64 or dtype is np.uint32 or dtype is np.uint8:
+                    if dtype in [np.int64, np.uint32, np.uint8]:
                         new_data_dict[field][:] = -1
                     else:
                         new_data_dict[field][:] = np.NaN

From 8a7418f01fc0dbbcdfbd52380848070bd6e22c39 Mon Sep 17 00:00:00 2001
From: Remington Rohel
Date: Thu, 23 Mar 2023 17:02:53 +0000
Subject: [PATCH 18/18] Pop other deepdish / pytables fields when reading site
 files.

---
 pydarnio/borealis/base_format.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pydarnio/borealis/base_format.py b/pydarnio/borealis/base_format.py
index 4244edb..eba9bff 100644
--- a/pydarnio/borealis/base_format.py
+++ b/pydarnio/borealis/base_format.py
@@ -1275,7 +1275,7 @@ class methods used inside this method should be specific
                 # Get the attributes (scalar fields)
                 attribute_dict = {}
                 for k, v in group.attrs.items():
-                    if k in ['CLASS', 'TITLE', 'VERSION']:
+                    if k in ['CLASS', 'TITLE', 'VERSION', 'DEEPDISH_IO_VERSION', 'PYTABLES_FORMAT_VERSION']:
                         continue
                     elif isinstance(v, bytes):
                         attribute_dict[k] = v.tobytes().decode('utf-8')
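
Taken together, the three "Brevity in type checking" commits condense the same sentinel-fill pattern in three places: integer fields mark unfilled entries with -1 and floating-point fields with NaN. A standalone sketch of the pattern under the NumPy 1.x this series targets (the helper name is invented):

    import numpy as np

    def sentinel_filled(num_records, dtype):
        """Return a 1-D array whose entries all read as 'not yet filled'."""
        arr = np.empty(num_records, dtype=dtype)
        if dtype in [np.int64, np.uint32, np.uint8]:
            arr[:] = -1        # for the unsigned dtypes this wraps to the max value
        else:
            arr[:] = np.NaN
        return arr

    print(sentinel_filled(3, np.int64))    # [-1 -1 -1]
    print(sentinel_filled(3, np.float64))  # [nan nan nan]
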