From 36aa7a327885998a0c92bb361a21a2ec832afa7b Mon Sep 17 00:00:00 2001 From: Carley <60905856+carleyjmartin@users.noreply.github.com> Date: Thu, 23 Mar 2023 11:44:49 -0600 Subject: [PATCH] DEP: Deepdish to h5py (#60) Use h5py for all HDF5 file I/O. The deepdish package is no longer maintained, and h5py is managed by the HDF5 group so is not at risk of deprecation. --- .gitignore | 1 + pydarnio/borealis/base_format.py | 207 +++++++++++++++++++++- pydarnio/borealis/borealis_array.py | 106 +++-------- pydarnio/borealis/borealis_convert.py | 1 - pydarnio/borealis/borealis_formats.py | 78 ++++---- pydarnio/borealis/borealis_restructure.py | 194 ++++++++------------ pydarnio/borealis/borealis_site.py | 108 +++-------- pydarnio/borealis/borealis_utilities.py | 26 +-- setup.cfg | 3 +- setup.py | 56 ++++++ 10 files changed, 435 insertions(+), 345 deletions(-) create mode 100644 setup.py diff --git a/.gitignore b/.gitignore index b6e4761..ed783b6 100644 --- a/.gitignore +++ b/.gitignore @@ -2,6 +2,7 @@ __pycache__/ *.py[cod] *$py.class +*.DS_Store # C extensions *.so diff --git a/pydarnio/borealis/base_format.py b/pydarnio/borealis/base_format.py index ca8e97d..eba9bff 100644 --- a/pydarnio/borealis/base_format.py +++ b/pydarnio/borealis/base_format.py @@ -37,12 +37,12 @@ """ import copy +import h5py import numpy as np from collections import OrderedDict from datetime import datetime from typing import Callable, List -import h5py from pydarnio import borealis_exceptions @@ -1102,7 +1102,7 @@ class methods used inside this method should be specific datatype = cls.single_element_types()[field] else: # field in array_dtypes datatype = cls.array_dtypes()[field] - if datatype == np.unicode_: + if datatype == str: # unicode type needs to be explicitly set to have # multiple chars (256) datatype='|U256' @@ -1110,7 +1110,7 @@ class methods used inside this method should be specific # Some indices may not be filled due to dimensions that are maximum values (num_sequences, etc. can change # between records), so they are initialized with a known value first. # Initialize floating-point values to NaN, and integer values to -1. - if datatype is np.int64 or datatype is np.uint32: + if datatype in [np.int64, np.uint32, np.uint8]: empty_array[:] = -1 else: empty_array[:] = np.NaN @@ -1229,6 +1229,207 @@ class methods used inside this method should be specific return timestamp_dict + @classmethod + def read_records(cls, filename: str) -> OrderedDict: + """ + Base function for reading in a Borealis site file. + + Parameters + ---------- + filename: str + Name of the file to load records from + + Returns + ------- + OrderedDict + a dict of timestamped records loaded from an hdf5 Borealis site file + + Raises + ------ + OSError: file does not exist + + Notes + ----- + The results will differ based on the format class, as many of the + class methods used inside this method should be specific + to the format and updated in the child class. 
+ """ + records = OrderedDict() + with h5py.File(filename, 'r') as f: + record_keys = sorted(list(f.keys())) + for rec_key in record_keys: + rec_dict = {} + group = f[rec_key] + + # Get the datasets (vector fields) + datasets = list(group.keys()) + for dset_name in datasets: + dset = group[dset_name] + if 'strtype' in dset.attrs: # string type, requires some handling + itemsize = dset.attrs['itemsize'] + data = dset[:].view(dtype=(np.unicode_, itemsize)) + else: + data = dset[:] # non-string, can simply load + rec_dict[dset_name] = data + + # Get the attributes (scalar fields) + attribute_dict = {} + for k, v in group.attrs.items(): + if k in ['CLASS', 'TITLE', 'VERSION', 'DEEPDISH_IO_VERSION', 'PYTABLES_FORMAT_VERSION']: + continue + elif isinstance(v, bytes): + attribute_dict[k] = v.tobytes().decode('utf-8') + elif isinstance(v, h5py.Empty): + dtype = v.dtype.type + data = dtype() + if isinstance(data, bytes): + data = data.decode('utf-8') + attribute_dict[k] = data + else: + attribute_dict[k] = v + rec_dict.update(attribute_dict) + + records[rec_key] = rec_dict + + return records + + @classmethod + def read_arrays(cls, filename: str) -> OrderedDict: + """ + Base function for reading in a Borealis array file. + + Parameters + ---------- + filename: str + Name of the file to load arrays from + + Returns + ------- + OrderedDict + a dict of arrays loaded from an hdf5 Borealis array file + + Raises + ------ + OSError: file does not exist + + Notes + ----- + The results will differ based on the format class, as many of the + class methods used inside this method should be specific + to the format and updated in the child class. + """ + arrays = OrderedDict() + with h5py.File(filename, 'r') as f: + + # Get the datasets (vector fields) + array_names = sorted(list(f.keys())) + for array_name in array_names: + dset = f[array_name] + if 'strtype' in dset.attrs: # string type, requires some handling + itemsize = dset.attrs['itemsize'] + data = dset[:].view(dtype=(np.unicode_, itemsize)) + else: + data = dset[:] # non-string, can simply load + arrays[array_name] = data + + # Get the attributes (scalar fields) + attribute_dict = {} + for k, v in f.attrs.items(): + if k in ['CLASS', 'TITLE', 'VERSION', 'DEEPDISH_IO_VERSION', 'PYTABLES_FORMAT_VERSION']: + continue + elif isinstance(v, bytes): + attribute_dict[k] = v.tobytes().decode('utf-8') + elif isinstance(v, h5py.Empty): + dtype = v.dtype.type + data = dtype() + if isinstance(data, bytes): + data = data.decode('utf-8') + attribute_dict[k] = data + else: + attribute_dict[k] = v + arrays.update(attribute_dict) + + return arrays + + @classmethod + def write_records(cls, filename: str, records: OrderedDict, attribute_types: dict, + dataset_types: dict, compression: str): + """ + Write the file in site style after checking records. + + Several Borealis field checks are done to ensure the integrity of the + file. + + Parameters + ---------- + filename: str + Name of the file to write to. + records: OrderedDict + Dictionary containing site-formatted fields to write to file. + attribute_types: dict + Dictionary with the required types for the attributes in the file. + dataset_types: dict + Dictionary with the require dtypes for the numpy arrays in the + file. + compression: str + Type of compression to use for the HDF5 file. 
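As an aside on the convention used throughout these readers and writers: h5py cannot store NumPy's fixed-width unicode dtype directly, so string datasets are written as uint8 views tagged with 'strtype' and 'itemsize' attributes and rebuilt with a view on read. A minimal, self-contained sketch of that round trip (hypothetical file and field names, not part of pyDARNio) might look like:

import h5py
import numpy as np

beam_strs = np.array(['east_beam', 'west_beam'])       # fixed-width '<U9' array

with h5py.File('strings_example.h5', 'w') as f:        # hypothetical file name
    itemsize = beam_strs.dtype.itemsize // 4            # every unicode character is 4 bytes
    dset = f.create_dataset('beam_strs', data=beam_strs.view(dtype=np.uint8))
    dset.attrs['strtype'] = b'unicode'                  # marks the dataset for the readers above
    dset.attrs['itemsize'] = itemsize

with h5py.File('strings_example.h5', 'r') as f:
    dset = f['beam_strs']
    restored = dset[:].view(dtype=(np.str_, dset.attrs['itemsize']))  # np.str_ is the np.unicode_ alias used above

assert (restored == beam_strs).all()

The raw bytes go into the dataset, and the two attributes carry just enough information to restore the original dtype on read.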
+ """ + with h5py.File(filename, 'a') as f: + for group_name, group_dict in records.items(): + group = f.create_group(str(group_name)) + for k, v in group_dict.items(): + if k in attribute_types.keys(): + if isinstance(v, str): + group.attrs[k] = np.bytes_(v) + else: + group.attrs[k] = v + elif v.dtype.type == np.str_: + itemsize = v.dtype.itemsize // 4 # every character is 4 bytes + dset = group.create_dataset(k, data=v.view(dtype=(np.uint8)), compression=compression) + dset.attrs['strtype'] = b'unicode' + dset.attrs['itemsize'] = itemsize + else: + group.create_dataset(k, data=v, compression=compression) + + @classmethod + def write_arrays(cls, filename: str, arrays: OrderedDict, attribute_types: dict, + dataset_types: dict, unshared_fields: List[str], compression: str): + """ + Write arrays to file while checking all data fields. + + Parameters + ---------- + filename: str + Name of the file to write to. + arrays: OrderedDict + Dictionary containing array-formatted fields to write to file. + attribute_types: dict + Dictionary with the required types for the attributes in the file. + dataset_types: dict + Dictionary with the require dtypes for the numpy arrays in the + file. + unshared_fields: List[str] + List of fields that are not shared between the records and + therefore should be an array with first dimension = number of + records + compression: str + Type of compression to use for the HDF5 file. + """ + with h5py.File(filename, 'a') as f: + for k, v in arrays.items(): + if k in attribute_types: + if isinstance(v, str): + f.attrs[k] = np.bytes_(v) + else: + f.attrs[k] = v + elif v.dtype.type == np.str_: + itemsize = v.dtype.itemsize // 4 # every character is 4 bytes + dset = f.create_dataset(k, data=v.view(dtype=(np.uint8)), compression=compression) + dset.attrs['strtype'] = b'unicode' + dset.attrs['itemsize'] = itemsize + else: + f.create_dataset(k, data=v, compression=compression) + # STATIC METHODS COMMON ACROSS FORMATS # i.e. 
common methods that can be used by multiple formats in restructuring # (generally these will be used in the unshared fields dims for arrays) diff --git a/pydarnio/borealis/borealis_array.py b/pydarnio/borealis/borealis_array.py index 4ff6afd..8dbcf9c 100644 --- a/pydarnio/borealis/borealis_array.py +++ b/pydarnio/borealis/borealis_array.py @@ -37,8 +37,9 @@ For more information on Borealis data files and how they convert to SDarn files, see: https://borealis.readthedocs.io/en/latest/ """ -import deepdish as dd +import h5py import logging +import numpy as np from typing import List @@ -115,10 +116,10 @@ def __init__(self, filename: str, borealis_filetype: str): # get the version of the file - split by the dash, first part should be # 'vX.X' try: - version = dd.io.load(self.filename, - group='/borealis_git_hash').split('-')[0] - version = '.'.join(version.split('.')[:2]) # vX.Y, ignore patch revision - except ValueError as err: + with h5py.File(self.filename, 'r') as f: + full_version = f.attrs['borealis_git_hash'].decode('utf-8').split('-')[0] + version = '.'.join(full_version.split('.')[:2]) # vX.Y, ignore patch revision + except KeyError as err: raise borealis_exceptions.BorealisStructureError( ' {} Could not find the borealis_git_hash required to ' 'determine read version (file may be site style) {}' @@ -242,49 +243,14 @@ def read_file(self) -> dict: dataset_types = self.format.array_array_dtypes() unshared_fields = self.format.unshared_fields() - self._read_borealis_arrays(attribute_types, dataset_types, - unshared_fields) - return self._arrays - - def _read_borealis_arrays(self, attribute_types: dict, - dataset_types: dict, - unshared_fields: List[str]): - """ - Read the entire file while checking all data fields. - - Parameters - ---------- - attribute_types: dict - Dictionary with the required types for the attributes in the file. - dataset_types: dict - Dictionary with the require dtypes for the numpy arrays in the - file. 
- unshared_fields: List[str] - List of fields that are not shared between the records and - therefore should be an array with first dimension = number of - records - - Raises - ------ - BorealisFieldMissingError - when a field is missing from the Borealis - file - BorealisExtraFieldError - when an extra field is present in the - Borealis file - BorealisDataFormatTypeError - when a field has the incorrect - field type for the Borealis file - BorealisNumberOfRecordsError - when the number of records cannot - be discerned from the arrays - - See Also - -------- - BorealisUtilities - """ - arrays = dd.io.load(self.filename) + arrays = self.format.read_arrays(self.filename) BorealisUtilities.check_arrays(self.filename, arrays, attribute_types, dataset_types, unshared_fields) self._arrays = arrays + return self._arrays + class BorealisArrayWrite(): """ @@ -465,7 +431,14 @@ def write_file(self) -> str: Raises ------ - BorealisFileTypeError + BorealisFieldMissingError - when a field is missing from the Borealis + file + BorealisExtraFieldError - when an extra field is present in the + Borealis file + BorealisDataFormatTypeError - when a field has the incorrect + field type for the Borealis file + BorealisNumberOfRecordsError - when the number of records cannot + be discerned from the arrays See Also -------- @@ -479,45 +452,8 @@ def write_file(self) -> str: attribute_types = self.format.array_single_element_types() dataset_types = self.format.array_array_dtypes() unshared_fields = self.format.unshared_fields() - - self._write_borealis_arrays(attribute_types, dataset_types, - unshared_fields) + BorealisUtilities.check_arrays(self.filename, self.arrays, attribute_types, + dataset_types, unshared_fields) + self.format.write_arrays(self.filename, self.arrays, attribute_types, + dataset_types, unshared_fields, self.compression) return self.filename - - def _write_borealis_arrays(self, attribute_types: dict, - dataset_types: dict, - unshared_fields: List[str]): - """ - Write the entire file while checking all data fields. - - Parameters - ---------- - attribute_types: dict - Dictionary with the required types for the attributes in the file. - dataset_types: dict - Dictionary with the require dtypes for the numpy arrays in the - file. 
- unshared_fields: List[str] - List of fields that are not shared between the records and - therefore should be an array with first dimension = number of - records - - Raises - ------ - BorealisFieldMissingError - when a field is missing from the Borealis - file - BorealisExtraFieldError - when an extra field is present in the - Borealis file - BorealisDataFormatTypeError - when a field has the incorrect - field type for the Borealis file - BorealisNumberOfRecordsError - when the number of records cannot - be discerned from the arrays - - See Also - -------- - BorealisUtilities - """ - BorealisUtilities.check_arrays(self.filename, self.arrays, - attribute_types, dataset_types, - unshared_fields) - dd.io.save(self.filename, self.arrays, compression=self.compression) diff --git a/pydarnio/borealis/borealis_convert.py b/pydarnio/borealis/borealis_convert.py index 785b129..3e8eda3 100644 --- a/pydarnio/borealis/borealis_convert.py +++ b/pydarnio/borealis/borealis_convert.py @@ -42,7 +42,6 @@ """ import logging import numpy as np -import deepdish as dd from datetime import datetime from typing import Union diff --git a/pydarnio/borealis/borealis_formats.py b/pydarnio/borealis/borealis_formats.py index a247a85..e664756 100644 --- a/pydarnio/borealis/borealis_formats.py +++ b/pydarnio/borealis/borealis_formats.py @@ -262,19 +262,19 @@ def single_element_types(cls): return { # Identifies the version of Borealis that made this data. Necessary # for all versions. - "borealis_git_hash": np.unicode_, + "borealis_git_hash": str, # Number used to identify experiment. "experiment_id": np.int64, # Name of the experiment file. - "experiment_name": np.unicode_, + "experiment_name": str, # Comment about the whole experiment - "experiment_comment": np.unicode_, + "experiment_comment": str, # Additional text comment that describes the slice. - "slice_comment": np.unicode_, + "slice_comment": str, # Number of slices in the experiment at this integration time. "num_slices": np.int64, # Three letter radar identifier. - "station": np.unicode_, + "station": str, # Number of sampling periods in the integration time. "num_sequences": np.int64, # range gate separation (equivalent distance between samples), km. @@ -286,7 +286,7 @@ def single_element_types(cls): # Sampling rate of the samples being written to file in Hz. "rx_sample_rate": np.float64, # Designates if the record is the first in a scan. - "scan_start_marker": np.bool_, + "scan_start_marker": np.uint8, # Integration time in seconds. "int_time": np.float32, # Length of the pulse in microseconds. @@ -302,7 +302,7 @@ def single_element_types(cls): "freq": np.uint32, # str denoting C data type of the samples included in the data # array, such as 'complex float'. - "samples_data_type": np.unicode_, + "samples_data_type": str, # data normalization factor determined by the filter scaling in the # decimation scheme. "data_normalization_factor": np.float64, @@ -672,25 +672,25 @@ def single_element_types(cls): return { # Identifies the version of Borealis that made this data. Necessary # for all versions. - "borealis_git_hash": np.unicode_, + "borealis_git_hash": str, # Number used to identify experiment. "experiment_id": np.int64, # Name of the experiment file. - "experiment_name": np.unicode_, + "experiment_name": str, # Comment about the whole experiment - "experiment_comment": np.unicode_, + "experiment_comment": str, # Additional text comment that describes the slice. 
- "slice_comment": np.unicode_, + "slice_comment": str, # Number of slices in the experiment at this integration time. "num_slices": np.int64, # Three letter radar identifier. - "station": np.unicode_, + "station": str, # Number of sampling periods in the integration time. "num_sequences": np.int64, # Sampling rate of the samples being written to file in Hz. "rx_sample_rate": np.float64, # Designates if the record is the first in a scan. - "scan_start_marker": np.bool_, + "scan_start_marker": np.uint8, # Integration time in seconds. "int_time": np.float32, # Length of the pulse in microseconds. @@ -706,7 +706,7 @@ def single_element_types(cls): "freq": np.uint32, # str denoting C data type of the samples included in the data # array, such as 'complex float'. - "samples_data_type": np.unicode_, + "samples_data_type": str, # Number of samples in the sampling period. "num_samps": np.uint32, # range gate separation (equivalent distance between samples), km @@ -1074,25 +1074,25 @@ def single_element_types(cls): return { # Identifies the version of Borealis that made this data. Necessary # for all versions. - "borealis_git_hash": np.unicode_, + "borealis_git_hash": str, # Number used to identify experiment. "experiment_id": np.int64, # Name of the experiment file. - "experiment_name": np.unicode_, + "experiment_name": str, # Comment about the whole experiment - "experiment_comment": np.unicode_, + "experiment_comment": str, # Additional text comment that describes the slice. - "slice_comment": np.unicode_, + "slice_comment": str, # Number of slices in the experiment at this integration time. "num_slices": np.int64, # Three letter radar identifier. - "station": np.unicode_, + "station": str, # Number of sampling periods in the integration time. "num_sequences": np.int64, # Sampling rate of the samples being written to file in Hz. "rx_sample_rate": np.float64, # Designates if the record is the first in a scan. - "scan_start_marker": np.bool_, + "scan_start_marker": np.uint8, # Integration time in seconds. "int_time": np.float32, # Length of the pulse in microseconds. @@ -1108,7 +1108,7 @@ def single_element_types(cls): "freq": np.uint32, # str denoting C data type of the samples included in the data # array, such as 'complex float'. - "samples_data_type": np.unicode_, + "samples_data_type": str, # Number of samples in the sampling period. "num_samps": np.uint32, # data normalization factor determined by the filter scaling in the @@ -1402,23 +1402,23 @@ def single_element_types(cls): return { # Identifies the version of Borealis that made this data. Necessary # for all versions. - "borealis_git_hash": np.unicode_, + "borealis_git_hash": str, # Number used to identify experiment. "experiment_id": np.int64, # Name of the experiment file. - "experiment_name": np.unicode_, + "experiment_name": str, # Comment about the whole experiment - "experiment_comment": np.unicode_, + "experiment_comment": str, # Number of slices in the experiment at this integration time. "num_slices": np.int64, # Three letter radar identifier. - "station": np.unicode_, + "station": str, # Number of sampling periods in the integration time. "num_sequences": np.int64, # Sampling rate of the samples being written to file in Hz. "rx_sample_rate": np.float64, # Designates if the record is the first in a scan. - "scan_start_marker": np.bool_, + "scan_start_marker": np.uint8, # Integration time in seconds. "int_time": np.float32, # Number of main array antennas. 
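The same remapping repeats in every format class below: np.unicode_ becomes plain str and np.bool_ becomes np.uint8, so string scalars can be written as HDF5 attributes and boolean scalars share the integer fill value (-1) used when padding array-structured files. A small sketch (hypothetical record key and values) of how the remapped scalars behave as h5py attributes:

import h5py
import numpy as np

with h5py.File('scalars_example.h5', 'w') as f:          # hypothetical file name
    rec = f.create_group('1616518000000')                 # record key: ms since epoch
    rec.attrs['station'] = np.bytes_('sas')               # str field -> bytes attribute
    rec.attrs['scan_start_marker'] = np.uint8(True)       # bool field -> uint8 attribute
    rec.attrs['gps_locked'] = np.uint8(False)

with h5py.File('scalars_example.h5', 'r') as f:
    rec = f['1616518000000']
    station = rec.attrs['station'].decode('utf-8')        # 'sas'
    scan_start = bool(rec.attrs['scan_start_marker'])     # True

This mirrors how write_records stores the attribute_types fields (np.bytes_ for strings) and how read_records decodes them on the way back out.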
@@ -1427,7 +1427,7 @@ def single_element_types(cls): "intf_antenna_count": np.uint32, # str denoting C data type of the samples included in the data # array, such as 'complex float'. - "samples_data_type": np.unicode_, + "samples_data_type": str, # The center frequency of this data in kHz "rx_center_freq": np.float64, # Number of samples in the sampling period. @@ -1513,12 +1513,12 @@ def single_element_types(cls): # the slice id of the file and dataset. "slice_id": np.uint32, # the interfacing of this slice to other slices. - "slice_interfacing": np.unicode_, + "slice_interfacing": str, # A string describing the type of scheduling time at the time of # this dataset. - "scheduling_mode": np.unicode_, + "scheduling_mode": str, # A string describing the averaging method, ex. mean, median - "averaging_method": np.unicode_, + "averaging_method": str, # number of blanked samples in the sequence. "num_blanked_samples": np.uint32 }) @@ -1663,10 +1663,10 @@ def single_element_types(cls): # the slice id of the file and dataset. "slice_id": np.uint32, # the interfacing of this slice to other slices. - "slice_interfacing": np.unicode_, + "slice_interfacing": str, # A string describing the type of scheduling time at the time of # this dataset. - "scheduling_mode": np.unicode_, + "scheduling_mode": str, # number of blanked samples in the sequence. "num_blanked_samples": np.uint32 }) @@ -1806,10 +1806,10 @@ def single_element_types(cls): # the slice id of the file and dataset. "slice_id": np.uint32, # the interfacing of this slice to other slices. - "slice_interfacing": np.unicode_, + "slice_interfacing": str, # A string describing the type of scheduling time at the time of # this dataset. - "scheduling_mode": np.unicode_, + "scheduling_mode": str, # number of blanked samples in the sequence. "num_blanked_samples": np.uint32 }) @@ -1955,7 +1955,7 @@ def single_element_types(cls): single_element_types.update({ # A string describing the type of scheduling time at the time of # this dataset. - "scheduling_mode": np.unicode_ + "scheduling_mode": str }) return single_element_types @@ -2044,7 +2044,7 @@ def single_element_types(cls): "lp_status_word": np.uint32, # Boolean indicating if the GPS was locked during the entire # integration period - "gps_locked": np.bool_, + "gps_locked": np.uint8, # The max time diffe between GPS and system time during the # integration period. In seconds. Negative if GPS time ahead. "gps_to_system_time_diff": np.float64, @@ -2152,7 +2152,7 @@ def single_element_types(cls): "lp_status_word": np.uint32, # Boolean indicating if the GPS was locked during the entire # integration period - "gps_locked": np.bool_, + "gps_locked": np.uint8, # The max time diffe between GPS and system time during the # integration period. In seconds. Negative if GPS time ahead. "gps_to_system_time_diff": np.float64, @@ -2267,7 +2267,7 @@ def single_element_types(cls): "lp_status_word": np.uint32, # Boolean indicating if the GPS was locked during the entire # integration period - "gps_locked": np.bool_, + "gps_locked": np.uint8, # The max time diffe between GPS and system time during the # integration period. In seconds. Negative if GPS time ahead. "gps_to_system_time_diff": np.float64, @@ -2374,7 +2374,7 @@ def single_element_types(cls): "lp_status_word": np.uint32, # Boolean indicating if the GPS was locked during the entire # integration period - "gps_locked": np.bool_, + "gps_locked": np.uint8, # The max time diffe between GPS and system time during the # integration period. In seconds. 
Negative if GPS time ahead. "gps_to_system_time_diff": np.float64, diff --git a/pydarnio/borealis/borealis_restructure.py b/pydarnio/borealis/borealis_restructure.py index 228d608..536ea61 100755 --- a/pydarnio/borealis/borealis_restructure.py +++ b/pydarnio/borealis/borealis_restructure.py @@ -34,9 +34,7 @@ import os import subprocess as sp import warnings -from pathlib import Path import h5py -import deepdish as dd import logging import numpy as np from datetime import datetime @@ -208,57 +206,64 @@ def _array_to_site_restructure(self): attribute_types = self.format.site_single_element_types() dataset_types = self.format.array_dtypes() try: - shared_fields_dict = dict() - # shared fields are common across records, so this is done once - for field in self.format.shared_fields(): - field_data = dd.io.load(self.infile_name, '/{}'.format(field)) - shared_fields_dict[field] = field_data - - unshared_single_elements = dict() - # These are fields which have one element per record, so the - # arrays are small enough to be loaded completely into memory - for field in self.format.unshared_fields(): - if field in self.format.single_element_types(): - unshared_single_elements[field] = dd.io.load( - self.infile_name, '/{}'.format(field)) - - sqn_timestamps_array = dd.io.load(self.infile_name, - '/sqn_timestamps') - for record_num, seq_timestamp in enumerate(sqn_timestamps_array): - # format dictionary key in the same way it is done - # in datawrite on site - seq_datetime = datetime.utcfromtimestamp(seq_timestamp[0]) - epoch = datetime.utcfromtimestamp(0) - key = str(int((seq_datetime - epoch).total_seconds() * 1000)) - - # Make this fresh every time, to reduce memory footprint - record_dict = dict() - - # Copy over the shared fields - for k, v in shared_fields_dict.items(): - record_dict[k] = v - - # populate site specific fields using given functions - # that take both the arrays data and the record number - with h5py.File(self.infile_name, 'r') as f: + with h5py.File(self.infile_name, 'r') as f: + + # shared fields are common across records, so this is done once + shared_fields_dict = dict() + for field in self.format.shared_fields(): + if field in attribute_types: + data = f.attrs[field] + if isinstance(data, bytes): + data = data.decode('utf-8') + elif field in self.format.array_string_fields(): + dset = f[field] + itemsize = dset.attrs['itemsize'] + data = dset[:].view(dtype=(np.unicode_, itemsize)) + else: + data = f[field][:] + shared_fields_dict[field] = data + + # These are fields which have one element per record, so the + # arrays are small enough to be loaded completely into memory + unshared_single_elements = dict() + for field in self.format.unshared_fields(): + if field in self.format.single_element_types(): + unshared_single_elements[field] = f[field][:] + + sqn_timestamps_array = f['sqn_timestamps'][:] + + for record_num, seq_timestamp in enumerate(sqn_timestamps_array): + # format dictionary key in the same way it is done + # in datawrite on site + seq_datetime = datetime.utcfromtimestamp(seq_timestamp[0]) + epoch = datetime.utcfromtimestamp(0) + key = str(int((seq_datetime - epoch).total_seconds() * 1000)) + + # Make this fresh every time, to reduce memory footprint + record_dict = dict() + + # Copy over the shared fields + for k, v in shared_fields_dict.items(): + record_dict[k] = v + + # populate site specific fields using given functions + # that take both the arrays data and the record number for field in self.format.site_specific_fields(): record_dict[field] = \ 
self.format.site_specific_fields_generate( )[field](f, record_num) - for field in self.format.unshared_fields(): - if field in self.format.single_element_types(): - datatype = self.format.single_element_types()[field] - # field is not an array, single element per record. - # unshared_field_dims_site should give empty list. - record_dict[field] = \ - datatype(unshared_single_elements[field][ - record_num]) - else: # field in array_dtypes - # need to get the dims correct, - # not always equal to the max - field_flag = False - with h5py.File(self.infile_name, 'r') as f: + for field in self.format.unshared_fields(): + if field in self.format.single_element_types(): + datatype = self.format.single_element_types()[field] + # field is not an array, single element per record. + # unshared_field_dims_site should give empty list. + record_dict[field] = \ + datatype(unshared_single_elements[field][ + record_num]) + else: # field in array_dtypes + # need to get the dims correct, not always equal to the max + field_flag = False site_dims = [dimension_function(f, record_num) for dimension_function in self.format.unshared_fields_dims_site( @@ -277,18 +282,16 @@ def _array_to_site_restructure(self): index_slice = [slice(0, i) for i in site_dims if i != -1] index_slice.insert(0, record_num) index_slice = tuple(index_slice) - # If there was an incorrect dimension (-1 in dims), then use deepdish to extract the field - if field_flag: - record_dict[field] = dd.io.load(self.infile_name, f'/{field}')[index_slice] - else: - record_dict[field] = f[field][index_slice] - # Wrap in another dict to use the format method - record_dict = OrderedDict({key: record_dict}) - record_dict = self.format.flatten_site_arrays(record_dict) - - # Write the single record to file - self._write_borealis_record(record_dict, key, attribute_types, - dataset_types) + record_dict[field] = f[field][index_slice] + + # Wrap in another dict to use the format method + record_dict = OrderedDict({key: record_dict}) + record_dict = self.format.flatten_site_arrays(record_dict) + BorealisUtilities.check_records(self.infile_name, record_dict, attribute_types, dataset_types) + + # Write the single record to file + self.format.write_records(self.outfile_name, record_dict, attribute_types, dataset_types, + self.compression) except Exception as err: raise borealis_exceptions.BorealisRestructureError( 'Records for {}: Error restructuring {} from array to site ' @@ -333,7 +336,7 @@ def _site_to_array_restructure(self): rec_dict.update({k: record.attrs[k] for k in rec_attrs}) # Bitwise fields also need to be handled separately for field in self.format.bool_types(): - rec_dict[field] = dd.io.load(self.infile_name, f'/{record_name}/{field}') + rec_dict[field] = f[record_name][field] # some fields are linear in site style and need to be reshaped. # Pass in record nested in a dictionary, as @@ -359,8 +362,9 @@ def _site_to_array_restructure(self): else: raise TypeError(f'Field {field} has unrecognized data: {value}') elif field in self.format.array_string_fields(): - # h5py reads numpy string arrays as contiguous unsigned ints, so we need deepdish here - new_data_dict[field] = dd.io.load(self.infile_name, f'/{record_name}/{field}') + dset = f[record_name][field] + itemsize = dset.attrs['itemsize'] + new_data_dict[field] = dset[:].view(dtype=(np.unicode_, itemsize)) else: raise TypeError(f'Field {field} unrecognized') @@ -374,7 +378,7 @@ def _site_to_array_restructure(self): # Initialize array now with correct data type. 
dtype = self.format.single_element_types()[field] new_data_dict[field] = np.empty(num_records, dtype=dtype) - if dtype is np.int64 or dtype is np.uint32: + if dtype in [np.int64, np.uint32, np.uint8]: new_data_dict[field][:] = -1 else: new_data_dict[field][:] = np.NaN @@ -396,7 +400,7 @@ def _site_to_array_restructure(self): datatype = self.format.single_element_types()[field] else: # field in array_dtypes datatype = self.format.array_dtypes()[field] - if datatype == np.unicode_: + if datatype == str: # unicode type needs to be explicitly set to # have multiple chars (256) datatype = '|U256' @@ -406,7 +410,7 @@ def _site_to_array_restructure(self): # change between records), so they are initialized # with a known value first. Initialize floating- # point values to NaN, and integer values to -1. - if datatype is np.int64 or datatype is np.uint32: + if datatype in [np.int64, np.uint32, np.uint8]: empty_array[:] = -1 else: empty_array[:] = np.NaN @@ -433,65 +437,13 @@ def _site_to_array_restructure(self): attribute_types = self.format.array_single_element_types() dataset_types = self.format.array_array_dtypes() unshared_fields = self.format.unshared_fields() - BorealisUtilities.check_arrays(self.infile_name, new_data_dict, - attribute_types, dataset_types, + BorealisUtilities.check_arrays(self.infile_name, new_data_dict, attribute_types, dataset_types, unshared_fields) - dd.io.save(self.outfile_name, new_data_dict, - compression=self.compression) + self.format.write_arrays(self.outfile_name, new_data_dict, attribute_types, dataset_types, unshared_fields, + self.compression) except TypeError as err: raise borealis_exceptions.BorealisRestructureError( 'Records for {}: Error restructuring {} from site to array ' 'style: {}'.format(self.infile_name, self.format.__name__, err) ) from err - - def _write_borealis_record(self, record: dict, record_name: str, - attribute_types: dict, dataset_types: dict): - """ - Add a record to the output file in site style after checking the record. - - Several Borealis field checks are done to insure the integrity of the - record. - - Parameters - ---------- - record: dict - Dictionary containing the site-structured record. - record_name: str - Group name of the record for the HDF5 hierarchy. - attribute_types: dict - Dictionary with the required types for the attributes in the file. - dataset_types: dict - Dictionary with the required dtypes for the numpy arrays in the - file. - - Raises - ------ - BorealisFieldMissingError - BorealisExtraFieldError - BorealisDataFormatTypeError - - See Also - -------- - BorealisUtilities - """ - Path(self.outfile_name).touch() - BorealisUtilities.check_records(self.infile_name, record, - attribute_types, dataset_types) - - # use external h5copy utility to move new record into 2hr file. - - warnings.filterwarnings("ignore") - # Must use temporary file to append to a file; writing entire - # dictionary at once also doesn't work so this is required. 
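The temporary-file/h5copy workaround being deleted here (and its twin in borealis_site.py below) is what the new write_records path replaces: opening the output file in append mode and creating one group per record needs no external utility. A rough sketch of that append pattern, with hypothetical data:

import h5py
import numpy as np

record = {'int_time': np.float32(3.5),
          'data': np.zeros(10, dtype=np.float32)}         # hypothetical record contents

with h5py.File('site_example.h5', 'a') as f:               # 'a' appends to an existing file
    grp = f.create_group('1616518003500')                  # one group per record key
    grp.attrs['int_time'] = record['int_time']             # scalar fields become attributes
    grp.create_dataset('data', data=record['data'],
                       compression='gzip')                  # array fields become datasets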
- tmp_filename = self.outfile_name + '.tmp' - Path(tmp_filename).touch() - - dd.io.save(tmp_filename, record[record_name], - compression=self.compression) - f = dd.io.load(tmp_filename, '/') - cp_cmd = 'h5copy -i {newfile} -o {full_file} -s / -d {dtstr}' - cmd = cp_cmd.format(newfile=tmp_filename, full_file=self.outfile_name, - dtstr=record_name) - sp.run(cmd.split()) - os.remove(tmp_filename) diff --git a/pydarnio/borealis/borealis_site.py b/pydarnio/borealis/borealis_site.py index 0c5b2b3..473ae4a 100644 --- a/pydarnio/borealis/borealis_site.py +++ b/pydarnio/borealis/borealis_site.py @@ -35,15 +35,14 @@ Add compression to bzip2 """ -import deepdish as dd import h5py import logging import os import subprocess as sp import warnings +import numpy as np from collections import OrderedDict -from pathlib2 import Path from typing import Union from pydarnio import borealis_exceptions, borealis_formats @@ -125,11 +124,11 @@ def __init__(self, filename: str, borealis_filetype: str): # 'vX.X' try: - version = dd.io.load(self.filename, - group='/'+self._record_names[0] - )['borealis_git_hash'].split('-')[0] - version = '.'.join(version.split('.')[:2]) # vX.Y, ignore patch revision - except (IndexError, ValueError) as err: + with h5py.File(self.filename, 'r') as f: + first_rec = f[self._record_names[0]] + full_version = first_rec.attrs['borealis_git_hash'].decode('utf-8').split('-')[0] + version = '.'.join(full_version.split('.')[:2]) # vX.Y, ignore patch revision + except (IndexError, KeyError) as err: # if this is an array style file, it will raise # IndexError on the array. raise borealis_exceptions.BorealisStructureError( @@ -247,36 +246,9 @@ def read_file(self) -> dict: records: OrderedDict{dict} records of Borealis rawacf data. Keys are first sequence timestamp (in ms since epoch). - """ - pyDARNio_log.info("Reading Borealis {} {} file: {}" - "".format(self.software_version, - self.borealis_filetype, self.filename)) - - attribute_types = self.format.site_single_element_types() - dataset_types = self.format.site_array_dtypes() - - self._read_borealis_records(attribute_types, dataset_types) - return self._records - - def _read_borealis_records(self, attribute_types: dict, - dataset_types: dict): - """ - Read the entire file while checking all data fields. - - Several Borealis field checks are done to insure the integrity of the - file. - - Parameters - ---------- - attribute_types: dict - Dictionary with the required types for the attributes in the file. - dataset_types: dict - Dictionary with the require dtypes for the numpy arrays in the - file. Raises ------ - OSError: file does not exist BorealisFieldMissingError - when a field is missing from the Borealis file/stream type BorealisExtraFieldError - when an extra field is present in the @@ -288,11 +260,19 @@ def _read_borealis_records(self, attribute_types: dict, -------- BorealisUtilities """ - records = dd.io.load(self.filename) + pyDARNio_log.info("Reading Borealis {} {} file: {}" + "".format(self.software_version, + self.borealis_filetype, self.filename)) + + attribute_types = self.format.site_single_element_types() + dataset_types = self.format.site_array_dtypes() + + records = self.format.read_records(self.filename) BorealisUtilities.check_records(self.filename, records, attribute_types, dataset_types) self._records = OrderedDict(sorted(records.items())) + return self._records class BorealisSiteWrite(): @@ -487,34 +467,8 @@ def write_file(self) -> str: Returns ------- - filename + filename: str The filename written to. 
- """ - pyDARNio_log.info("Writing Borealis {} {} file: {}" - "".format(self.software_version, - self.borealis_filetype, self.filename)) - - attribute_types = self.format.site_single_element_types() - dataset_types = self.format.site_array_dtypes() - - self._write_borealis_records(attribute_types, dataset_types) - return self.filename - - def _write_borealis_records(self, attribute_types: dict, - dataset_types: dict): - """ - Write the file in site style after checking records. - - Several Borealis field checks are done to insure the integrity of the - file. - - Parameters - ---------- - attributes_type_dict: dict - Dictionary with the required types for the attributes in the file. - datasets_type_dict: dict - Dictionary with the require dtypes for the numpy arrays in the - file. Raises ------ @@ -524,27 +478,15 @@ def _write_borealis_records(self, attribute_types: dict, Borealis file/stream type BorealisDataFormatTypeError - when a field has the incorrect field type for the Borealis file/stream type - - See Also - -------- - BorealisUtilities """ - Path(self.filename).touch() + pyDARNio_log.info("Writing Borealis {} {} file: {}" + "".format(self.software_version, + self.borealis_filetype, self.filename)) + + attribute_types = self.format.site_single_element_types() + dataset_types = self.format.site_array_dtypes() BorealisUtilities.check_records(self.filename, self.records, attribute_types, dataset_types) - - # use external h5copy utility to move new record into 2hr file. - - warnings.filterwarnings("ignore") - # Must use temporary file to append to a file; writing entire - # dictionary at once also doesn't work so this is required. - tmp_filename = self.filename + '.tmp' - Path(tmp_filename).touch() - for group_name, group_dict in self.records.items(): - dd.io.save(tmp_filename, {str(group_name): group_dict}, - compression=self.compression) - cp_cmd = 'h5copy -i {newfile} -o {full_file} -s {dtstr} -d {dtstr}' - cmd = cp_cmd.format(newfile=tmp_filename, full_file=self.filename, - dtstr='/'+str(group_name)) - sp.call(cmd.split()) - os.remove(tmp_filename) + self.format.write_records(self.filename, self.records, attribute_types, + dataset_types, self.compression) + return self.filename diff --git a/pydarnio/borealis/borealis_utilities.py b/pydarnio/borealis/borealis_utilities.py index 10929c2..bc7e8fc 100644 --- a/pydarnio/borealis/borealis_utilities.py +++ b/pydarnio/borealis/borealis_utilities.py @@ -28,7 +28,6 @@ """ import logging -import deepdish as dd import h5py import numpy as np import sys @@ -263,7 +262,8 @@ def record_incorrect_types_check(filename: str, attributes_type_dict: dict, incorrect_types_check = {param: str(attributes_type_dict[param]) for param in attributes_type_dict.keys() if type(record[param]) != - attributes_type_dict[param]} + attributes_type_dict[param] and + record[param].shape is not None} incorrect_types_check.update({param: 'np.ndarray of ' + str(datasets_type_dict[param]) @@ -322,7 +322,8 @@ def array_incorrect_types_check(filename: str, attributes_type_dict: dict, incorrect_types_check = {param: str(attributes_type_dict[param]) for param in attributes_type_dict.keys() if type(file_data[param]) != - attributes_type_dict[param]} + attributes_type_dict[param] and + file_data[param].shape is not None} datasets_type_dict_keys = sorted(list(datasets_type_dict.keys())) np_array_types = [isinstance(file_data[param], np.ndarray) for param in @@ -343,7 +344,8 @@ def array_incorrect_types_check(filename: str, attributes_type_dict: dict, str(datasets_type_dict[param]) 
for param in datasets_type_dict.keys() if file_data[param].dtype.type != - datasets_type_dict[param]}) + datasets_type_dict[param] and + file_data[param].dtype.type != np.str_}) if len(incorrect_types_check) > 0: raise borealis_exceptions.\ BorealisDataFormatTypeError(filename, @@ -560,19 +562,21 @@ def get_borealis_version(filename: str, record_names, structure: str): """ if structure == 'array': try: - borealis_git_hash = dd.io.load(filename, - group='/borealis_git_hash') - except ValueError as err: + with h5py.File(filename, 'r') as f: + borealis_git_hash = f.attrs['borealis_git_hash'].decode('utf-8') + except KeyError as err: raise borealis_exceptions.BorealisStructureError( ' {} Could not find the borealis_git_hash required to ' 'determine file version. Data file may be corrupted. {}' ''.format(filename, err)) from err elif structure == 'site': try: - borealis_git_hash = \ - dd.io.load(filename, group='/{}/borealis_git_hash' - ''.format(record_names[0])) - except ValueError as err: + with h5py.File(filename, 'r') as f: + records = sorted(list(f.keys())) + first_rec = f[records[0]] + borealis_git_hash = first_rec.attrs['borealis_git_hash']\ + .decode('utf-8') + except KeyError as err: raise borealis_exceptions.BorealisStructureError( ' {} Could not find the borealis_git_hash required to ' 'determine file version. Data file may be corrupted. {}' diff --git a/setup.cfg b/setup.cfg index 0e674e9..68fcbcd 100644 --- a/setup.cfg +++ b/setup.cfg @@ -20,7 +20,6 @@ install_requires = pyyaml numpy h5py>=3.3.0 - deepdish pathlib2 [options.packages.find] @@ -28,4 +27,4 @@ exclude = test* test_files* docs* - build* \ No newline at end of file + build* diff --git a/setup.py b/setup.py new file mode 100644 index 0000000..7f1901d --- /dev/null +++ b/setup.py @@ -0,0 +1,56 @@ +""" +Copyright 2018 SuperDARN Canada, University of Saskatchewan + +setup.py +2018-11-05 +To setup pyDARNio as a third party library. Include installing need libraries for +running the files. + +author: +Marina Schmidt + +Disclaimer: +pyDARNio is under the LGPL v3 license found in the root directory LICENSE.md +Everyone is permitted to copy and distribute verbatim copies of this license +document, but changing it is not allowed. + +This version of the GNU Lesser General Public License incorporates the terms +and conditions of version 3 of the GNU General Public License, +supplemented by the additional permissions listed below. 
+ +""" + +from os import path +from setuptools import setup, find_packages +import sys +from subprocess import check_call +from setuptools.command.install import install, orig + +this_directory = path.abspath(path.dirname(__file__)) +with open(path.join(this_directory, 'README.md'), encoding='utf-8') as f: + long_description = f.read() + + +# Setup information +setup( + name="pydarnio", + version="1.2.1", + long_description=long_description, + long_description_content_type='text/markdown', + description="Python library for reading and writing SuperDARN data", + url='https://github.com/SuperDARN/pyDARNio.git', + classifiers=[ + 'Development Status :: 5 - Production/Stable', + 'License :: OSI Approved :: GNU Lesser General Public License v3 (LGPLv3)', + 'Programming Language :: Python :: 3.6', + 'Programming Language :: Python :: 3.7'], + python_requires='>=3.6', + packages=find_packages(exclude=['docs', 'test']), + author="SuperDARN", + include_package_data=True, + setup_requires=['pyyaml', 'numpy', + 'h5py>=3.3.0', 'pathlib2'], + # pyyaml library install + install_requires=['pyyaml', 'numpy', + 'h5py>=3.3.0', 'deepdish', 'pathlib2'] +)
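Finally, the version detection that previously went through dd.io.load reduces to plain attribute lookups under h5py. A minimal sketch of the pattern now used in get_borealis_version and the Read classes, with hypothetical file names and assuming borealis_git_hash was written as bytes (np.bytes_), as the writers above do:

import h5py

# Array-structured file: borealis_git_hash is a top-level attribute.
with h5py.File('20230323.1200.00.sas.0.rawacf.hdf5', 'r') as f:
    git_hash = f.attrs['borealis_git_hash'].decode('utf-8')

# Site-structured file: it lives on each timestamped record group instead.
with h5py.File('20230323.1200.00.sas.0.rawacf.hdf5.site', 'r') as f:
    first_rec = f[sorted(list(f.keys()))[0]]
    git_hash = first_rec.attrs['borealis_git_hash'].decode('utf-8')

# e.g. 'v0.6.1-dirty' -> 'v0.6' (major.minor only, patch revision ignored)
version = '.'.join(git_hash.split('-')[0].split('.')[:2])

A missing borealis_git_hash raises KeyError here, which the library wraps in BorealisStructureError as shown in the hunks above.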