Merge pull request pysat#1123 from pysat/expand_dims
Expand xarray dims when concatenating data
aburrell authored May 31, 2023
2 parents 0d66579 + 1322c4d commit 6a9ecc7
Showing 6 changed files with 344 additions and 17 deletions.
2 changes: 2 additions & 0 deletions CHANGELOG.md
@@ -6,6 +6,8 @@ This project adheres to [Semantic Versioning](https://semver.org/).
[3.1.0] - 2023-06-02
--------------------
* New Features
* Added a utility to ensure two xarray Datasets can be concatenated, and
incorporated this utility into the Instrument file loading
* Added unit tests for different file cadences in the Instrument class
* Added `to_inst` method to the Constellation class
* Added `export_pysat_info` kwarg to `to_netcdf` routines to select whether
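The new utility in action, as a rough sketch assembled from the tests added later in this diff (the test instruments and `coords.expand_xarray_dims` are names this PR introduces or uses; exact behavior should be checked against `pysat.utils.coords`):

```python
import datetime as dt

import pysat
from pysat.utils import coords

# Load two test Datasets whose dimensions differ apart from 'time',
# mirroring TestExpandXarrayDims.set_data_meta below
start = pysat.instruments.pysat_ndtesting._test_dates['']['']

inst = pysat.Instrument(inst_module=pysat.instruments.pysat_ndtesting,
                        use_header=True)
inst.load(date=start)

model = pysat.Instrument(inst_module=pysat.instruments.pysat_testmodel,
                         use_header=True)
model.load(date=start + dt.timedelta(days=1))

# Pad the Datasets so that dimensions shared by name agree in shape
out_list = coords.expand_xarray_dims([inst.data, model.data], inst.meta,
                                     dims_equal=False)
```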
15 changes: 15 additions & 0 deletions pysat/_instrument.py
@@ -2389,9 +2389,24 @@ def concat_data(self, new_data, prepend=False, **kwargs):
kwargs['sort'] = False
concat_func = pds.concat
else:
# Ensure the dimensions are equal
equal_dims = True
idat = 0
while idat < len(new_data) - 1 and equal_dims:
if new_data[idat].dims != new_data[idat + 1].dims:
equal_dims = False
idat += 1

if not equal_dims:
# Update the dimensions, padding data where necessary
new_data = pysat.utils.coords.expand_xarray_dims(
new_data, self.meta, exclude_dims=['time'])

# Specify the dimension, if not otherwise specified
if 'dim' not in kwargs:
kwargs['dim'] = self.index.name

# Set the concat function
concat_func = xr.concat

# Assign the concatenated data to the instrument
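The guard added to `concat_data` can be summarized outside the class as a minimal sketch (a standalone illustration of the logic above, not the method itself; `concat_with_padding`, `datasets`, and `meta` are hypothetical names):

```python
import xarray as xr

import pysat


def concat_with_padding(datasets, meta, dim='time'):
    """Concatenate xarray Datasets, padding mismatched dimensions first."""
    # Same comparison as the while-loop above, which exits on the
    # first mismatch between consecutive Datasets
    equal_dims = all(datasets[i].dims == datasets[i + 1].dims
                     for i in range(len(datasets) - 1))

    if not equal_dims:
        # Update the dimensions, padding data where necessary
        datasets = pysat.utils.coords.expand_xarray_dims(
            datasets, meta, exclude_dims=[dim])

    return xr.concat(datasets, dim=dim)
```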
38 changes: 28 additions & 10 deletions pysat/_meta.py
@@ -428,7 +428,8 @@ def __setitem__(self, data_vars, input_dat):
# If this is a disagreement between byte data
# and an expected str, resolve it here
if(isinstance(to_be_set, bytes)
and self.labels.label_type[iattr] == str):
and str in pysat.utils.listify(
self.labels.label_type[iattr])):
to_be_set = core_utils.stringify(to_be_set)
else:
# This type is incorrect, try casting it
@@ -440,16 +441,18 @@ def __setitem__(self, data_vars, input_dat):
iattr])])
try:
if hasattr(to_be_set, '__iter__'):
if self.labels.label_type[
iattr] == str:
if str in pysat.utils.listify(
self.labels.label_type[
iattr]):
to_be_set = '\n\n'.join(
[str(tval) for tval in
to_be_set])
else:
raise TypeError("can't recast")
else:
to_be_set = self.labels.label_type[
iattr](to_be_set)
to_be_set = pysat.utils.listify(
self.labels.label_type[
iattr])[0](to_be_set)

# Inform user data was recast
pysat.logger.info(''.join((
@@ -838,17 +841,22 @@ def _insert_default_values(self, data_var, data_type=None):
var_types = pysat.utils.listify(data_type)

for i, var in enumerate(data_vars):
if name_idx is not None:
default_vals[name_idx] = var

# Use the label defaults if this variable doesn't need to consider
# the data type
if not np.any(list(need_data_type.values())):
self._data.loc[var, labels] = default_vals
data_default = list(default_vals)
else:
data_default = [
self.labels.default_values_from_attr(
lattrs[j], var_types[i]) if need_data_type[lattrs[j]]
else val for j, val in enumerate(default_vals)]
self._data.loc[var, labels] = data_default

# The default value for the name must be set after to be consistent
if name_idx is not None:
data_default[name_idx] = var

# Update the meta data to the desired defaults
self._data.loc[var, labels] = data_default

return

@@ -1853,6 +1861,10 @@ def _update_label_types(self):
elif self.label_type[lkey] == int:
self.label_type[lkey] = (int, np.int64, np.int32, np.int16,
np.int8, bool)
elif self.label_type[lkey] == str:
self.label_type[lkey] = (str, np.str_)
elif self.label_type[lkey] == bool:
self.label_type[lkey] = (bool, np.bool_)
elif isinstance(self.label_type[lkey], tuple):
ltypes = list(self.label_type[lkey])

@@ -1862,6 +1874,12 @@ def _update_label_types(self):
if int in ltypes:
ltypes.extend([np.int64, np.int32, np.int16, np.int8, bool])

if str in ltypes:
ltypes.append(np.str_)

if bool in ltypes:
ltypes.append(np.bool_)

# This may result in duplicate numpy types, but order is more
# important than carrying around a duplicate type, as the first
# type in the provided tuple is the default type
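The practical upshot of the `_update_label_types` change is that scalar label types now also match their numpy counterparts, and the `listify`-based checks above treat single types and tuples uniformly. A small sketch of that pattern (hypothetical values; `listify` is the existing `pysat.utils.listify`):

```python
import numpy as np

import pysat

# After _update_label_types, a plain `str` label type becomes (str, np.str_)
label_type = (str, np.str_)

# Membership tests work whether the label type is one type or a tuple
assert str in pysat.utils.listify(label_type)
assert isinstance(np.str_('F10.7'), label_type)

# Casting uses the first entry in the tuple, since it is the default type
cast_val = pysat.utils.listify(label_type)[0](42)  # yields '42'
```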
15 changes: 8 additions & 7 deletions pysat/tests/test_meta.py
@@ -411,8 +411,9 @@ def test_set_meta_with_wrong_type_cast(self, bad_val, caplog):

# Test the warning
captured = caplog.text
assert captured.find('Metadata with type') >= 0
assert captured.find('Recasting input') >= 0
estr = "missing expected message in: {:}".format(captured)
assert captured.find('Metadata with type') >= 0, estr
assert captured.find('Recasting input') >= 0, estr

# Check that meta is set
if hasattr(bad_val, "__iter__"):
@@ -738,8 +739,8 @@ def test_meta_assignment(self, custom_attr, assign_type):
self.dval = 'test_meta_dict_assignment'
self.default_val = {
getattr(self.meta.labels, mattr): ' '.join(['test', mattr])
if self.meta.labels.label_type[mattr] == str else -47
for mattr in self.meta.labels.label_type.keys()}
if str in pysat.utils.listify(self.meta.labels.label_type[mattr])
else -47 for mattr in self.meta.labels.label_type.keys()}
self.default_name = []
self.default_nan = []

@@ -771,9 +772,9 @@ def test_multiple_meta_assignment(self, custom_attr, assign_type):
dvals = ['mult1', 'mult2']
default_vals = {
getattr(self.meta.labels, mattr): [
' '.join(['test', mattr, self.dval])
if self.meta.labels.label_type[mattr] == str else -47
for self.dval in dvals]
' '.join(['test', mattr, self.dval]) if str
in pysat.utils.listify(self.meta.labels.label_type[mattr])
else -47 for self.dval in dvals]
for mattr in self.meta.labels.label_type.keys()}
self.default_name = []
self.default_nan = []
191 changes: 191 additions & 0 deletions pysat/tests/test_utils_coords.py
@@ -359,3 +359,194 @@ def test_establish_common_coord_single_val_only(self):
assert self.short_coord[0] == out[0], "unexpected value"
assert len(out) == 1, "unexpected coordinate length"
return


class TestExpandXarrayDims(object):
"""Unit tests for the `expand_xarray_dims` function."""

def setup_method(self):
"""Set up the unit test environment."""
self.test_inst = pysat.Instrument(
inst_module=pysat.instruments.pysat_ndtesting, use_header=True)
self.start_time = pysat.instruments.pysat_ndtesting._test_dates['']['']
self.data_list = []
self.out = None
self.meta = None
return

def teardown_method(self):
"""Clean up the unit test environment."""
del self.test_inst, self.start_time, self.data_list, self.meta, self.out
return

def set_data_meta(self, dims_equal):
"""Set the input data list and meta data.
Parameters
----------
dims_equal : bool
If True, the dimension variables for the data sets should be the
same; if False they should have different dimensions apart from
the 'time' dimension
"""

self.test_inst.load(date=self.start_time)
self.data_list.append(self.test_inst.data)
self.meta = self.test_inst.meta

# The second data set should have half the time samples
num_samples = int(self.test_inst.index.shape[0] / 2)

if dims_equal:
# Load a second data set with half the time samples
self.test_inst = pysat.Instrument(
inst_module=self.test_inst.inst_module,
num_samples=num_samples, use_header=True)
else:
# Load a second data set with different dimensions apart from time
self.test_inst = pysat.Instrument(
inst_module=pysat.instruments.pysat_testmodel,
num_samples=num_samples, use_header=True)

self.test_inst.load(date=self.start_time + dt.timedelta(days=1))
self.data_list.append(self.test_inst.data)

return

def eval_dims(self, dims_equal, exclude_dims=None, default_fill_val=None):
"""Set the input data list and meta data.
Parameters
----------
dims_equal : bool
If True, the dimension variables for the data sets should be the
same; if False they should have different dimensions apart from
the 'time' dimension
exclude_dims : list-like or NoneType
A list of dimensions that have the same name but may have different
values, or None if all dimensions with the same name should have the
same shape. (default=None)
default_fill_val : any
The expected fill value for data variables not present in self.meta
(default=None)
"""
if exclude_dims is None:
exclude_dims = []

# Define the reference Dataset
ref_dims = list(self.out[0].dims.keys())

# Cycle through the remaining Datasets
for i, xdata in enumerate(self.out[1:]):
test_dims = list(xdata.dims.keys())

# Test that the expected dimension names overlap between datasets
if dims_equal:
testing.assert_lists_equal(test_dims, ref_dims)
else:
for tdim in test_dims:
assert (tdim == 'time' if tdim in ref_dims else tdim
!= 'time'), "unexpected dimension: {:}".format(tdim)

# Test the dimensions shapes for expected (lack of) differences
for tdim in test_dims:
if tdim in ref_dims:
if tdim in exclude_dims:
assert xdata[tdim].shape != self.out[0][tdim].shape
else:
assert xdata[tdim].shape == self.out[0][tdim].shape

if xdata[tdim].shape != self.data_list[
i + 1][tdim].shape:
# This data set is smaller, test for fill values
for dvar in xdata.data_vars.keys():
if tdim in xdata[dvar].dims:
if dvar in self.meta:
fill_val = self.meta[
dvar, self.meta.labels.fill_val]
else:
fill_val = default_fill_val

try:
if np.isnan(fill_val):
assert np.isnan(
xdata[dvar].values).any()
else:
assert np.any(xdata[dvar].values
== fill_val)
except TypeError:
# This is a string or object
estr = "".join([
"Bad or missing fill values for ",
dvar, ": ({:} not in {:})".format(
fill_val, xdata[dvar].values)])
if fill_val is None:
assert fill_val in xdata[
dvar].values, estr
else:
assert np.any(xdata[dvar].values
== fill_val), estr

return

@pytest.mark.parametrize('dims_equal', [True, False])
@pytest.mark.parametrize('exclude_dims', [None, ['time']])
def test_expand_xarray_dims(self, dims_equal, exclude_dims):
"""Test successful padding of dimensions for xarray data.
Parameters
----------
dims_equal : bool
If True, the dimension variables for the data sets should be the
same; if False they should have different dimensions apart from
the 'time' dimension
exclude_dims : list-like or NoneType
A list of dimensions that have the same name but may have different
values, or None if all dimensions with the same name should have the
same shape. (default=None)
"""

# Set the input parameters
self.set_data_meta(dims_equal)

# Run the dimension expansion
self.out = coords.expand_xarray_dims(self.data_list, self.meta,
dims_equal=dims_equal,
exclude_dims=exclude_dims)

# Test the results
self.eval_dims(dims_equal, exclude_dims)

return

@pytest.mark.parametrize('new_data_type', [int, float, str, bool, None])
def test_missing_meta(self, new_data_type):
"""Test success if variable is missing from meta.
Parameters
----------
new_data_type : type
Data type for the new data that will be missing from `self.meta`
"""

# Set the input parameters
self.set_data_meta(True)

# Add a data variable to one of the data sets
self.data_list[1]['new_variable'] = self.data_list[1]['mlt'].astype(
new_data_type)

# Run the dimension expansion
self.out = coords.expand_xarray_dims(self.data_list, self.meta,
dims_equal=True)

# Test the results
fill_val = self.meta.labels.default_values_from_type(
self.meta.labels.label_type['fill_val'], new_data_type)
self.eval_dims(True, default_fill_val=fill_val)

return
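As `test_missing_meta` implies, a variable absent from the `Meta` object is padded using a default fill value inferred from its data type. A hedged example of looking that default up, mirroring the call in the test above:

```python
import pysat

meta = pysat.Meta()

# Default fill value applied when padding a float variable not in meta
fill_val = meta.labels.default_values_from_type(
    meta.labels.label_type['fill_val'], float)  # expected: NaN for floats
```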