Merge pull request pysat#1123 from pysat/expand_dims
Expand xarray dims when concatenating data
aburrell authored May 31, 2023
2 parents 0d66579 + 1322c4d commit 6a9ecc7
Showing 6 changed files with 344 additions and 17 deletions.
2 changes: 2 additions & 0 deletions CHANGELOG.md
@@ -6,6 +6,8 @@ This project adheres to [Semantic Versioning](https://semver.org/).
[3.1.0] - 2023-06-02
--------------------
* New Features
* Added a utility to ensure two xarray Datasets can be concatenated, and
incorporated this utility into the Instrument file loading
* Added unit tests for different file cadences in the Instrument class
* Added `to_inst` method to the Constellation class
* Added `export_pysat_info` kwarg to `to_netcdf` routines to select whether
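The new utility in action, as a rough sketch assembled from the tests added later in this diff (the test instruments and `coords.expand_xarray_dims` are names this PR introduces or uses; exact behavior should be checked against `pysat.utils.coords`):

```python
import datetime as dt

import pysat
from pysat.utils import coords

# Load two test Datasets whose dimensions differ apart from 'time',
# mirroring TestExpandXarrayDims.set_data_meta below
start = pysat.instruments.pysat_ndtesting._test_dates['']['']

inst = pysat.Instrument(inst_module=pysat.instruments.pysat_ndtesting,
                        use_header=True)
inst.load(date=start)

model = pysat.Instrument(inst_module=pysat.instruments.pysat_testmodel,
                         use_header=True)
model.load(date=start + dt.timedelta(days=1))

# Pad the Datasets so that dimensions shared by name agree in shape
out_list = coords.expand_xarray_dims([inst.data, model.data], inst.meta,
                                     dims_equal=False)
```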
15 changes: 15 additions & 0 deletions pysat/_instrument.py
@@ -2389,9 +2389,24 @@ def concat_data(self, new_data, prepend=False, **kwargs):
kwargs['sort'] = False
concat_func = pds.concat
else:
# Ensure the dimensions are equal
equal_dims = True
idat = 0
while idat < len(new_data) - 1 and equal_dims:
if new_data[idat].dims != new_data[idat + 1].dims:
equal_dims = False
idat += 1

if not equal_dims:
# Update the dimensions, padding data where necessary
new_data = pysat.utils.coords.expand_xarray_dims(
new_data, self.meta, exclude_dims=['time'])

# Specify the dimension, if not otherwise specified
if 'dim' not in kwargs:
kwargs['dim'] = self.index.name

# Set the concat function
concat_func = xr.concat

# Assign the concatenated data to the instrument
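The guard added to `concat_data` can be summarized outside the class as a minimal sketch (a standalone illustration of the logic above, not the method itself; `concat_with_padding`, `datasets`, and `meta` are hypothetical names):

```python
import xarray as xr

import pysat


def concat_with_padding(datasets, meta, dim='time'):
    """Concatenate xarray Datasets, padding mismatched dimensions first."""
    # Same comparison as the while-loop above, which exits on the
    # first mismatch between consecutive Datasets
    equal_dims = all(datasets[i].dims == datasets[i + 1].dims
                     for i in range(len(datasets) - 1))

    if not equal_dims:
        # Update the dimensions, padding data where necessary
        datasets = pysat.utils.coords.expand_xarray_dims(
            datasets, meta, exclude_dims=[dim])

    return xr.concat(datasets, dim=dim)
```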
38 changes: 28 additions & 10 deletions pysat/_meta.py
@@ -428,7 +428,8 @@ def __setitem__(self, data_vars, input_dat):
# If this is a disagreement between byte data
# and an expected str, resolve it here
if(isinstance(to_be_set, bytes)
and self.labels.label_type[iattr] == str):
and str in pysat.utils.listify(
self.labels.label_type[iattr])):
to_be_set = core_utils.stringify(to_be_set)
else:
# This type is incorrect, try casting it
@@ -440,16 +441,18 @@ def __setitem__(self, data_vars, input_dat):
iattr])])
try:
if hasattr(to_be_set, '__iter__'):
if self.labels.label_type[
iattr] == str:
if str in pysat.utils.listify(
self.labels.label_type[
iattr]):
to_be_set = '\n\n'.join(
[str(tval) for tval in
to_be_set])
else:
raise TypeError("can't recast")
else:
to_be_set = self.labels.label_type[
iattr](to_be_set)
to_be_set = pysat.utils.listify(
self.labels.label_type[
iattr])[0](to_be_set)

# Inform user data was recast
pysat.logger.info(''.join((
@@ -838,17 +841,22 @@ def _insert_default_values(self, data_var, data_type=None):
var_types = pysat.utils.listify(data_type)

for i, var in enumerate(data_vars):
if name_idx is not None:
default_vals[name_idx] = var

# Use the label defaults if this variable doesn't need to consider
# the data type
if not np.any(list(need_data_type.values())):
self._data.loc[var, labels] = default_vals
data_default = list(default_vals)
else:
data_default = [
self.labels.default_values_from_attr(
lattrs[j], var_types[i]) if need_data_type[lattrs[j]]
else val for j, val in enumerate(default_vals)]
self._data.loc[var, labels] = data_default

# The default value for the name must be set after to be consistent
if name_idx is not None:
data_default[name_idx] = var

# Update the meta data to the desired defaults
self._data.loc[var, labels] = data_default

return

@@ -1853,6 +1861,10 @@ def _update_label_types(self):
elif self.label_type[lkey] == int:
self.label_type[lkey] = (int, np.int64, np.int32, np.int16,
np.int8, bool)
elif self.label_type[lkey] == str:
self.label_type[lkey] = (str, np.str_)
elif self.label_type[lkey] == bool:
self.label_type[lkey] = (bool, np.bool_)
elif isinstance(self.label_type[lkey], tuple):
ltypes = list(self.label_type[lkey])

@@ -1862,6 +1874,12 @@ def _update_label_types(self):
if int in ltypes:
ltypes.extend([np.int64, np.int32, np.int16, np.int8, bool])

if str in ltypes:
ltypes.append(np.str_)

if bool in ltypes:
ltypes.append(np.bool_)

# This may result in duplicate numpy types, but order is more
# important than carrying around a duplicate type, as the first
# type in the provided tuple is the default type
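The practical upshot of the `_update_label_types` change is that scalar label types now also match their numpy counterparts, and the `listify`-based checks above treat single types and tuples uniformly. A small sketch of that pattern (hypothetical values; `listify` is the existing `pysat.utils.listify`):

```python
import numpy as np

import pysat

# After _update_label_types, a plain `str` label type becomes (str, np.str_)
label_type = (str, np.str_)

# Membership tests work whether the label type is one type or a tuple
assert str in pysat.utils.listify(label_type)
assert isinstance(np.str_('F10.7'), label_type)

# Casting uses the first entry in the tuple, since it is the default type
cast_val = pysat.utils.listify(label_type)[0](42)  # yields '42'
```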
15 changes: 8 additions & 7 deletions pysat/tests/test_meta.py
@@ -411,8 +411,9 @@ def test_set_meta_with_wrong_type_cast(self, bad_val, caplog):

# Test the warning
captured = caplog.text
assert captured.find('Metadata with type') >= 0
assert captured.find('Recasting input') >= 0
estr = "missing expected message in: {:}".format(captured)
assert captured.find('Metadata with type') >= 0, estr
assert captured.find('Recasting input') >= 0, estr

# Check that meta is set
if hasattr(bad_val, "__iter__"):
@@ -738,8 +739,8 @@ def test_meta_assignment(self, custom_attr, assign_type):
self.dval = 'test_meta_dict_assignment'
self.default_val = {
getattr(self.meta.labels, mattr): ' '.join(['test', mattr])
if self.meta.labels.label_type[mattr] == str else -47
for mattr in self.meta.labels.label_type.keys()}
if str in pysat.utils.listify(self.meta.labels.label_type[mattr])
else -47 for mattr in self.meta.labels.label_type.keys()}
self.default_name = []
self.default_nan = []

@@ -771,9 +772,9 @@ def test_multiple_meta_assignment(self, custom_attr, assign_type):
dvals = ['mult1', 'mult2']
default_vals = {
getattr(self.meta.labels, mattr): [
' '.join(['test', mattr, self.dval])
if self.meta.labels.label_type[mattr] == str else -47
for self.dval in dvals]
' '.join(['test', mattr, self.dval]) if str
in pysat.utils.listify(self.meta.labels.label_type[mattr])
else -47 for self.dval in dvals]
for mattr in self.meta.labels.label_type.keys()}
self.default_name = []
self.default_nan = []
191 changes: 191 additions & 0 deletions pysat/tests/test_utils_coords.py
@@ -359,3 +359,194 @@ def test_establish_common_coord_single_val_only(self):
assert self.short_coord[0] == out[0], "unexpected value"
assert len(out) == 1, "unexpected coordinate length"
return


class TestExpandXarrayDims(object):
"""Unit tests for the `expand_xarray_dims` function."""

def setup_method(self):
"""Set up the unit test environment."""
self.test_inst = pysat.Instrument(
inst_module=pysat.instruments.pysat_ndtesting, use_header=True)
self.start_time = pysat.instruments.pysat_ndtesting._test_dates['']['']
self.data_list = []
self.out = None
self.meta = None
return

def teardown_method(self):
"""Clean up the unit test environment."""
del self.test_inst, self.start_time, self.data_list, self.meta, self.out
return

def set_data_meta(self, dims_equal):
"""Set the input data list and meta data.
Parameters
----------
dims_equal : bool
If True, the dimension variables for the data sets should be the
same; if False they should have different dimensions apart from
the 'time' dimension
"""

self.test_inst.load(date=self.start_time)
self.data_list.append(self.test_inst.data)
self.meta = self.test_inst.meta

# The second data set should have half the time samples
num_samples = int(self.test_inst.index.shape[0] / 2)

if dims_equal:
# Load a second data set with half the time samples
self.test_inst = pysat.Instrument(
inst_module=self.test_inst.inst_module,
num_samples=num_samples, use_header=True)
else:
# Load a second data set with different dimensions apart from time
self.test_inst = pysat.Instrument(
inst_module=pysat.instruments.pysat_testmodel,
num_samples=num_samples, use_header=True)

self.test_inst.load(date=self.start_time + dt.timedelta(days=1))
self.data_list.append(self.test_inst.data)

return

def eval_dims(self, dims_equal, exclude_dims=None, default_fill_val=None):
"""Set the input data list and meta data.
Parameters
----------
dims_equal : bool
If True, the dimension variables for the data sets should be the
same; if False they should have different dimensions apart from
the 'time' dimension
exclude_dims : list-like or NoneType
A list of dimensions that have the same name but may have different
values, or None if all dimensions with the same name should have the
same shape. (default=None)
default_fill_val : any
The expected fill value for data variables not present in self.meta
(default=None)
"""
if exclude_dims is None:
exclude_dims = []

# Define the reference Dataset
ref_dims = list(self.out[0].dims.keys())

# Cycle through the remaining Datasets
for i, xdata in enumerate(self.out[1:]):
test_dims = list(xdata.dims.keys())

# Test that the expected dimension names overlap between datasets
if dims_equal:
testing.assert_lists_equal(test_dims, ref_dims)
else:
for tdim in test_dims:
assert (tdim == 'time' if tdim in ref_dims else tdim
!= 'time'), "unexpected dimension: {:}".format(tdim)

# Test the dimensions shapes for expected (lack of) differences
for tdim in test_dims:
if tdim in ref_dims:
if tdim in exclude_dims:
assert xdata[tdim].shape != self.out[0][tdim].shape
else:
assert xdata[tdim].shape == self.out[0][tdim].shape

if xdata[tdim].shape != self.data_list[
i + 1][tdim].shape:
# This data set is smaller, test for fill values
for dvar in xdata.data_vars.keys():
if tdim in xdata[dvar].dims:
if dvar in self.meta:
fill_val = self.meta[
dvar, self.meta.labels.fill_val]
else:
fill_val = default_fill_val

try:
if np.isnan(fill_val):
assert np.isnan(
xdata[dvar].values).any()
else:
assert np.any(xdata[dvar].values
== fill_val)
except TypeError:
# This is a string or object
estr = "".join([
"Bad or missing fill values for ",
dvar, ": ({:} not in {:})".format(
fill_val, xdata[dvar].values)])
if fill_val is None:
assert fill_val in xdata[
dvar].values, estr
else:
assert np.any(xdata[dvar].values
== fill_val), estr

return

@pytest.mark.parametrize('dims_equal', [True, False])
@pytest.mark.parametrize('exclude_dims', [None, ['time']])
def test_expand_xarray_dims(self, dims_equal, exclude_dims):
"""Test successful padding of dimensions for xarray data.
Parameters
----------
dims_equal : bool
If True, the dimension variables for the data sets should be the
same; if False they should have different dimensions apart from
the 'time' dimension
exclude_dims : list-like or NoneType
A list of dimensions that have the same name but may have different
values, or None if all dimensions with the same name should have the
same shape. (default=None)
"""

# Set the input parameters
self.set_data_meta(dims_equal)

# Run the dimension expansion
self.out = coords.expand_xarray_dims(self.data_list, self.meta,
dims_equal=dims_equal,
exclude_dims=exclude_dims)

# Test the results
self.eval_dims(dims_equal, exclude_dims)

return

@pytest.mark.parametrize('new_data_type', [int, float, str, bool, None])
def test_missing_meta(self, new_data_type):
"""Test success if variable is missing from meta.
Parameters
----------
new_data_type : type
Data type for the new data that will be missing from `self.meta`
"""

# Set the input parameters
self.set_data_meta(True)

# Add a data variable to one of the data sets
self.data_list[1]['new_variable'] = self.data_list[1]['mlt'].astype(
new_data_type)

# Run the dimension expansion
self.out = coords.expand_xarray_dims(self.data_list, self.meta,
dims_equal=True)

# Test the results
fill_val = self.meta.labels.default_values_from_type(
self.meta.labels.label_type['fill_val'], new_data_type)
self.eval_dims(True, default_fill_val=fill_val)

return
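As `test_missing_meta` implies, a variable absent from the `Meta` object is padded using a default fill value inferred from its data type. A hedged example of looking that default up, mirroring the call in the test above:

```python
import pysat

meta = pysat.Meta()

# Default fill value applied when padding a float variable not in meta
fill_val = meta.labels.default_values_from_type(
    meta.labels.label_type['fill_val'], float)  # expected: NaN for floats
```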