-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #4 from danielfromearth/issue-3
rename duplicate dimensions and refactor attribute handling
- Loading branch information
Showing
7 changed files
with
253 additions
and
100 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,4 @@ | ||
"""Convenience variables used across the package.""" | ||
|
||
# Stands in for "/" when a group path is flattened into a single name.
GROUP_DELIM = "__"
# Default separator between the entries of a "coordinates" attribute.
COORD_DELIM = " "
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,87 @@ | ||
""" | ||
attribute_handling.py | ||
Functions for converting "coordinates" in netCDF variable attributes | ||
between paths that reference a group hierarchy and flattened paths. | ||
""" | ||
import re | ||
|
||
import netCDF4 | ||
|
||
from concatenator import COORD_DELIM, GROUP_DELIM | ||
|
||
|
||
def regroup_coordinate_attribute(attribute_string: str) -> str:
    """Convert flattened coordinate paths back into "/"-separated group paths.

    Every whitespace-separated entry has each ``GROUP_DELIM`` replaced by "/".
    The leading "/" produced by a root-level ``GROUP_DELIM`` prefix is dropped,
    so the result is relative to the root group.

    Examples
    --------
    >>> coord_att = "__Time_and_Position__time __Time_and_Position__instrument_fov_latitude __Time_and_Position__instrument_fov_longitude"
    >>> regroup_coordinate_attribute(coord_att)
    'Time_and_Position/time Time_and_Position/instrument_fov_latitude Time_and_Position/instrument_fov_longitude'

    Parameters
    ----------
    attribute_string : str
        Value of a "coordinates" attribute that uses flattened paths.

    Returns
    -------
    str
        The same entries expressed as group paths. An input without any
        whitespace (a single entry, or an empty string) is handled as well.
    """
    # Use the separator that's in the attribute string only if all separators in the string are the same.
    # Otherwise (mixed separators, or no separator at all), fall back to our own default separator.
    whitespaces = re.findall(r'\s+', attribute_string)
    new_sep = whitespaces[0] if len(set(whitespaces)) == 1 else COORD_DELIM

    regrouped = []
    for flat_path in attribute_string.split():  # split on any whitespace
        group_path = flat_path.replace(GROUP_DELIM, '/')
        # Only strip the root-level marker when it is really there; slicing
        # unconditionally would eat the first character of a bare name.
        if group_path.startswith('/'):
            group_path = group_path[1:]
        regrouped.append(group_path)

    return new_sep.join(regrouped)
|
||
|
||
def flatten_coordinate_attribute_paths(dataset: netCDF4.Dataset,
                                       var: netCDF4.Variable,
                                       variable_name: str) -> None:
    """Flatten the paths listed in a variable's "coordinates" attribute.

    Parameters
    ----------
    dataset : netCDF4.Dataset
        Dataset whose variable named `variable_name` receives the rewritten attribute.
    var : netCDF4.Variable
        Variable from which the current "coordinates" attribute is read.
    variable_name : str
        Key into ``dataset.variables`` identifying the variable to update.

    Returns
    -------
    None
        Nothing happens when `var` has no "coordinates" attribute.
    """
    if 'coordinates' not in var.ncattrs():
        return

    flattened = _flatten_coordinate_attribute(var.getncattr('coordinates'))

    # The attribute is written through the dataset lookup, not through `var`.
    target = dataset.variables[variable_name]
    target.setncattr('coordinates', flattened)
|
||
|
||
def _flatten_coordinate_attribute(attribute_string: str) -> str:
    """Converts attributes that specify group membership via "/" to use new group delimiter, even for the root level.

    Each whitespace-separated entry gets a leading ``GROUP_DELIM`` and has every
    "/" replaced by ``GROUP_DELIM``.

    Examples
    --------
    >>> coord_att = "Time_and_Position/time Time_and_Position/instrument_fov_latitude Time_and_Position/instrument_fov_longitude"
    >>> _flatten_coordinate_attribute(coord_att)
    '__Time_and_Position__time __Time_and_Position__instrument_fov_latitude __Time_and_Position__instrument_fov_longitude'

    Parameters
    ----------
    attribute_string : str
        Value of a "coordinates" attribute that uses "/"-separated group paths.

    Returns
    -------
    str
        The same entries with flattened names. An input without any whitespace
        (a single entry, or an empty string) is handled as well.
    """
    # Use the separator that's in the attribute string only if all separators in the string are the same.
    # Otherwise (mixed separators, or no separator at all), fall back to our own default separator.
    whitespaces = re.findall(r'\s+', attribute_string)
    new_sep = whitespaces[0] if len(set(whitespaces)) == 1 else COORD_DELIM

    # A new string is constructed.
    return new_sep.join(
        f'{GROUP_DELIM}{c.replace("/", GROUP_DELIM)}'
        for c
        in attribute_string.split()  # split on any whitespace
    )
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,110 @@ | ||
""" | ||
dimension_cleanup.py | ||
Functions for renaming duplicated dimension names for netCDF variables, so that xarray can handle the dataset. | ||
""" | ||
import collections | ||
|
||
import netCDF4 as nc | ||
|
||
|
||
def remove_duplicate_dims(nc_dataset: nc.Dataset) -> nc.Dataset:
    """Rename repeated dimensions so that no variable uses the same dimension twice.

    xarray cannot read netCDF4 datasets with duplicate dimensions.
    For every variable whose dimension tuple contains a repeated name, a copy of
    that dimension is created under a new name (``<dim>_1``), the variable is
    deleted, and it is re-created under its original name with the last
    occurrence of the repeated dimension swapped for the copy.

    Notes
    -----
    Requires the dataset to be 'flat', i.e., with no groups and every variable at the root-level.

    Only the first repeated dimension of a variable is handled, and only one of
    its occurrences is renamed; a dimension used three or more times, or several
    different repeated dimensions, are not yet supported.

    Parameters
    ----------
    nc_dataset : netCDF4.Dataset
        Writable dataset that is modified in place.

    Returns
    -------
    netCDF4.Dataset
        The same `nc_dataset` object.
    """
    # Collect the affected variables first: variables are deleted and created
    # below, so the mapping must not be iterated while it is being changed.
    dup_vars = {
        var_name: var
        for var_name, var in nc_dataset.variables.items()
        if len(set(var.dimensions)) != len(var.dimensions)
    }

    for dup_var_name, dup_var in dup_vars.items():
        dim_list = list(dup_var.dimensions)  # original dimensions of the variable with duplicated dims

        # Dimension(s) that are duplicated are retrieved.
        # Note: this is not yet tested for more than one duplicated dimension.
        dim_dup = [item for item, count in collections.Counter(dim_list).items() if count > 1][0]
        dim_dup_length = dup_var.shape[dup_var.dimensions.index(dim_dup)]  # length of the duplicated dimension

        # New dimension name is created.
        dim_dup_new = dim_dup + '_1'

        # The last occurrence of the duplicated dimension is replaced with the new name.
        # (Replacing the final dimension unconditionally would be wrong whenever the
        # duplicate is not in the last position, e.g. for dimensions (x, x, y).)
        last_dup_index = len(dim_list) - 1 - dim_list[::-1].index(dim_dup)
        new_dim_list = list(dim_list)
        new_dim_list[last_dup_index] = dim_dup_new

        # Attributes for the original variable are retrieved.
        # NOTE(review): the helper is asked to rename `dup_var_name` -> `dim_dup_new` inside
        # "coordinates"; confirm that this, rather than `dim_dup` -> `dim_dup_new`, is intended.
        attrs_contents = get_attributes_minus_fillvalue_and_renamed_coords(original_var_name=dup_var_name,
                                                                           new_var_name=dim_dup_new,
                                                                           original_dataset=nc_dataset)

        # Not every variable defines _FillValue; None lets createVariable use its default.
        if '_FillValue' in dup_var.ncattrs():
            fill_value = dup_var.getncattr('_FillValue')
        else:
            fill_value = None

        # Only create a new *Dimension* if it doesn't already exist.
        if dim_dup_new not in nc_dataset.dimensions:

            # New dimension is created by copying from the duplicated dimension.
            nc_dataset.createDimension(dim_dup_new, dim_dup_length)

            # Only create a new dimension *Variable* if it existed originally in the NetCDF structure.
            if dim_dup in nc_dataset.variables:

                # New variable object is created for the renamed, previously duplicated dimension.
                # NOTE(review): this reuses the fill value of the variable being fixed, not the
                # dimension variable's own fill value - confirm that is intended.
                new_dim_var = nc_dataset.createVariable(dim_dup_new, nc_dataset.variables[dim_dup].dtype,
                                                        (dim_dup_new,), fill_value=fill_value)
                dim_var_attr_contents = get_attributes_minus_fillvalue_and_renamed_coords(
                    original_var_name=dim_dup,
                    new_var_name=dim_dup_new,
                    original_dataset=nc_dataset)
                for attr_name, contents in dim_var_attr_contents.items():
                    new_dim_var.setncattr(attr_name, contents)

                new_dim_var[:] = nc_dataset.variables[dim_dup][:]

        # Delete existing Variable
        del nc_dataset.variables[dup_var_name]

        # Replace original *Variable* with new variable with no duplicated dimensions.
        # `dup_var` still references the old variable object, so its data remains readable here.
        replacement_var = nc_dataset.createVariable(dup_var_name, str(dup_var[:].dtype),
                                                    tuple(new_dim_list), fill_value=fill_value)
        for attr_name, contents in attrs_contents.items():
            replacement_var.setncattr(attr_name, contents)
        replacement_var[:] = dup_var[:]

    return nc_dataset
|
||
|
||
def get_attributes_minus_fillvalue_and_renamed_coords(original_var_name: str,
                                                      new_var_name: str,
                                                      original_dataset: nc.Dataset) -> dict:
    """Collect a variable's attributes, skipping ``_FillValue`` and renaming within "coordinates".

    Parameters
    ----------
    original_var_name : str
        Name of the variable in `original_dataset` whose attributes are read.
    new_var_name : str
        Name that replaces `original_var_name` wherever it occurs in the
        "coordinates" attribute.
    original_dataset : netCDF4.Dataset
        Dataset containing the variable.

    Returns
    -------
    dict
        Mapping of attribute name to attribute value. ``_FillValue`` is left out.
    """
    original_var = original_dataset.variables[original_var_name]
    attrs_contents = {}

    for ncattr in original_var.ncattrs():
        if ncattr == '_FillValue':
            continue

        contents = original_var.getncattr(ncattr)
        if ncattr == 'coordinates':
            # str.replace returns a new string, so the result has to be kept;
            # calling it without assignment leaves the attribute unchanged.
            contents = contents.replace(original_var_name, new_var_name)
        attrs_contents[ncattr] = contents

    return attrs_contents
Oops, something went wrong.