diff --git a/cf_pandas/accessor.py b/cf_pandas/accessor.py index 6b9679f..2b2fcac 100644 --- a/cf_pandas/accessor.py +++ b/cf_pandas/accessor.py @@ -3,6 +3,7 @@ """ import itertools +from collections import ChainMap from typing import ( Any, Callable, @@ -25,9 +26,14 @@ import cf_pandas as cfp -from .criteria import coordinate_criteria +from .criteria import coordinate_criteria, guess_regex from .options import OPTIONS -from .utils import always_iterable, match_criteria_key, set_up_criteria +from .utils import ( + _is_datetime_like, + always_iterable, + match_criteria_key, + set_up_criteria, +) from .vocab import Vocab #: `axis` names understood by cf_xarray @@ -195,7 +201,6 @@ def axes(self) -> Dict[str, List[str]]: """ # vardict = {key: self.__getitem__(key) for key in _AXIS_NAMES} vardict = {key: _get_all(self._obj, key) for key in _AXIS_NAMES} - return {k: sorted(v) for k, v in vardict.items() if v} @property @@ -275,16 +280,20 @@ def standard_names(self): def _get_axis_coord(obj: Union[DataFrame, Series], key: str) -> list: """ - Translate from axis or coord name to variable name + Translate from axis or coord name to variable name. After matching based on coordinate_criteria, + if there are no matches for key, then guess_regex is used to search for matches. + Parameters ---------- obj : DataArray, Dataset DataArray belonging to the coordinate to be checked key : str, ["X", "Y", "Z", "T", "longitude", "latitude", "vertical", "time"] key to check for. + Returns ------- List[str], Variable name(s) in parent xarray object that matches axis or coordinate `key` + Notes ----- This functions checks for the following attributes in order @@ -292,6 +301,7 @@ def _get_axis_coord(obj: Union[DataFrame, Series], key: str) -> list: - `_CoordinateAxisType` (from THREDDS) - `axis` (CF option) - `positive` (CF standard for non-pressure vertical coordinate) + References ---------- MetPy's parse_cf @@ -340,6 +350,18 @@ def _get_axis_coord(obj: Union[DataFrame, Series], key: str) -> list: # units = getattr(col.data, "units", None) # if units in expected: # results.update((col,)) + + # also use the guess_regex approach by default, but only if no results so far + # this takes the logic from cf-xarray guess_coord_axis + if len(results) == 0: + if obj[col].ndim == 1 and _is_datetime_like(obj[col]): + results.update((col,)) + continue # prevent second detection + + pattern = guess_regex[key] + if pattern.match(col.lower()): + results.update((col,)) + return list(results) diff --git a/cf_pandas/utils.py b/cf_pandas/utils.py index 0cbb039..5f637c5 100644 --- a/cf_pandas/utils.py +++ b/cf_pandas/utils.py @@ -5,8 +5,10 @@ from collections import ChainMap from typing import Any, Iterable, Optional, Union +import numpy as np import pandas as pd import regex +from pandas import Series from .options import OPTIONS @@ -152,3 +154,12 @@ def standard_names(): standard_names = [entry.get("id") for entry in soup.find_all("entry")] return standard_names + + +def _is_datetime_like(da: Series) -> bool: + if np.issubdtype(da.dtype, np.datetime64) or np.issubdtype( + da.dtype, np.timedelta64 + ): + return True + + return False diff --git a/tests/test_accessor.py b/tests/test_accessor.py index e594dac..6ce4f67 100644 --- a/tests/test_accessor.py +++ b/tests/test_accessor.py @@ -121,3 +121,10 @@ def test_set_item(): assert all(df.cf["temp"].values == np.arange(8)) df.cf["longitude"] = np.arange(8) assert all(df.cf["longitude"].values == np.arange(8)) + + +def test_get_by_guess_regex(): + df = pd.DataFrame(columns=["lon", "lat", "min"]) + assert df.cf["longitude"].name == "lon" + assert df.cf["latitude"].name == "lat" + assert df.cf["time"].name == "min" diff --git a/tests/test_utils.py b/tests/test_utils.py index 3b63842..ec92e37 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -2,6 +2,7 @@ from unittest import mock +import pandas as pd import requests import cf_pandas as cfp @@ -49,3 +50,14 @@ def test_standard_names(mock_requests): mock_requests.return_value = resp names = cfp.standard_names() assert "wind_speed" in names + + +def test__is_datetime_like(): + df = pd.DataFrame() + df["time"] = pd.date_range(start="2001-1-1", end="2001-1-5", freq="1D") + assert cfp.utils._is_datetime_like(df["time"]) + + df = pd.DataFrame() + df["time"] = ["2001-1-1", "2001-1-2", "2001-1-3"] + assert not cfp.utils._is_datetime_like(df["time"]) + assert cfp.utils._is_datetime_like(pd.to_datetime(df["time"]))