From cb5434761a1c53d94e9ddcdfbaad5e2964d4053d Mon Sep 17 00:00:00 2001 From: Sean McGuire Date: Mon, 26 Aug 2024 19:16:30 -0400 Subject: [PATCH 1/6] add dropna function --- src/lsdb/catalog/catalog.py | 20 ++++++ src/lsdb/catalog/dataset/healpix_dataset.py | 71 +++++++++++++++++++++ 2 files changed, 91 insertions(+) diff --git a/src/lsdb/catalog/catalog.py b/src/lsdb/catalog/catalog.py index 5c00e6bd..2e3e1d28 100644 --- a/src/lsdb/catalog/catalog.py +++ b/src/lsdb/catalog/catalog.py @@ -9,6 +9,9 @@ import pandas as pd from hipscat.catalog.index.index_catalog import IndexCatalog as HCIndexCatalog from hipscat.pixel_math.polygon_filter import SphericalCoordinates +from pandas._libs import lib +from pandas._typing import Axis, AnyAll, IndexLabel +from pandas.api.extensions import no_default from lsdb.catalog.association_catalog import AssociationCatalog from lsdb.catalog.dataset.healpix_dataset import HealpixDataset @@ -506,3 +509,20 @@ def join_nested( ) hc_catalog = hc.catalog.Catalog(new_catalog_info, alignment.pixel_tree) return Catalog(ddf, ddf_map, hc_catalog) + + def dropna( + self, + *, + axis: Axis = 0, + how: AnyAll | lib.NoDefault = no_default, + thresh: int | lib.NoDefault = no_default, + on_nested: bool = False, + subset: IndexLabel | None = None, + ignore_index: bool = False, + ) -> Catalog: + catalog = super().dropna( + axis=axis, how=how, thresh=thresh, on_nested=on_nested, subset=subset, ignore_index=ignore_index + ) + if self.margin is not None: + catalog.margin = self.margin.dropna() + return catalog diff --git a/src/lsdb/catalog/dataset/healpix_dataset.py b/src/lsdb/catalog/dataset/healpix_dataset.py index 56f7b603..0761f033 100644 --- a/src/lsdb/catalog/dataset/healpix_dataset.py +++ b/src/lsdb/catalog/dataset/healpix_dataset.py @@ -17,6 +17,9 @@ from hipscat.inspection.visualize_catalog import get_projection_method from hipscat.pixel_math import HealpixPixel from hipscat.pixel_math.healpix_pixel_function import get_pixel_argsort +from 
pandas._libs import lib +from pandas._typing import Axis, AnyAll, IndexLabel +from pandas.api.extensions import no_default from typing_extensions import Self from lsdb import io @@ -424,3 +427,71 @@ def to_hipscat( **kwargs: Arguments to pass to the parquet write operations """ io.to_hipscat(self, base_catalog_path, catalog_name, overwrite, storage_options, **kwargs) + + def dropna( + self, + *, + axis: Axis = 0, + how: AnyAll | lib.NoDefault = no_default, + thresh: int | lib.NoDefault = no_default, + on_nested: bool = False, + subset: IndexLabel | None = None, + ignore_index: bool = False, + ) -> Self: # type: ignore[name-defined] # noqa: F821: + """ + Remove missing values for one layer of nested columns in the catalog. + + Parameters + ---------- + axis : {0 or 'index', 1 or 'columns'}, default 0 + Determine if rows or columns which contain missing values are + removed. + + * 0, or 'index' : Drop rows which contain missing values. + * 1, or 'columns' : Drop columns which contain missing values. + + Only a single axis is allowed. + + how : {'any', 'all'}, default 'any' + Determine if row or column is removed from catalog, when we have + at least one NA or all NA. + + * 'any' : If any NA values are present, drop that row or column. + * 'all' : If all values are NA, drop that row or column. + thresh : int, optional + Require that many non-NA values. Cannot be combined with how. + on_nested : str or bool, optional + If not False, applies the call to the nested dataframe in the + column with label equal to the provided string. If specified, + the nested dataframe should align with any columns given in + `subset`. + subset : column label or sequence of labels, optional + Labels along other axis to consider, e.g. if you are dropping rows + these would be a list of columns to include. + + Access nested columns using `nested_df.nested_col` (where + `nested_df` refers to a particular nested dataframe and + `nested_col` is a column of that nested dataframe). 
+ ignore_index : bool, default ``False`` + If ``True``, the resulting axis will be labeled 0, 1, …, n - 1. + + .. versionadded:: 2.0.0 + + Returns + ------- + Catalog + Catalog with NA entries dropped from it. + + Notes + ----- + Operations that target a particular nested structure return a dataframe + with rows of that particular nested structure affected. + + Values for `on_nested` and `subset` should be consistent in pointing + to a single layer; multi-layer operations are not supported at this + time. + """ + ndf = self._ddf.dropna( + axis=axis, how=how, thresh=thresh, on_nested=on_nested, subset=subset, ignore_index=ignore_index + ) + return self.__class__(ndf, self._ddf_pixel_map, self.hc_structure) From 62205f67da7b2001a979315afdfaefc3ee83cd08 Mon Sep 17 00:00:00 2001 From: Sean McGuire Date: Mon, 26 Aug 2024 19:32:46 -0400 Subject: [PATCH 2/6] add dropna unit test --- tests/conftest.py | 7 +++++++ tests/lsdb/catalog/test_nested.py | 14 ++++++++++++++ 2 files changed, 21 insertions(+) create mode 100644 tests/lsdb/catalog/test_nested.py diff --git a/tests/conftest.py b/tests/conftest.py index 8a002b2c..06240c7c 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -201,6 +201,13 @@ def small_sky_order3_source_margin_catalog(test_data_dir): return lsdb.read_hipscat(test_data_dir / SMALL_SKY_ORDER3_SOURCE_MARGIN_NAME) +@pytest.fixture +def small_sky_with_nested_sources(small_sky_order1_catalog, small_sky_order1_source_with_margin): + return small_sky_order1_catalog.join_nested( + small_sky_order1_source_with_margin, left_on="id", right_on="object_id", nested_column_name="sources" + ) + + @pytest.fixture def small_sky_no_metadata_dir(test_data_dir): return test_data_dir / "raw" / SMALL_SKY_NO_METADATA diff --git a/tests/lsdb/catalog/test_nested.py b/tests/lsdb/catalog/test_nested.py new file mode 100644 index 00000000..c1e6237f --- /dev/null +++ b/tests/lsdb/catalog/test_nested.py @@ -0,0 +1,14 @@ +import pandas as pd +import nested_dask as nd +from lsdb 
import Catalog + + +def test_dropna(small_sky_with_nested_sources): + filtered_cat = small_sky_with_nested_sources.query("sources.mag < 15.1") + drop_na_cat = filtered_cat.dropna() + assert isinstance(drop_na_cat, Catalog) + assert isinstance(drop_na_cat._ddf, nd.NestedFrame) + drop_na_compute = drop_na_cat.compute() + filtered_compute = filtered_cat.compute() + assert len(drop_na_compute) < len(filtered_compute) + pd.testing.assert_frame_equal(drop_na_compute, filtered_compute.dropna()) From 77b485678f50fa46aa6249a8740ccc049188e098 Mon Sep 17 00:00:00 2001 From: Sean McGuire Date: Mon, 26 Aug 2024 19:33:47 -0400 Subject: [PATCH 3/6] isort --- src/lsdb/catalog/catalog.py | 2 +- src/lsdb/catalog/dataset/healpix_dataset.py | 2 +- tests/lsdb/catalog/test_nested.py | 3 ++- 3 files changed, 4 insertions(+), 3 deletions(-) diff --git a/src/lsdb/catalog/catalog.py b/src/lsdb/catalog/catalog.py index 2e3e1d28..62fa016f 100644 --- a/src/lsdb/catalog/catalog.py +++ b/src/lsdb/catalog/catalog.py @@ -10,7 +10,7 @@ from hipscat.catalog.index.index_catalog import IndexCatalog as HCIndexCatalog from hipscat.pixel_math.polygon_filter import SphericalCoordinates from pandas._libs import lib -from pandas._typing import Axis, AnyAll, IndexLabel +from pandas._typing import AnyAll, Axis, IndexLabel from pandas.api.extensions import no_default from lsdb.catalog.association_catalog import AssociationCatalog diff --git a/src/lsdb/catalog/dataset/healpix_dataset.py b/src/lsdb/catalog/dataset/healpix_dataset.py index 0761f033..bc54cc6c 100644 --- a/src/lsdb/catalog/dataset/healpix_dataset.py +++ b/src/lsdb/catalog/dataset/healpix_dataset.py @@ -18,7 +18,7 @@ from hipscat.pixel_math import HealpixPixel from hipscat.pixel_math.healpix_pixel_function import get_pixel_argsort from pandas._libs import lib -from pandas._typing import Axis, AnyAll, IndexLabel +from pandas._typing import AnyAll, Axis, IndexLabel from pandas.api.extensions import no_default from typing_extensions import Self diff 
--git a/tests/lsdb/catalog/test_nested.py b/tests/lsdb/catalog/test_nested.py index c1e6237f..c06ff717 100644 --- a/tests/lsdb/catalog/test_nested.py +++ b/tests/lsdb/catalog/test_nested.py @@ -1,5 +1,6 @@ -import pandas as pd import nested_dask as nd +import pandas as pd + from lsdb import Catalog From c81981c381252c7ec119750c371a2b1748d01a1b Mon Sep 17 00:00:00 2001 From: Sean McGuire Date: Mon, 26 Aug 2024 19:36:00 -0400 Subject: [PATCH 4/6] update margin call --- src/lsdb/catalog/catalog.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/src/lsdb/catalog/catalog.py b/src/lsdb/catalog/catalog.py index 62fa016f..f71108a8 100644 --- a/src/lsdb/catalog/catalog.py +++ b/src/lsdb/catalog/catalog.py @@ -524,5 +524,12 @@ def dropna( axis=axis, how=how, thresh=thresh, on_nested=on_nested, subset=subset, ignore_index=ignore_index ) if self.margin is not None: - catalog.margin = self.margin.dropna() + catalog.margin = self.margin.dropna( + axis=axis, + how=how, + thresh=thresh, + on_nested=on_nested, + subset=subset, + ignore_index=ignore_index, + ) return catalog From eb257f48216456b0805899a26fe190fdaa159a3b Mon Sep 17 00:00:00 2001 From: Sean McGuire Date: Thu, 5 Sep 2024 18:17:31 -0400 Subject: [PATCH 5/6] add on_nested test --- tests/lsdb/catalog/test_nested.py | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/tests/lsdb/catalog/test_nested.py b/tests/lsdb/catalog/test_nested.py index c06ff717..9d732ef2 100644 --- a/tests/lsdb/catalog/test_nested.py +++ b/tests/lsdb/catalog/test_nested.py @@ -1,3 +1,4 @@ +import numpy as np import nested_dask as nd import pandas as pd @@ -13,3 +14,24 @@ def test_dropna(small_sky_with_nested_sources): filtered_compute = filtered_cat.compute() assert len(drop_na_compute) < len(filtered_compute) pd.testing.assert_frame_equal(drop_na_compute, filtered_compute.dropna()) + + +def test_dropna_on_nested(small_sky_with_nested_sources): + def add_na_values_nested(df): + """replaces the first 
source_ra value in each nested df with NaN""" + for i in range(len(df)): + first_ra_value = df.iloc[i]["sources"].iloc[0]["source_ra"] + df["sources"].array[i] = df["sources"].array[i].replace(first_ra_value, np.NaN) + return df + + filtered_cat = small_sky_with_nested_sources.map_partitions(add_na_values_nested) + drop_na_cat = filtered_cat.dropna(on_nested="sources") + assert isinstance(drop_na_cat, Catalog) + assert isinstance(drop_na_cat._ddf, nd.NestedFrame) + drop_na_sources_compute = drop_na_cat["sources"].compute() + filtered_sources_compute = filtered_cat["sources"].compute() + assert len(drop_na_sources_compute) == len(filtered_sources_compute) + assert sum(map(len, drop_na_sources_compute)) < sum(map(len, filtered_sources_compute)) + pd.testing.assert_frame_equal( + drop_na_cat.compute(), filtered_cat._ddf.dropna(on_nested="sources").compute() + ) From 90e0e72b38071aa21870a9b6f7af23d1ba4e6e2f Mon Sep 17 00:00:00 2001 From: Sean McGuire Date: Fri, 6 Sep 2024 14:40:23 -0400 Subject: [PATCH 6/6] fix isort --- tests/lsdb/catalog/test_nested.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/lsdb/catalog/test_nested.py b/tests/lsdb/catalog/test_nested.py index 9d732ef2..c6216dd6 100644 --- a/tests/lsdb/catalog/test_nested.py +++ b/tests/lsdb/catalog/test_nested.py @@ -1,5 +1,5 @@ -import numpy as np import nested_dask as nd +import numpy as np import pandas as pd from lsdb import Catalog