From c745b6d184be8e01d70089d7515809f68c3faf10 Mon Sep 17 00:00:00 2001 From: Sandro Campos Date: Wed, 25 Oct 2023 15:22:43 -0400 Subject: [PATCH 1/5] Add query and assign methods to catalog --- src/lsdb/catalog/catalog.py | 31 +++++++++++++++++++ tests/lsdb/catalog/test_catalog.py | 49 ++++++++++++++++++++++++++++++ 2 files changed, 80 insertions(+) diff --git a/src/lsdb/catalog/catalog.py b/src/lsdb/catalog/catalog.py index d50616cb..c5512489 100644 --- a/src/lsdb/catalog/catalog.py +++ b/src/lsdb/catalog/catalog.py @@ -91,6 +91,37 @@ def name(self): """The name of the catalog""" return self.hc_structure.catalog_name + def query(self, expr: str | None = None) -> Catalog: + """Filters catalog using a complex query expression + + Args: + expr (str): Query expression to evaluate. The column names that are not + valid Python variables names should be wrapped in backticks, and any + variable values can be injected using f-strings. + + Returns: + A catalog that contains the data from the original catalog that complies + with the query expression + """ + if expr is None: + raise ValueError("Provided query expression is not valid") + ddf = self._ddf.query(expr) + return Catalog(ddf, self._ddf_pixel_map, self.hc_structure) + + def assign(self, **kwargs) -> Catalog: + """Assigns new columns to a catalog + + Args: + **kwargs: Arguments to pass to the assign method. This dictionary + should contain the column names as keys and either a lambda + function or a 1-D Dask array as their corresponding value. + + Returns: + The catalog containing both the old columns and the newly created columns + """ + ddf = self._ddf.assign(**kwargs) + return Catalog(ddf, self._ddf_pixel_map, self.hc_structure) + def crossmatch( self, other: Catalog, diff --git a/tests/lsdb/catalog/test_catalog.py b/tests/lsdb/catalog/test_catalog.py index 5ff9318f..91f6baed 100644 --- a/tests/lsdb/catalog/test_catalog.py +++ b/tests/lsdb/catalog/test_catalog.py @@ -1,5 +1,7 @@ +import dask.array as da import dask.dataframe as dd import pandas as pd +import pytest from hipscat.pixel_math import HealpixPixel @@ -30,3 +32,50 @@ def test_get_catalog_partition_gets_correct_partition(small_sky_order1_catalog): partition_index = small_sky_order1_catalog._ddf_pixel_map[pixel] ddf_partition = small_sky_order1_catalog._ddf.partitions[partition_index] dd.utils.assert_eq(partition, ddf_partition) + + +def test_query_no_arguments(small_sky_order1_catalog): + with pytest.raises(ValueError, match="Provided query expression is not valid"): + small_sky_order1_catalog.query() + + +def test_query(small_sky_order1_catalog): + result_catalog = small_sky_order1_catalog.query("ra > 300 and dec < -50") + expected_ddf = small_sky_order1_catalog._ddf.copy() + expected_ddf = expected_ddf[ + (small_sky_order1_catalog._ddf["ra"] > 300) & (small_sky_order1_catalog._ddf["dec"] < -50) + ] + pd.testing.assert_frame_equal(result_catalog.compute(), expected_ddf.compute()) + + +def test_assign_no_arguments(small_sky_order1_catalog): + result_catalog = small_sky_order1_catalog.assign() + pd.testing.assert_frame_equal(result_catalog._ddf.compute(), small_sky_order1_catalog.compute()) + + +def test_assign_with_callable(small_sky_order1_catalog): + kwargs = {"new_column": lambda x: x["ra"] + x["dec"]} + result_catalog = small_sky_order1_catalog.assign(**kwargs) + expected_ddf = small_sky_order1_catalog._ddf.copy() + expected_ddf["new_column"] = expected_ddf["ra"] + expected_ddf["dec"] + pd.testing.assert_frame_equal(result_catalog.compute(), expected_ddf.compute()) + + +def test_assign_with_series(small_sky_order1_catalog): + new_series = small_sky_order1_catalog._ddf["ra_error"].map(lambda x: x**2) + kwargs = {"new_column": new_series} + result_catalog = small_sky_order1_catalog.assign(**kwargs) + expected_ddf = small_sky_order1_catalog._ddf.copy() + expected_ddf["new_column"] = new_series + pd.testing.assert_frame_equal(result_catalog.compute(), expected_ddf.compute()) + + +def test_assign_with_invalid_arguments(small_sky_order1_catalog): + with pytest.raises(TypeError, match="Column assignment doesn't support type"): + small_sky_order1_catalog.assign(new_column=[1, 2, 3]) + with pytest.raises(ValueError, match="Array assignment only supports 1-D arrays"): + small_sky_order1_catalog.assign(new_column=da.ones((10, 10))) + with pytest.raises(ValueError, match="Number of partitions do not match"): + chunks = small_sky_order1_catalog._ddf.npartitions + 1 + array = da.random.random(size=10, chunks=chunks) + small_sky_order1_catalog.assign(new_column=array) From 7e1630a44c981ad8f15459e1b21cfae6de5d2e39 Mon Sep 17 00:00:00 2001 From: Sandro Campos Date: Wed, 25 Oct 2023 15:53:45 -0400 Subject: [PATCH 2/5] Improve comments on tests --- tests/lsdb/catalog/test_catalog.py | 17 +++-------------- 1 file changed, 3 insertions(+), 14 deletions(-) diff --git a/tests/lsdb/catalog/test_catalog.py b/tests/lsdb/catalog/test_catalog.py index 91f6baed..558f361c 100644 --- a/tests/lsdb/catalog/test_catalog.py +++ b/tests/lsdb/catalog/test_catalog.py @@ -1,4 +1,3 @@ -import dask.array as da import dask.dataframe as dd import pandas as pd import pytest @@ -54,28 +53,18 @@ def test_assign_no_arguments(small_sky_order1_catalog): def test_assign_with_callable(small_sky_order1_catalog): - kwargs = {"new_column": lambda x: x["ra"] + x["dec"]} + kwargs = {"new_column": lambda x: x["ra"] ** 2} result_catalog = small_sky_order1_catalog.assign(**kwargs) expected_ddf = small_sky_order1_catalog._ddf.copy() - expected_ddf["new_column"] = expected_ddf["ra"] + expected_ddf["dec"] + expected_ddf["new_column"] = expected_ddf["ra"] ** 2 pd.testing.assert_frame_equal(result_catalog.compute(), expected_ddf.compute()) def test_assign_with_series(small_sky_order1_catalog): + # The series is created from the original dataframe because indices must match new_series = small_sky_order1_catalog._ddf["ra_error"].map(lambda x: x**2) kwargs = {"new_column": new_series} result_catalog = small_sky_order1_catalog.assign(**kwargs) expected_ddf = small_sky_order1_catalog._ddf.copy() expected_ddf["new_column"] = new_series pd.testing.assert_frame_equal(result_catalog.compute(), expected_ddf.compute()) - - -def test_assign_with_invalid_arguments(small_sky_order1_catalog): - with pytest.raises(TypeError, match="Column assignment doesn't support type"): - small_sky_order1_catalog.assign(new_column=[1, 2, 3]) - with pytest.raises(ValueError, match="Array assignment only supports 1-D arrays"): - small_sky_order1_catalog.assign(new_column=da.ones((10, 10))) - with pytest.raises(ValueError, match="Number of partitions do not match"): - chunks = small_sky_order1_catalog._ddf.npartitions + 1 - array = da.random.random(size=10, chunks=chunks) - small_sky_order1_catalog.assign(new_column=array) From 1b82545ab02c313de448c2d4126532a77054b461 Mon Sep 17 00:00:00 2001 From: Sandro Campos Date: Fri, 27 Oct 2023 10:15:41 -0400 Subject: [PATCH 3/5] Add test for multiple column assign --- tests/lsdb/catalog/test_catalog.py | 64 ++++++++++++++++++++++++------ 1 file changed, 51 insertions(+), 13 deletions(-) diff --git a/tests/lsdb/catalog/test_catalog.py b/tests/lsdb/catalog/test_catalog.py index 558f361c..d28f57c6 100644 --- a/tests/lsdb/catalog/test_catalog.py +++ b/tests/lsdb/catalog/test_catalog.py @@ -1,3 +1,4 @@ +import dask.array as da import dask.dataframe as dd import pandas as pd import pytest @@ -33,38 +34,75 @@ def test_get_catalog_partition_gets_correct_partition(small_sky_order1_catalog): dd.utils.assert_eq(partition, ddf_partition) -def test_query_no_arguments(small_sky_order1_catalog): - with pytest.raises(ValueError, match="Provided query expression is not valid"): - small_sky_order1_catalog.query() - - def test_query(small_sky_order1_catalog): - result_catalog = small_sky_order1_catalog.query("ra > 300 and dec < -50") expected_ddf = small_sky_order1_catalog._ddf.copy() expected_ddf = expected_ddf[ (small_sky_order1_catalog._ddf["ra"] > 300) & (small_sky_order1_catalog._ddf["dec"] < -50) ] - pd.testing.assert_frame_equal(result_catalog.compute(), expected_ddf.compute()) + # Simple query, with no value injection or backticks + result_catalog = small_sky_order1_catalog.query("ra > 300 and dec < -50") + pd.testing.assert_frame_equal(result_catalog._ddf.compute(), expected_ddf.compute()) + # Query with value injection + ra, dec = 300, -50 + result_catalog = small_sky_order1_catalog.query(f"ra > {ra} and dec < {dec}") + pd.testing.assert_frame_equal(result_catalog._ddf.compute(), expected_ddf.compute()) + # Query with backticks (for invalid Python variables names) + new_columns = {"ra": "right ascension"} + expected_ddf = expected_ddf.rename(columns=new_columns) + small_sky_order1_catalog._ddf = small_sky_order1_catalog._ddf.rename(columns=new_columns) + result_catalog = small_sky_order1_catalog.query("`right ascension` > 300 and dec < -50") + pd.testing.assert_frame_equal(result_catalog._ddf.compute(), expected_ddf.compute()) + + +def test_query_no_arguments(small_sky_order1_catalog): + with pytest.raises(ValueError, match="Provided query expression is not valid"): + small_sky_order1_catalog.query() def test_assign_no_arguments(small_sky_order1_catalog): result_catalog = small_sky_order1_catalog.assign() - pd.testing.assert_frame_equal(result_catalog._ddf.compute(), small_sky_order1_catalog.compute()) + pd.testing.assert_frame_equal(result_catalog._ddf.compute(), small_sky_order1_catalog._ddf.compute()) def test_assign_with_callable(small_sky_order1_catalog): - kwargs = {"new_column": lambda x: x["ra"] ** 2} + kwargs = {"squared_ra_err": lambda x: x["ra_error"] ** 2} result_catalog = small_sky_order1_catalog.assign(**kwargs) expected_ddf = small_sky_order1_catalog._ddf.copy() - expected_ddf["new_column"] = expected_ddf["ra"] ** 2 + expected_ddf["squared_ra_err"] = expected_ddf["ra_error"] ** 2 pd.testing.assert_frame_equal(result_catalog.compute(), expected_ddf.compute()) def test_assign_with_series(small_sky_order1_catalog): # The series is created from the original dataframe because indices must match - new_series = small_sky_order1_catalog._ddf["ra_error"].map(lambda x: x**2) - kwargs = {"new_column": new_series} + squared_ra_err = small_sky_order1_catalog._ddf["ra_error"].map(lambda x: x**2) + kwargs = {"new_column": squared_ra_err} + result_catalog = small_sky_order1_catalog.assign(**kwargs) + expected_ddf = small_sky_order1_catalog._ddf.copy() + expected_ddf["new_column"] = squared_ra_err + pd.testing.assert_frame_equal(result_catalog.compute(), expected_ddf.compute()) + + +def test_assign_with_multiple_columns(small_sky_order1_catalog): + # These series are created from the original dataframe because indices must match + squared_ra_err = small_sky_order1_catalog._ddf["ra_error"].map(lambda x: x**2) + squared_dec_err = small_sky_order1_catalog._ddf["dec_error"].map(lambda x: x**2) + kwargs = { + "squared_ra_err": squared_ra_err, + "squared_dec_err": squared_dec_err, + } result_catalog = small_sky_order1_catalog.assign(**kwargs) expected_ddf = small_sky_order1_catalog._ddf.copy() - expected_ddf["new_column"] = new_series + expected_ddf["squared_ra_err"] = squared_ra_err + expected_ddf["squared_dec_err"] = squared_dec_err pd.testing.assert_frame_equal(result_catalog.compute(), expected_ddf.compute()) + + +def test_assign_with_invalid_arguments(small_sky_order1_catalog): + with pytest.raises(TypeError, match="Column assignment doesn't support type"): + small_sky_order1_catalog.assign(new_column=[1, 2, 3]) + with pytest.raises(ValueError, match="Array assignment only supports 1-D arrays"): + small_sky_order1_catalog.assign(new_column=da.ones((10, 10))) + with pytest.raises(ValueError, match="Number of partitions do not match"): + chunks = small_sky_order1_catalog._ddf.npartitions + 1 + array = da.random.random(size=10, chunks=chunks) + small_sky_order1_catalog.assign(new_column=array) From 31872b7285290123479ebb2e369870bbc256f2d9 Mon Sep 17 00:00:00 2001 From: Sandro Campos Date: Wed, 1 Nov 2023 11:14:27 -0400 Subject: [PATCH 4/5] Apply review changes --- src/lsdb/catalog/catalog.py | 12 ++++++------ tests/lsdb/catalog/test_catalog.py | 4 ++-- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/src/lsdb/catalog/catalog.py b/src/lsdb/catalog/catalog.py index c5512489..fd68cb75 100644 --- a/src/lsdb/catalog/catalog.py +++ b/src/lsdb/catalog/catalog.py @@ -91,20 +91,20 @@ def name(self): """The name of the catalog""" return self.hc_structure.catalog_name - def query(self, expr: str | None = None) -> Catalog: + def query(self, expr: str) -> Catalog: """Filters catalog using a complex query expression Args: - expr (str): Query expression to evaluate. The column names that are not - valid Python variables names should be wrapped in backticks, and any - variable values can be injected using f-strings. + expr (str): Query expression to evaluate. The column names that are not valid Python + variables names should be wrapped in backticks, and any variable values can be + injected using f-strings. The use of '@' to reference variables is not supported. + More information about pandas query strings is available + `here `__. Returns: A catalog that contains the data from the original catalog that complies with the query expression """ - if expr is None: - raise ValueError("Provided query expression is not valid") ddf = self._ddf.query(expr) return Catalog(ddf, self._ddf_pixel_map, self.hc_structure) diff --git a/tests/lsdb/catalog/test_catalog.py b/tests/lsdb/catalog/test_catalog.py index d28f57c6..1cb1acd7 100644 --- a/tests/lsdb/catalog/test_catalog.py +++ b/tests/lsdb/catalog/test_catalog.py @@ -55,8 +55,8 @@ def test_query(small_sky_order1_catalog): def test_query_no_arguments(small_sky_order1_catalog): - with pytest.raises(ValueError, match="Provided query expression is not valid"): - small_sky_order1_catalog.query() + with pytest.raises(ValueError): + small_sky_order1_catalog.query(None) def test_assign_no_arguments(small_sky_order1_catalog): From 9e7f70f324e74f7de8329ef83233b0dd67431bc7 Mon Sep 17 00:00:00 2001 From: Sandro Campos Date: Wed, 1 Nov 2023 11:15:28 -0400 Subject: [PATCH 5/5] Update docstring Co-authored-by: Sean McGuire <123987820+smcguire-cmu@users.noreply.github.com> --- src/lsdb/catalog/catalog.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/lsdb/catalog/catalog.py b/src/lsdb/catalog/catalog.py index fd68cb75..23cfa38b 100644 --- a/src/lsdb/catalog/catalog.py +++ b/src/lsdb/catalog/catalog.py @@ -113,7 +113,7 @@ def assign(self, **kwargs) -> Catalog: Args: **kwargs: Arguments to pass to the assign method. This dictionary - should contain the column names as keys and either a lambda + should contain the column names as keys and either a function or a 1-D Dask array as their corresponding value. Returns: