Skip to content

Commit

Permalink
Merge pull request #46 from astronomy-commons/sandro/catalog-query-an…
Browse files Browse the repository at this point in the history
…d-assign

Wrappers for Dataframe query and assign operations
  • Loading branch information
camposandro authored Nov 1, 2023
2 parents d514dfe + c2fe3e3 commit d9e03f7
Show file tree
Hide file tree
Showing 2 changed files with 107 additions and 0 deletions.
31 changes: 31 additions & 0 deletions src/lsdb/catalog/catalog.py
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,37 @@ def name(self):
"""The name of the catalog"""
return self.hc_structure.catalog_name

def query(self, expr: str) -> Catalog:
"""Filters catalog using a complex query expression
Args:
expr (str): Query expression to evaluate. The column names that are not valid Python
variables names should be wrapped in backticks, and any variable values can be
injected using f-strings. The use of '@' to reference variables is not supported.
More information about pandas query strings is available
`here <https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.query.html>`__.
Returns:
A catalog that contains the data from the original catalog that complies
with the query expression
"""
ddf = self._ddf.query(expr)
return Catalog(ddf, self._ddf_pixel_map, self.hc_structure)

def assign(self, **kwargs) -> Catalog:
"""Assigns new columns to a catalog
Args:
**kwargs: Arguments to pass to the assign method. This dictionary
should contain the column names as keys and either a
function or a 1-D Dask array as their corresponding value.
Returns:
The catalog containing both the old columns and the newly created columns
"""
ddf = self._ddf.assign(**kwargs)
return Catalog(ddf, self._ddf_pixel_map, self.hc_structure)

def crossmatch(
self,
other: Catalog,
Expand Down
76 changes: 76 additions & 0 deletions tests/lsdb/catalog/test_catalog.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
import dask.array as da
import dask.dataframe as dd
import pandas as pd
import pytest
from hipscat.pixel_math import HealpixPixel


Expand Down Expand Up @@ -30,3 +32,77 @@ def test_get_catalog_partition_gets_correct_partition(small_sky_order1_catalog):
partition_index = small_sky_order1_catalog._ddf_pixel_map[pixel]
ddf_partition = small_sky_order1_catalog._ddf.partitions[partition_index]
dd.utils.assert_eq(partition, ddf_partition)


def test_query(small_sky_order1_catalog):
expected_ddf = small_sky_order1_catalog._ddf.copy()
expected_ddf = expected_ddf[
(small_sky_order1_catalog._ddf["ra"] > 300) & (small_sky_order1_catalog._ddf["dec"] < -50)
]
# Simple query, with no value injection or backticks
result_catalog = small_sky_order1_catalog.query("ra > 300 and dec < -50")
pd.testing.assert_frame_equal(result_catalog._ddf.compute(), expected_ddf.compute())
# Query with value injection
ra, dec = 300, -50
result_catalog = small_sky_order1_catalog.query(f"ra > {ra} and dec < {dec}")
pd.testing.assert_frame_equal(result_catalog._ddf.compute(), expected_ddf.compute())
# Query with backticks (for invalid Python variables names)
new_columns = {"ra": "right ascension"}
expected_ddf = expected_ddf.rename(columns=new_columns)
small_sky_order1_catalog._ddf = small_sky_order1_catalog._ddf.rename(columns=new_columns)
result_catalog = small_sky_order1_catalog.query("`right ascension` > 300 and dec < -50")
pd.testing.assert_frame_equal(result_catalog._ddf.compute(), expected_ddf.compute())


def test_query_no_arguments(small_sky_order1_catalog):
with pytest.raises(ValueError):
small_sky_order1_catalog.query(None)


def test_assign_no_arguments(small_sky_order1_catalog):
result_catalog = small_sky_order1_catalog.assign()
pd.testing.assert_frame_equal(result_catalog._ddf.compute(), small_sky_order1_catalog._ddf.compute())


def test_assign_with_callable(small_sky_order1_catalog):
kwargs = {"squared_ra_err": lambda x: x["ra_error"] ** 2}
result_catalog = small_sky_order1_catalog.assign(**kwargs)
expected_ddf = small_sky_order1_catalog._ddf.copy()
expected_ddf["squared_ra_err"] = expected_ddf["ra_error"] ** 2
pd.testing.assert_frame_equal(result_catalog.compute(), expected_ddf.compute())


def test_assign_with_series(small_sky_order1_catalog):
# The series is created from the original dataframe because indices must match
squared_ra_err = small_sky_order1_catalog._ddf["ra_error"].map(lambda x: x**2)
kwargs = {"new_column": squared_ra_err}
result_catalog = small_sky_order1_catalog.assign(**kwargs)
expected_ddf = small_sky_order1_catalog._ddf.copy()
expected_ddf["new_column"] = squared_ra_err
pd.testing.assert_frame_equal(result_catalog.compute(), expected_ddf.compute())


def test_assign_with_multiple_columns(small_sky_order1_catalog):
# These series are created from the original dataframe because indices must match
squared_ra_err = small_sky_order1_catalog._ddf["ra_error"].map(lambda x: x**2)
squared_dec_err = small_sky_order1_catalog._ddf["dec_error"].map(lambda x: x**2)
kwargs = {
"squared_ra_err": squared_ra_err,
"squared_dec_err": squared_dec_err,
}
result_catalog = small_sky_order1_catalog.assign(**kwargs)
expected_ddf = small_sky_order1_catalog._ddf.copy()
expected_ddf["squared_ra_err"] = squared_ra_err
expected_ddf["squared_dec_err"] = squared_dec_err
pd.testing.assert_frame_equal(result_catalog.compute(), expected_ddf.compute())


def test_assign_with_invalid_arguments(small_sky_order1_catalog):
with pytest.raises(TypeError, match="Column assignment doesn't support type"):
small_sky_order1_catalog.assign(new_column=[1, 2, 3])
with pytest.raises(ValueError, match="Array assignment only supports 1-D arrays"):
small_sky_order1_catalog.assign(new_column=da.ones((10, 10)))
with pytest.raises(ValueError, match="Number of partitions do not match"):
chunks = small_sky_order1_catalog._ddf.npartitions + 1
array = da.random.random(size=10, chunks=chunks)
small_sky_order1_catalog.assign(new_column=array)

0 comments on commit d9e03f7

Please sign in to comment.