Skip to content

Commit

Permalink
Merge pull request #61 from astronomy-commons/issue/50/catalog-saving
Browse files Browse the repository at this point in the history
Save catalog to disk
  • Loading branch information
camposandro authored Nov 13, 2023
2 parents aac09ca + 76e6411 commit 03e93be
Show file tree
Hide file tree
Showing 26 changed files with 672 additions and 156 deletions.
1 change: 1 addition & 0 deletions docs/index.rst
Original file line number Diff line number Diff line change
Expand Up @@ -47,3 +47,4 @@ Notes:

Home page <self>
API Reference <autoapi/index>
Notebooks <notebooks>
7 changes: 7 additions & 0 deletions docs/notebooks.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
Notebooks
========================================================================================

.. toctree::
:maxdepth: 1

Import catalogs <notebooks/import_catalogs>
304 changes: 304 additions & 0 deletions docs/notebooks/import_catalogs.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,304 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "26685ece166fa3f7",
"metadata": {
"collapsed": false
},
"source": [
"# Importing catalogs to HiPSCat format\n",
"\n",
"This notebook presents two ways of importing catalogs to HiPSCat format. The first uses the __lsdb.from_dataframe()__ method, which is helpful to load smaller catalogs from a single dataframe, while the second uses the __hipscat import pipeline__."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c2de4ed424644058",
"metadata": {
"ExecuteTime": {
"end_time": "2023-11-10T15:14:55.941948Z",
"start_time": "2023-11-10T15:14:55.912372Z"
}
},
"outputs": [],
"source": [
"import lsdb\n",
"import os\n",
"import pandas as pd"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "6b5f76fe439a32b9",
"metadata": {
"ExecuteTime": {
"end_time": "2023-11-10T15:14:55.946920Z",
"start_time": "2023-11-10T15:14:55.920402Z"
}
},
"outputs": [],
"source": [
"catalog_name = \"small_sky_order1\""
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "59e857888ae9ea70",
"metadata": {
"ExecuteTime": {
"end_time": "2023-11-10T15:14:55.958272Z",
"start_time": "2023-11-10T15:14:55.924340Z"
}
},
"outputs": [],
"source": [
"# Input paths\n",
"test_data_dir = os.path.join(\"../../tests\", \"data\")\n",
"catalog_dir = os.path.join(test_data_dir, catalog_name)\n",
"catalog_csv_path = os.path.join(catalog_dir, f\"{catalog_name}.csv\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "263e7a037919a20b",
"metadata": {
"ExecuteTime": {
"end_time": "2023-11-10T15:14:55.958815Z",
"start_time": "2023-11-10T15:14:55.927288Z"
}
},
"outputs": [],
"source": [
"# Output paths\n",
"catalog_from_dataframe = f\"{catalog_name}-from_dataframe\"\n",
"catalog_from_importer = f\"{catalog_name}-from_importer\""
]
},
{
"cell_type": "markdown",
"id": "b5e74432b3437e2f",
"metadata": {
"collapsed": false
},
"source": [
"## Using lsdb.from_dataframe()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "initial_id",
"metadata": {
"ExecuteTime": {
"end_time": "2023-11-10T15:14:55.987313Z",
"start_time": "2023-11-10T15:14:55.934107Z"
}
},
"outputs": [],
"source": [
"%%time\n",
"\n",
"# Read simple catalog from its CSV file\n",
"catalog = lsdb.from_dataframe(\n",
" pd.read_csv(catalog_csv_path),\n",
" catalog_name=catalog_from_dataframe,\n",
" catalog_type=\"object\",\n",
" highest_order=5,\n",
" threshold=100\n",
")\n",
"\n",
"# Save it to disk in HiPSCat format\n",
"catalog.to_hipscat(catalog_from_dataframe)"
]
},
{
"cell_type": "markdown",
"id": "bc1cc2a7eac29dba",
"metadata": {
"collapsed": false
},
"source": [
"## Using the import pipeline"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "46bca7773cf6b261",
"metadata": {
"ExecuteTime": {
"end_time": "2023-11-10T15:14:57.130701Z",
"start_time": "2023-11-10T15:14:55.985757Z"
}
},
"outputs": [],
"source": [
"# Install hipscat-import\n",
"!pip install hipscat-import --quiet"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "d2d1324d21d62c81",
"metadata": {
"ExecuteTime": {
"end_time": "2023-11-10T15:14:57.134432Z",
"start_time": "2023-11-10T15:14:57.131884Z"
}
},
"outputs": [],
"source": [
"from dask.distributed import Client\n",
"from hipscat_import.catalog.arguments import ImportArguments\n",
"from hipscat_import.pipeline import pipeline_with_client"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "d9be67178e901acc",
"metadata": {
"ExecuteTime": {
"end_time": "2023-11-10T15:14:57.136858Z",
"start_time": "2023-11-10T15:14:57.134937Z"
}
},
"outputs": [],
"source": [
"# Create directory if it does not yet exist\n",
"os.makedirs(catalog_from_importer, exist_ok=True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "59d1538d8545265",
"metadata": {
"ExecuteTime": {
"end_time": "2023-11-10T15:14:57.172913Z",
"start_time": "2023-11-10T15:14:57.138071Z"
}
},
"outputs": [],
"source": [
"args = ImportArguments(\n",
" id_column=\"id\",\n",
" ra_column=\"ra\",\n",
" dec_column=\"dec\",\n",
" highest_healpix_order=5,\n",
" pixel_threshold=100,\n",
" input_path=catalog_dir,\n",
"    input_format=\"small_sky_order1.csv\",\n",
" output_catalog_name=catalog_from_importer,\n",
" output_path=\".\",\n",
" dask_tmp=\".\",\n",
" overwrite=True,\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "300eb3d3fcbb7b62",
"metadata": {
"ExecuteTime": {
"end_time": "2023-11-10T15:15:01.266753Z",
"start_time": "2023-11-10T15:14:57.156033Z"
}
},
"outputs": [],
"source": [
"%%time\n",
"with Client(local_directory=args.dask_tmp, n_workers=4) as client:\n",
" pipeline_with_client(args, client)"
]
},
{
"cell_type": "markdown",
"id": "5cffa1283ddbca39",
"metadata": {
"collapsed": false
},
"source": [
"### Load both catalogs and check that they are equivalent"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "620ff4e2b7ae291b",
"metadata": {
"ExecuteTime": {
"end_time": "2023-11-10T15:15:01.288751Z",
"start_time": "2023-11-10T15:15:01.264910Z"
}
},
"outputs": [],
"source": [
"from_dataframe_catalog = lsdb.read_hipscat(catalog_from_dataframe)\n",
"from_dataframe_catalog._ddf"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "575daf21cd4cfcad",
"metadata": {
"ExecuteTime": {
"end_time": "2023-11-10T15:15:01.313421Z",
"start_time": "2023-11-10T15:15:01.279523Z"
}
},
"outputs": [],
"source": [
"from_importer_catalog = lsdb.read_hipscat(catalog_from_importer)\n",
"from_importer_catalog._ddf"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "b9d176a8dc303aa0",
"metadata": {
"ExecuteTime": {
"end_time": "2023-11-10T15:15:01.334413Z",
"start_time": "2023-11-10T15:15:01.289748Z"
}
},
"outputs": [],
"source": [
"# Verify that pixels are similar\n",
"assert from_dataframe_catalog.get_healpix_pixels() == from_importer_catalog.get_healpix_pixels()\n",
"# Verify that resulting dataframes contain the same data\n",
"pd.testing.assert_frame_equal(from_dataframe_catalog.compute().sort_index(), from_importer_catalog.compute().sort_index(), check_dtype=False)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
24 changes: 20 additions & 4 deletions src/lsdb/catalog/catalog.py
Original file line number Diff line number Diff line change
@@ -1,19 +1,19 @@
from __future__ import annotations

import dataclasses
from typing import Dict, List, Tuple, Type, cast
from typing import Any, Dict, List, Tuple, Type, Union, cast

import dask.dataframe as dd
import hipscat as hc
from hipscat.pixel_math import HealpixPixel

from lsdb import io
from lsdb.catalog.dataset.dataset import Dataset
from lsdb.core.cone_search import cone_filter
from lsdb.core.crossmatch.abstract_crossmatch_algorithm import AbstractCrossmatchAlgorithm
from lsdb.core.crossmatch.crossmatch_algorithms import BuiltInCrossmatchAlgorithm
from lsdb.dask.crossmatch_catalog_data import crossmatch_catalog_data

DaskDFPixelMap = Dict[HealpixPixel, int]
from lsdb.types import DaskDFPixelMap


# pylint: disable=R0903, W0212
Expand All @@ -25,7 +25,6 @@ class Catalog(Dataset):
hc_structure: `hipscat.Catalog` object representing the structure
and metadata of the HiPSCat catalog
"""

hc_structure: hc.catalog.Catalog

def __init__(
Expand Down Expand Up @@ -247,3 +246,20 @@ def cone_search(self, ra: float, dec: float, radius: float):
cone_search_ddf = cast(dd.DataFrame, cone_search_ddf)
ddf_partition_map = {pixel: i for i, pixel in enumerate(pixels_in_cone)}
return Catalog(cone_search_ddf, ddf_partition_map, filtered_hc_structure)

def to_hipscat(
    self,
    base_catalog_path: str,
    catalog_name: Union[str, None] = None,
    storage_options: Union[Dict[Any, Any], None] = None,
    **kwargs
):
    """Write this catalog out to disk in HiPSCat format.

    Delegates the actual serialization to ``lsdb.io.to_hipscat``.

    Args:
        base_catalog_path (str): Directory the catalog is written to
        catalog_name (str): Name under which the catalog is saved
        storage_options (dict): Abstract-filesystem credentials for the write
        **kwargs: Extra arguments forwarded to the parquet write operations
    """
    io.to_hipscat(
        self,
        base_catalog_path,
        catalog_name,
        storage_options,
        **kwargs,
    )
3 changes: 2 additions & 1 deletion src/lsdb/dask/crossmatch_catalog_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,9 +14,10 @@
from lsdb.core.crossmatch.abstract_crossmatch_algorithm import AbstractCrossmatchAlgorithm
from lsdb.core.crossmatch.crossmatch_algorithms import BuiltInCrossmatchAlgorithm
from lsdb.core.crossmatch.kdtree_match import KdTreeCrossmatch
from lsdb.types import DaskDFPixelMap

if TYPE_CHECKING:
from lsdb.catalog.catalog import Catalog, DaskDFPixelMap
from lsdb.catalog.catalog import Catalog

builtin_crossmatch_algorithms = {BuiltInCrossmatchAlgorithm.KD_TREE: KdTreeCrossmatch}

Expand Down
1 change: 1 addition & 0 deletions src/lsdb/io/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
from .to_hipscat import to_hipscat
Loading

0 comments on commit 03e93be

Please sign in to comment.