Skip to content

Commit

Permalink
Merge pull request #61 from astronomy-commons/issue/50/catalog-saving
Browse files Browse the repository at this point in the history
Save catalog to disk
  • Loading branch information
camposandro authored Nov 13, 2023
2 parents aac09ca + 76e6411 commit 03e93be
Show file tree
Hide file tree
Showing 26 changed files with 672 additions and 156 deletions.
1 change: 1 addition & 0 deletions docs/index.rst
Original file line number Diff line number Diff line change
Expand Up @@ -47,3 +47,4 @@ Notes:

Home page <self>
API Reference <autoapi/index>
Notebooks <notebooks>
7 changes: 7 additions & 0 deletions docs/notebooks.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
Notebooks
========================================================================================

.. toctree::
:maxdepth: 1

Import catalogs <notebooks/import_catalogs>
304 changes: 304 additions & 0 deletions docs/notebooks/import_catalogs.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,304 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "26685ece166fa3f7",
"metadata": {
"collapsed": false
},
"source": [
"# Importing catalogs to HiPSCat format\n",
"\n",
"This notebook presents two ways of importing catalogs to HiPSCat format. The first uses the __lsdb.from_dataframe()__ method, which is helpful to load smaller catalogs from a single dataframe, while the second uses the __hipscat import pipeline__."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c2de4ed424644058",
"metadata": {
"ExecuteTime": {
"end_time": "2023-11-10T15:14:55.941948Z",
"start_time": "2023-11-10T15:14:55.912372Z"
}
},
"outputs": [],
"source": [
"import lsdb\n",
"import os\n",
"import pandas as pd"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "6b5f76fe439a32b9",
"metadata": {
"ExecuteTime": {
"end_time": "2023-11-10T15:14:55.946920Z",
"start_time": "2023-11-10T15:14:55.920402Z"
}
},
"outputs": [],
"source": [
"catalog_name = \"small_sky_order1\""
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "59e857888ae9ea70",
"metadata": {
"ExecuteTime": {
"end_time": "2023-11-10T15:14:55.958272Z",
"start_time": "2023-11-10T15:14:55.924340Z"
}
},
"outputs": [],
"source": [
"# Input paths\n",
"test_data_dir = os.path.join(\"../../tests\", \"data\")\n",
"catalog_dir = os.path.join(test_data_dir, catalog_name)\n",
"catalog_csv_path = os.path.join(catalog_dir, f\"{catalog_name}.csv\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "263e7a037919a20b",
"metadata": {
"ExecuteTime": {
"end_time": "2023-11-10T15:14:55.958815Z",
"start_time": "2023-11-10T15:14:55.927288Z"
}
},
"outputs": [],
"source": [
"# Output paths\n",
"catalog_from_dataframe = f\"{catalog_name}-from_dataframe\"\n",
"catalog_from_importer = f\"{catalog_name}-from_importer\""
]
},
{
"cell_type": "markdown",
"id": "b5e74432b3437e2f",
"metadata": {
"collapsed": false
},
"source": [
"## Using lsdb.from_dataframe()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "initial_id",
"metadata": {
"ExecuteTime": {
"end_time": "2023-11-10T15:14:55.987313Z",
"start_time": "2023-11-10T15:14:55.934107Z"
}
},
"outputs": [],
"source": [
"%%time\n",
"\n",
"# Read simple catalog from its CSV file\n",
"catalog = lsdb.from_dataframe(\n",
" pd.read_csv(catalog_csv_path),\n",
" catalog_name=catalog_from_dataframe,\n",
" catalog_type=\"object\",\n",
" highest_order=5,\n",
" threshold=100\n",
")\n",
"\n",
"# Save it to disk in HiPSCat format\n",
"catalog.to_hipscat(catalog_from_dataframe)"
]
},
{
"cell_type": "markdown",
"id": "bc1cc2a7eac29dba",
"metadata": {
"collapsed": false
},
"source": [
"## Using the import pipeline"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "46bca7773cf6b261",
"metadata": {
"ExecuteTime": {
"end_time": "2023-11-10T15:14:57.130701Z",
"start_time": "2023-11-10T15:14:55.985757Z"
}
},
"outputs": [],
"source": [
"# Install hipscat-import\n",
"!pip install hipscat-import --quiet"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "d2d1324d21d62c81",
"metadata": {
"ExecuteTime": {
"end_time": "2023-11-10T15:14:57.134432Z",
"start_time": "2023-11-10T15:14:57.131884Z"
}
},
"outputs": [],
"source": [
"from dask.distributed import Client\n",
"from hipscat_import.catalog.arguments import ImportArguments\n",
"from hipscat_import.pipeline import pipeline_with_client"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "d9be67178e901acc",
"metadata": {
"ExecuteTime": {
"end_time": "2023-11-10T15:14:57.136858Z",
"start_time": "2023-11-10T15:14:57.134937Z"
}
},
"outputs": [],
"source": [
"# Create directory if it does not yet exist\n",
"os.makedirs(catalog_from_importer, exist_ok=True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "59d1538d8545265",
"metadata": {
"ExecuteTime": {
"end_time": "2023-11-10T15:14:57.172913Z",
"start_time": "2023-11-10T15:14:57.138071Z"
}
},
"outputs": [],
"source": [
"args = ImportArguments(\n",
" id_column=\"id\",\n",
" ra_column=\"ra\",\n",
" dec_column=\"dec\",\n",
" highest_healpix_order=5,\n",
" pixel_threshold=100,\n",
" input_path=catalog_dir,\n",
"    input_format=\"small_sky_order1.csv\",\n",
" output_catalog_name=catalog_from_importer,\n",
" output_path=\".\",\n",
" dask_tmp=\".\",\n",
" overwrite=True,\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "300eb3d3fcbb7b62",
"metadata": {
"ExecuteTime": {
"end_time": "2023-11-10T15:15:01.266753Z",
"start_time": "2023-11-10T15:14:57.156033Z"
}
},
"outputs": [],
"source": [
"%%time\n",
"with Client(local_directory=args.dask_tmp, n_workers=4) as client:\n",
" pipeline_with_client(args, client)"
]
},
{
"cell_type": "markdown",
"id": "5cffa1283ddbca39",
"metadata": {
"collapsed": false
},
"source": [
"### Load both catalogs and check that they are equivalent"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "620ff4e2b7ae291b",
"metadata": {
"ExecuteTime": {
"end_time": "2023-11-10T15:15:01.288751Z",
"start_time": "2023-11-10T15:15:01.264910Z"
}
},
"outputs": [],
"source": [
"from_dataframe_catalog = lsdb.read_hipscat(catalog_from_dataframe)\n",
"from_dataframe_catalog._ddf"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "575daf21cd4cfcad",
"metadata": {
"ExecuteTime": {
"end_time": "2023-11-10T15:15:01.313421Z",
"start_time": "2023-11-10T15:15:01.279523Z"
}
},
"outputs": [],
"source": [
"from_importer_catalog = lsdb.read_hipscat(catalog_from_importer)\n",
"from_importer_catalog._ddf"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "b9d176a8dc303aa0",
"metadata": {
"ExecuteTime": {
"end_time": "2023-11-10T15:15:01.334413Z",
"start_time": "2023-11-10T15:15:01.289748Z"
}
},
"outputs": [],
"source": [
"# Verify that pixels are similar\n",
"assert from_dataframe_catalog.get_healpix_pixels() == from_importer_catalog.get_healpix_pixels()\n",
"# Verify that resulting dataframes contain the same data\n",
"pd.testing.assert_frame_equal(from_dataframe_catalog.compute().sort_index(), from_importer_catalog.compute().sort_index(), check_dtype=False)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
24 changes: 20 additions & 4 deletions src/lsdb/catalog/catalog.py
Original file line number Diff line number Diff line change
@@ -1,19 +1,19 @@
from __future__ import annotations

import dataclasses
from typing import Dict, List, Tuple, Type, cast
from typing import Any, Dict, List, Tuple, Type, Union, cast

import dask.dataframe as dd
import hipscat as hc
from hipscat.pixel_math import HealpixPixel

from lsdb import io
from lsdb.catalog.dataset.dataset import Dataset
from lsdb.core.cone_search import cone_filter
from lsdb.core.crossmatch.abstract_crossmatch_algorithm import AbstractCrossmatchAlgorithm
from lsdb.core.crossmatch.crossmatch_algorithms import BuiltInCrossmatchAlgorithm
from lsdb.dask.crossmatch_catalog_data import crossmatch_catalog_data

DaskDFPixelMap = Dict[HealpixPixel, int]
from lsdb.types import DaskDFPixelMap


# pylint: disable=R0903, W0212
Expand All @@ -25,7 +25,6 @@ class Catalog(Dataset):
hc_structure: `hipscat.Catalog` object representing the structure
and metadata of the HiPSCat catalog
"""

hc_structure: hc.catalog.Catalog

def __init__(
Expand Down Expand Up @@ -247,3 +246,20 @@ def cone_search(self, ra: float, dec: float, radius: float):
cone_search_ddf = cast(dd.DataFrame, cone_search_ddf)
ddf_partition_map = {pixel: i for i, pixel in enumerate(pixels_in_cone)}
return Catalog(cone_search_ddf, ddf_partition_map, filtered_hc_structure)

def to_hipscat(
    self,
    base_catalog_path: str,
    catalog_name: Union[str, None] = None,
    storage_options: Union[Dict[Any, Any], None] = None,
    **kwargs
):
    """Write this catalog out to disk in HiPSCat format.

    Delegates the actual serialization to ``lsdb.io.to_hipscat``.

    Args:
        base_catalog_path (str): Directory the catalog is written to
        catalog_name (str): Name under which the catalog is saved
        storage_options (dict): Abstract-filesystem credentials for the write
        **kwargs: Extra arguments forwarded to the parquet write operations
    """
    io.to_hipscat(
        self,
        base_catalog_path,
        catalog_name,
        storage_options,
        **kwargs,
    )
3 changes: 2 additions & 1 deletion src/lsdb/dask/crossmatch_catalog_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,9 +14,10 @@
from lsdb.core.crossmatch.abstract_crossmatch_algorithm import AbstractCrossmatchAlgorithm
from lsdb.core.crossmatch.crossmatch_algorithms import BuiltInCrossmatchAlgorithm
from lsdb.core.crossmatch.kdtree_match import KdTreeCrossmatch
from lsdb.types import DaskDFPixelMap

if TYPE_CHECKING:
from lsdb.catalog.catalog import Catalog, DaskDFPixelMap
from lsdb.catalog.catalog import Catalog

builtin_crossmatch_algorithms = {BuiltInCrossmatchAlgorithm.KD_TREE: KdTreeCrossmatch}

Expand Down
1 change: 1 addition & 0 deletions src/lsdb/io/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
from .to_hipscat import to_hipscat
Loading

0 comments on commit 03e93be

Please sign in to comment.