-
Notifications
You must be signed in to change notification settings - Fork 5
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #61 from astronomy-commons/issue/50/catalog-saving
Save catalog to disk
- Loading branch information
Showing
26 changed files
with
672 additions
and
156 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -47,3 +47,4 @@ Notes: | |
|
||
Home page <self> | ||
API Reference <autoapi/index> | ||
Notebooks <notebooks> |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,7 @@ | ||
Notebooks | ||
======================================================================================== | ||
|
||
.. toctree:: | ||
:maxdepth: 1 | ||
|
||
Import catalogs <notebooks/import_catalogs> |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,304 @@ | ||
{ | ||
"cells": [ | ||
{ | ||
"cell_type": "markdown", | ||
"id": "26685ece166fa3f7", | ||
"metadata": { | ||
"collapsed": false | ||
}, | ||
"source": [ | ||
"# Importing catalogs to HiPSCat format\n", | ||
"\n", | ||
"This notebook presents two ways of importing catalogs to HiPSCat format. The first uses the __lsdb.from_dataframe()__ method, which is helpful to load smaller catalogs from a single dataframe, while the second uses the __hipscat import pipeline__." | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"id": "c2de4ed424644058", | ||
"metadata": { | ||
"ExecuteTime": { | ||
"end_time": "2023-11-10T15:14:55.941948Z", | ||
"start_time": "2023-11-10T15:14:55.912372Z" | ||
} | ||
}, | ||
"outputs": [], | ||
"source": [ | ||
"import lsdb\n", | ||
"import os\n", | ||
"import pandas as pd" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"id": "6b5f76fe439a32b9", | ||
"metadata": { | ||
"ExecuteTime": { | ||
"end_time": "2023-11-10T15:14:55.946920Z", | ||
"start_time": "2023-11-10T15:14:55.920402Z" | ||
} | ||
}, | ||
"outputs": [], | ||
"source": [ | ||
"catalog_name = \"small_sky_order1\"" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"id": "59e857888ae9ea70", | ||
"metadata": { | ||
"ExecuteTime": { | ||
"end_time": "2023-11-10T15:14:55.958272Z", | ||
"start_time": "2023-11-10T15:14:55.924340Z" | ||
} | ||
}, | ||
"outputs": [], | ||
"source": [ | ||
"# Input paths\n", | ||
"test_data_dir = os.path.join(\"../../tests\", \"data\")\n", | ||
"catalog_dir = os.path.join(test_data_dir, catalog_name)\n", | ||
"catalog_csv_path = os.path.join(catalog_dir, f\"{catalog_name}.csv\")" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"id": "263e7a037919a20b", | ||
"metadata": { | ||
"ExecuteTime": { | ||
"end_time": "2023-11-10T15:14:55.958815Z", | ||
"start_time": "2023-11-10T15:14:55.927288Z" | ||
} | ||
}, | ||
"outputs": [], | ||
"source": [ | ||
"# Output paths\n", | ||
"catalog_from_dataframe = f\"{catalog_name}-from_dataframe\"\n", | ||
"catalog_from_importer = f\"{catalog_name}-from_importer\"" | ||
] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"id": "b5e74432b3437e2f", | ||
"metadata": { | ||
"collapsed": false | ||
}, | ||
"source": [ | ||
"## Using lsdb.from_dataframe()" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"id": "initial_id", | ||
"metadata": { | ||
"ExecuteTime": { | ||
"end_time": "2023-11-10T15:14:55.987313Z", | ||
"start_time": "2023-11-10T15:14:55.934107Z" | ||
} | ||
}, | ||
"outputs": [], | ||
"source": [ | ||
"%%time\n", | ||
"\n", | ||
"# Read simple catalog from its CSV file\n", | ||
"catalog = lsdb.from_dataframe(\n", | ||
" pd.read_csv(catalog_csv_path),\n", | ||
" catalog_name=catalog_from_dataframe,\n", | ||
" catalog_type=\"object\",\n", | ||
" highest_order=5,\n", | ||
" threshold=100\n", | ||
")\n", | ||
"\n", | ||
"# Save it to disk in HiPSCat format\n", | ||
"catalog.to_hipscat(catalog_from_dataframe)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"id": "bc1cc2a7eac29dba", | ||
"metadata": { | ||
"collapsed": false | ||
}, | ||
"source": [ | ||
"## Using the import pipeline" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"id": "46bca7773cf6b261", | ||
"metadata": { | ||
"ExecuteTime": { | ||
"end_time": "2023-11-10T15:14:57.130701Z", | ||
"start_time": "2023-11-10T15:14:55.985757Z" | ||
} | ||
}, | ||
"outputs": [], | ||
"source": [ | ||
"# Install hipscat-import\n", | ||
"!pip install hipscat-import --quiet" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"id": "d2d1324d21d62c81", | ||
"metadata": { | ||
"ExecuteTime": { | ||
"end_time": "2023-11-10T15:14:57.134432Z", | ||
"start_time": "2023-11-10T15:14:57.131884Z" | ||
} | ||
}, | ||
"outputs": [], | ||
"source": [ | ||
"from dask.distributed import Client\n", | ||
"from hipscat_import.catalog.arguments import ImportArguments\n", | ||
"from hipscat_import.pipeline import pipeline_with_client" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"id": "d9be67178e901acc", | ||
"metadata": { | ||
"ExecuteTime": { | ||
"end_time": "2023-11-10T15:14:57.136858Z", | ||
"start_time": "2023-11-10T15:14:57.134937Z" | ||
} | ||
}, | ||
"outputs": [], | ||
"source": [ | ||
"# Create directory if it does not yet exist\n", | ||
"os.makedirs(catalog_from_importer, exist_ok=True)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"id": "59d1538d8545265", | ||
"metadata": { | ||
"ExecuteTime": { | ||
"end_time": "2023-11-10T15:14:57.172913Z", | ||
"start_time": "2023-11-10T15:14:57.138071Z" | ||
} | ||
}, | ||
"outputs": [], | ||
"source": [ | ||
"args = ImportArguments(\n", | ||
" id_column=\"id\",\n", | ||
" ra_column=\"ra\",\n", | ||
" dec_column=\"dec\",\n", | ||
" highest_healpix_order=5,\n", | ||
" pixel_threshold=100,\n", | ||
" input_path=catalog_dir,\n", | ||
" input_format=f\"small_sky_order1.csv\",\n", | ||
" output_catalog_name=catalog_from_importer,\n", | ||
" output_path=\".\",\n", | ||
" dask_tmp=\".\",\n", | ||
" overwrite=True,\n", | ||
")" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"id": "300eb3d3fcbb7b62", | ||
"metadata": { | ||
"ExecuteTime": { | ||
"end_time": "2023-11-10T15:15:01.266753Z", | ||
"start_time": "2023-11-10T15:14:57.156033Z" | ||
} | ||
}, | ||
"outputs": [], | ||
"source": [ | ||
"%%time\n", | ||
"with Client(local_directory=args.dask_tmp, n_workers=4) as client:\n", | ||
" pipeline_with_client(args, client)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"id": "5cffa1283ddbca39", | ||
"metadata": { | ||
"collapsed": false | ||
}, | ||
"source": [ | ||
"### Load both catalogs and check that they are equivalent" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"id": "620ff4e2b7ae291b", | ||
"metadata": { | ||
"ExecuteTime": { | ||
"end_time": "2023-11-10T15:15:01.288751Z", | ||
"start_time": "2023-11-10T15:15:01.264910Z" | ||
} | ||
}, | ||
"outputs": [], | ||
"source": [ | ||
"from_dataframe_catalog = lsdb.read_hipscat(catalog_from_dataframe)\n", | ||
"from_dataframe_catalog._ddf" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"id": "575daf21cd4cfcad", | ||
"metadata": { | ||
"ExecuteTime": { | ||
"end_time": "2023-11-10T15:15:01.313421Z", | ||
"start_time": "2023-11-10T15:15:01.279523Z" | ||
} | ||
}, | ||
"outputs": [], | ||
"source": [ | ||
"from_importer_catalog = lsdb.read_hipscat(catalog_from_importer)\n", | ||
"from_importer_catalog._ddf" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"id": "b9d176a8dc303aa0", | ||
"metadata": { | ||
"ExecuteTime": { | ||
"end_time": "2023-11-10T15:15:01.334413Z", | ||
"start_time": "2023-11-10T15:15:01.289748Z" | ||
} | ||
}, | ||
"outputs": [], | ||
"source": [ | ||
"# Verify that pixels are similar\n", | ||
"assert from_dataframe_catalog.get_healpix_pixels() == from_importer_catalog.get_healpix_pixels()\n", | ||
"# Verify that resulting dataframes contain the same data\n", | ||
"pd.testing.assert_frame_equal(from_dataframe_catalog.compute().sort_index(), from_importer_catalog.compute().sort_index(), check_dtype=False)" | ||
] | ||
} | ||
], | ||
"metadata": { | ||
"kernelspec": { | ||
"display_name": "Python 3", | ||
"language": "python", | ||
"name": "python3" | ||
}, | ||
"language_info": { | ||
"codemirror_mode": { | ||
"name": "ipython", | ||
"version": 2 | ||
}, | ||
"file_extension": ".py", | ||
"mimetype": "text/x-python", | ||
"name": "python", | ||
"nbconvert_exporter": "python", | ||
"pygments_lexer": "ipython2", | ||
"version": "2.7.6" | ||
} | ||
}, | ||
"nbformat": 4, | ||
"nbformat_minor": 5 | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
from .to_hipscat import to_hipscat |
Oops, something went wrong.