Use a naive sparse histogram. (#534)
* Use a naive sparse histogram.

* Remove make_from_counts
delucchi-cmu authored Jan 6, 2025
1 parent c8082b6 commit 965ef8f
Showing 2 changed files with 34 additions and 7 deletions.
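
In outline: to_hats previously built the point-distribution map by allocating a dense array of 12 * 4**histogram_order bins and summing one dense array per partition; this commit has each partition return a naive sparse histogram (parallel index/count arrays) and merges them with a HistogramAggregator, densifying only once at write time. A minimal sketch of that pattern, using hypothetical stand-in classes rather than the real ones in hats.pixel_math.sparse_histogram:

import numpy as np

class NaiveSparseHistogram:
    """Stand-in for SparseHistogram: parallel (index, count) arrays at a fixed order."""

    def __init__(self, indexes, counts, order):
        self.indexes = np.asarray(indexes, dtype=np.int64)
        self.counts = np.asarray(counts, dtype=np.int64)
        self.order = order

class NaiveHistogramAggregator:
    """Stand-in for HistogramAggregator: scatter-adds sparse parts into one dense array."""

    def __init__(self, order):
        # A HEALPix map at `order` has 12 * 4**order pixels.
        self.full_histogram = np.zeros(12 * 4**order, dtype=np.int64)

    def add(self, sparse_hist):
        # np.add.at handles repeated indexes, and is a no-op for empty arrays.
        np.add.at(self.full_histogram, sparse_hist.indexes, sparse_hist.counts)

# Usage mirroring the to_hats loop below: merge per-partition parts, densify once.
agg = NaiveHistogramAggregator(order=1)
agg.add(NaiveSparseHistogram([3, 7], [5, 2], order=1))
agg.add(NaiveSparseHistogram([7, 40], [1, 9], order=1))
assert agg.full_histogram[7] == 3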
13 changes: 6 additions & 7 deletions src/lsdb/io/to_hats.py
@@ -6,13 +6,12 @@
 
 import dask
 import hats as hc
-import hats.pixel_math.healpix_shim as hp
 import nested_pandas as npd
 import numpy as np
 from hats.catalog import PartitionInfo
 from hats.catalog.healpix_dataset.healpix_dataset import HealpixDataset as HCHealpixDataset
 from hats.pixel_math import HealpixPixel, spatial_index_to_healpix
-from hats.pixel_math.sparse_histogram import SparseHistogram
+from hats.pixel_math.sparse_histogram import HistogramAggregator, SparseHistogram
 from upath import UPath
 
 if TYPE_CHECKING:
@@ -42,7 +41,7 @@ def perform_write(
     at the specified order.
     """
     if len(df) == 0:
-        return 0, SparseHistogram.make_empty(histogram_order)
+        return 0, SparseHistogram([], [], histogram_order)
     pixel_dir = hc.io.pixel_directory(base_catalog_dir, hp_pixel.order, hp_pixel.pixel)
     hc.io.file_io.make_directory(pixel_dir, exist_ok=True)
     pixel_path = hc.io.paths.pixel_catalog_file(base_catalog_dir, hp_pixel)
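
A note on the empty-partition branch above: SparseHistogram([], [], histogram_order) is safe to merge downstream because scatter-adding empty index/count arrays is a no-op. An illustrative NumPy check (assuming the aggregator merges via an indexed add, as sketched earlier):

import numpy as np

dense = np.zeros(12 * 4**2, dtype=np.int64)
empty = np.asarray([], dtype=np.int64)
# Adding an empty sparse histogram changes nothing.
np.add.at(dense, empty, empty)
assert dense.sum() == 0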
@@ -64,7 +63,7 @@ def calculate_histogram(df: npd.NestedFrame, histogram_order: int) -> SparseHistogram:
     order_pixels = spatial_index_to_healpix(df.index.to_numpy(), target_order=histogram_order)
     gb = df.groupby(order_pixels, sort=False).apply(len)
     indexes, counts_at_indexes = gb.index.to_numpy(), gb.to_numpy(na_value=0)
-    return SparseHistogram.make_from_counts(indexes, counts_at_indexes, histogram_order)
+    return SparseHistogram(indexes, counts_at_indexes, histogram_order)
 
 
 # pylint: disable=protected-access
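
For intuition, the groupby in calculate_histogram is just counting how many rows fall in each HEALPix pixel at histogram_order. A minimal NumPy equivalent of that counting step (illustrative only; the lsdb code keeps the pandas groupby, unsorted):

import numpy as np

def sparse_counts(order_pixels: np.ndarray) -> tuple[np.ndarray, np.ndarray]:
    # Unique pixel indexes and their multiplicities: the payload of a sparse histogram.
    return np.unique(order_pixels, return_counts=True)

indexes, counts = sparse_counts(np.array([11, 42, 11, 42, 7]))
print(indexes)  # [ 7 11 42]
print(counts)   # [1 2 2]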
@@ -116,11 +115,11 @@ def to_hats(
     )
     new_hc_structure.catalog_info.to_properties_file(base_catalog_path)
     # Save the point distribution map
-    full_histogram = np.zeros(hp.order2npix(histogram_order))
+    total_histogram = HistogramAggregator(histogram_order)
     for partition_hist in histograms:
-        full_histogram += partition_hist.to_array()
+        total_histogram.add(partition_hist)
     point_map_path = hc.io.paths.get_point_map_file_pointer(base_catalog_path)
-    hc.io.file_io.write_fits_image(full_histogram, point_map_path)
+    hc.io.file_io.write_fits_image(total_histogram.full_histogram, point_map_path)
 
 
 def write_partitions(
28 changes: 28 additions & 0 deletions tests/lsdb/catalog/test_catalog.py
@@ -261,6 +261,34 @@ def test_save_catalog_when_catalog_is_empty(small_sky_order1_catalog, tmp_path):
     cone_search_catalog.to_hats(base_catalog_path)
 
 
+def test_save_big_catalog(tmp_path):
+    """Load a catalog with many partitions, and save with to_hats."""
+    mock_partition_df = pd.DataFrame(
+        {
+            "ra": np.linspace(0, 360, 100_000),
+            "dec": np.linspace(-90, 90, 100_000),
+            "id": np.arange(100_000, 200_000),
+        }
+    )
+
+    base_catalog_path = tmp_path / "big_sky"
+
+    kwargs = {
+        "catalog_name": "big_sky",
+        "catalog_type": "object",
+        "lowest_order": 6,
+        "highest_order": 10,
+        "threshold": 500,
+    }
+
+    catalog = lsdb.from_dataframe(mock_partition_df, margin_threshold=None, **kwargs)
+
+    catalog.to_hats(base_catalog_path)
+
+    read_catalog = hc.read_hats(base_catalog_path)
+    assert len(read_catalog.get_healpix_pixels()) == len(catalog.get_healpix_pixels())
+
+
 def test_save_catalog_with_some_empty_partitions(small_sky_order1_catalog, tmp_path):
     base_catalog_path = tmp_path / "small_sky"
 