From 223450839b5a4d50bcbc6ae3c247e63d4d4bd2bc Mon Sep 17 00:00:00 2001 From: Melissa DeLucchi Date: Wed, 5 Jun 2024 13:09:10 -0400 Subject: [PATCH] checkpoint --- incubator/mask/cone_extrema.ipynb | 239 +++++++++++++++++++++++++++ incubator/mask/min_max_hist.ipynb | 263 ++++++++++++++++++++++++++++++ 2 files changed, 502 insertions(+) create mode 100644 incubator/mask/cone_extrema.ipynb create mode 100644 incubator/mask/min_max_hist.ipynb diff --git a/incubator/mask/cone_extrema.ipynb b/incubator/mask/cone_extrema.ipynb new file mode 100644 index 0000000..60cbd10 --- /dev/null +++ b/incubator/mask/cone_extrema.ipynb @@ -0,0 +1,239 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 47, + "metadata": {}, + "outputs": [], + "source": [ + "import hipscat\n", + "import healpy as hp\n", + "import pandas as pd\n", + "from tqdm import tqdm\n", + "from hipscat.inspection import plot_pixel_list\n", + "from hipscat.pixel_math import HealpixPixel\n", + "import numpy as np\n", + "import matplotlib.pyplot as plt" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "metadata": {}, + "outputs": [], + "source": [ + "# plot_pixel_list([HealpixPixel(0,11), HealpixPixel(4, 78)])" + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "metadata": {}, + "outputs": [], + "source": [ + "step = 2" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "metadata": {}, + "outputs": [], + "source": [ + "## i know this is dumb. 
i don't care.\n", + "\n", + "def min_max_sep(bounds_a, bounds_b):\n", + " min_sep = float(\"inf\")\n", + " max_sep = 0.0\n", + "\n", + " for i in range(4*step):\n", + " for j in range (4*step):\n", + " sep_sq = (bounds_a[0][i]-bounds_b[0][j])**2 + (bounds_a[1][i]-bounds_b[1][j])**2\n", + " min_sep = min(min_sep, sep_sq)\n", + " max_sep = max(max_sep, sep_sq)\n", + " return (min_sep, max_sep)" + ] + }, + { + "cell_type": "code", + "execution_count": 40, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "3933 partitions\n", + "7732278 iterations\n" + ] + } + ], + "source": [ + "\n", + "\n", + "gaia_full_partition_frame = pd.read_csv(\"gaia_partition_info.csv\")\n", + "gaia_full_partition_list = [\n", + " HealpixPixel(order, pixel)\n", + " for order, pixel in zip(\n", + " gaia_full_partition_frame[\"Norder\"],\n", + " gaia_full_partition_frame[\"Npix\"],\n", + " )\n", + " ]\n", + "num_partitions = len(gaia_full_partition_list)\n", + "print(num_partitions, \"partitions\")\n", + "print(int(.5 * num_partitions * (num_partitions - 1)), \"iterations\")" + ] + }, + { + "cell_type": "code", + "execution_count": 41, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 3933/3933 [09:51<00:00, 6.64it/s] \n" + ] + } + ], + "source": [ + "\n", + "all_seps = []\n", + "\n", + "for a in tqdm(range(0, num_partitions)):\n", + " for b in range(a, num_partitions):\n", + "\n", + " bounds_a = hp.vec2dir(hp.boundaries(2**gaia_full_partition_list[a].order, gaia_full_partition_list[a].pixel, step=step, nest=True), lonlat=True)\n", + " bounds_b = hp.vec2dir(hp.boundaries(2**gaia_full_partition_list[b].order, gaia_full_partition_list[b].pixel, step=step, nest=True), lonlat=True)\n", + " all_seps.append(min_max_sep(bounds_a, bounds_b))" + ] + }, + { + "cell_type": "code", + "execution_count": 42, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "7736211" + ] + }, + 
"execution_count": 42, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "len(all_seps)" + ] + }, + { + "cell_type": "code", + "execution_count": 43, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[(0.0, 506.24999999999966),\n", + " (0.0, 1236.6206396835553),\n", + " (6299.120639683556, 15411.620639683555),\n", + " (4972.652051530026, 13072.652051530025),\n", + " (8516.402051530025, 18641.402051530025),\n", + " (11289.449049432578, 23391.34255485142),\n", + " (7239.44904943258, 16351.949049432578),\n", + " (10791.34255485142, 24620.924450944753),\n", + " (6066.342554851419, 15516.34255485142),\n", + " (9433.42445094475, 26107.389504989897)]" + ] + }, + "execution_count": 43, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "all_seps[:10]" + ] + }, + { + "cell_type": "code", + "execution_count": 46, + "metadata": {}, + "outputs": [], + "source": [ + "min_map_arrays = np.array(all_seps).T\n", + "min_map_arrays[0:10]\n", + "mins = min_map_arrays[0]\n", + "maxs = min_map_arrays[1]" + ] + }, + { + "cell_type": "code", + "execution_count": 48, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 3933/3933 [00:01<00:00, 2135.54it/s]\n" + ] + } + ], + "source": [ + "pix_a_order = []\n", + "pix_a_pixel = []\n", + "pix_b_order = []\n", + "pix_b_pixel = []\n", + "\n", + "for a in tqdm(range(0, num_partitions)):\n", + " for b in range(a, num_partitions):\n", + " pix_a_order.append(gaia_full_partition_list[a].order)\n", + " pix_a_pixel.append(gaia_full_partition_list[a].pixel)\n", + " pix_b_order.append(gaia_full_partition_list[b].order)\n", + " pix_b_pixel.append(gaia_full_partition_list[b].pixel)\n", + "\n", + "\n", + "big_beautiful_frame = pd.DataFrame({\"Norder_a\": pix_a_order,\n", + " \"Npix_a\": pix_a_pixel,\n", + " \"Norder_b\": pix_b_order,\n", + " \"Npix_b\": pix_b_pixel,\n", + " \"min_sep\": mins,\n", + " \"max_sep\": maxs,\n", 
+ " })\n", + "big_beautiful_frame.to_csv(\"bbf.csv\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "hipscatenv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.14" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/incubator/mask/min_max_hist.ipynb b/incubator/mask/min_max_hist.ipynb new file mode 100644 index 0000000..8241cf0 --- /dev/null +++ b/incubator/mask/min_max_hist.ipynb @@ -0,0 +1,263 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "import hipscat\n", + "import healpy as hp\n", + "import pandas as pd\n", + "from tqdm import tqdm\n", + "from hipscat.inspection import plot_pixel_list\n", + "from hipscat.pixel_math import HealpixPixel\n", + "import numpy as np\n", + "import matplotlib.pyplot as plt\n", + "import pandas as pd" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "big_beautiful_frame = pd.read_csv(\"bbf.csv\")\n", + "\n", + "# sep_bins = [1, 10, 100, 1_000, 10_000, 50_000, 155_000]\n", + "# sep_bins = [1, 10, 100, 1_000, 10_000, 20_000, 30_000, 40_000, 50_000, 75_000]\n", + "sep_bins = [1, 10, 100, 500, 1_000, 2_000, 3_000, 4_000, 5_000, 7_500, 10_000]\n", + "\n", + "H, xedges, yedges = np.histogram2d(big_beautiful_frame[\"min_sep\"], \n", + " big_beautiful_frame[\"max_sep\"],\n", + " bins=(sep_bins, sep_bins))\n", + "H = H.T\n", + "\n", + "fig, ax = plt.subplots()\n", + "X, Y = np.meshgrid(xedges, yedges)\n", + "pc = ax.pcolormesh(X, Y, H, shading='flat', cmap=\"rainbow_r\", norm=\"log\")\n", + "ax.set_title(\"\")\n", + "ax.set_xlabel('min separation')\n", + "ax.set_ylabel('max separation')\n", + "# ax.set_yscale('log')\n", + "# ax.set_yscale('log')\n", + "fig.colorbar(pc)\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "154558.46597361984" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "big_beautiful_frame[\"max_sep\"].max()" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([5, 6, 3, 4, 2, 2, 3, 2, 3, 3])" + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "sep_bins = [1, 10, 100, 500, 1_000, 2_000, 3_000, 4_000, 5_000, 7_500, 10_000, 20_000, 30_000, 40_000, 50_000,100_000, 155_000]\n", + "# len(sep_bins)\n", + "\n", + "right = np.searchsorted(sep_bins, big_beautiful_frame[\"min_sep\"])\n", + "left = np.searchsorted(sep_bins, big_beautiful_frame[\"max_sep\"])\n", + "span = left - right + 1\n", + "span[0:10]" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [], + "source": [ + "bin_hist, 
bin_bins = np.histogram(span, bins = np.arange(1, 19))" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "width = np.diff(bin_bins)\n", + "center = (bin_bins[:-1] + bin_bins[1:]) / 2\n", + "\n", + "fig, ax = plt.subplots(figsize=(15,3))\n", + "ax.bar(center, bin_hist, align='center', width=width)\n", + "# ax.set_yscale('log')\n", + "plt.title(f\"num sep bins, for pairwise hipscat pixels\")\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([4680685, 2606439, 223729, 43134, 55660, 36807, 31794,\n", + " 12031, 9324, 8515, 4957, 7110, 5777, 7190,\n", + " 2282, 378, 399])" + ] + }, + "execution_count": 32, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "bin_hist" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "metadata": {}, + "outputs": [], + "source": [ + "num_pairs = np.sum(bin_hist)\n", + "proportion = bin_hist / num_pairs" + ] + }, + { + "cell_type": "code", + "execution_count": 44, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([6.05035850e-01, 3.36914156e-01, 2.89197128e-02, 5.57559767e-03,\n", + " 7.19473654e-03, 4.75775544e-03, 4.10976381e-03, 1.55515407e-03,\n", + " 1.20524117e-03, 1.10066801e-03, 6.40752947e-04, 9.19054560e-04,\n", + " 7.46747988e-04, 9.29395540e-04, 2.94976443e-04, 4.88611285e-05,\n", + " 5.15756357e-05])" + ] + }, + "execution_count": 44, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "proportion" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "60% of tile pairs will have only one possible separation bin. As a result, we only need the total masked count for these tile-pairs.\n", + "\n", + "\n", + "33% will have two possible bins. Once pair separations have been determined, the binning clause can be quickly short-circuited. We will know BEFORE loading the parquet tiles into memory what the possible bins are. 
We can pass just the dividing line between bins to the counting routine, which could then simply return `count sep less than X`.\n", + "\n", + "Fewer than 3% of tile pairs will have three possible bins.\n", + "\n", + "The remaining 3% of tile pairs have 4+ possible bins. These are the tile pairs with the smallest separations, and the ones that we will spend the most time on (and where we SHOULD be spending the most time\n", + "getting the counts right!)" + ] + }, + { + "cell_type": "code", + "execution_count": 43, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.0291302809605374" + ] + }, + "execution_count": 43, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "long_tail_tile_pairs = np.sum(bin_hist[3:])\n", + "long_tail_tile_pairs / np.sum(bin_hist)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "hipscatenv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.14" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}