32 colab changes #40

Merged: 7 commits, Oct 1, 2024
266 changes: 205 additions & 61 deletions examples/lotssdr2/Create_LoTTSDataset.ipynb
@@ -1,64 +1,208 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"id": "e06b0a43-62e7-48b5-a009-e37e691024c9",
"metadata": {},
"outputs": [],
"source": [
"from data import LoTTSDataset\n",
"from torchvision.transforms import v2\n",
"import torch\n",
"\n",
"transforms = v2.Compose(\n",
" [\n",
" v2.ToImage(),\n",
" v2.ToDtype(torch.float32),\n",
" v2.Resize(size=(64, 64)),\n",
" ]\n",
")\n",
"\n",
"data = LoTTSDataset(\n",
" data_folder=\"./data/lotssdr2/public\", # Change this to where you saved your data\n",
" cutout_scaling=1.5,\n",
" transform=transforms,\n",
")\n",
"\n",
"for i in range(len(data)):\n",
" if i > 10:\n",
" break\n",
" data.plot(\n",
" i,\n",
" contours=True,\n",
" sigma_name=\"Isl_rms\",\n",
" min_sigma=2,\n",
" title=data.df.iloc[i][\"Source_Name\"] + data.df.iloc[i][\"S_Code\"],\n",
" )\n",
"\n",
"data.df.head()"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
"cells": [
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "view-in-github"
},
"source": [
"<a target=\"_blank\" href=\"https://colab.research.google.com/github/mb010/Cata2Data/blob/main/examples/lotssdr2/Create_LoTTSDataset.ipynb\">\n",
" <img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/>\n",
"</a>"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "0p1nWoeAcBL1"
},
"source": [
"## Create a LoTTS Dataset Using Cata2Data"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "IewboBsacOya"
},
"source": [
"To start, create a local clone of this repository\n",
"\n",
"Install cata2data into your local environment (We recommend that you should use a venv on your local machine).\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "4OjNn-aHGrye"
},
"outputs": [],
"source": [
"!git clone https://github.com/mb010/Cata2Data.git && pip install ./Cata2Data && cp Cata2Data/examples/lotssdr2/data.py ."
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "hszOwcQicuL7"
},
"source": [
"## Download the data\n",
"\n",
"Use the `data_scrapper.py` script to download the image files. If you want to just download one pointing (instead of all 841 pointings; 434 GB), then call it using the --test flag:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "lAC3ao-_H36q"
},
"outputs": [],
"source": [
"%%python Cata2Data/examples/lotssdr2/data_scrapper.py --dir downloaded_data/ --test"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "nRpXlcpgf3y4"
},
"source": [
"This will have downloaded a .fits image file"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "bEKXzeCNf3ZZ"
},
"outputs": [],
"source": [
"!ls downloaded_data/public/DR2/mosaics/P000+23/"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "E-GKT9tMgZmg"
},
"source": [
"Next, you need to download the catalog directly from the website at this link (3.9 GB). This dataloader is currently built to work with the [Radio-optical cross match](https://lofar-surveys.org/dr2_release.html#:~:text=Radio%2Doptical%20crossmatch%20catalogue) catalog described in [Hardcastle et al. 2023](https://arxiv.org/abs/2309.00102)."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "FH9feKhYgZNg"
},
"outputs": [],
"source": [
"!wget -P downloaded_data/ https://lofar-surveys.org/public/DR2/catalogues/combined-release-v1.1-LM_opt_mass.fits"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "gD-CHCdVh4mk"
},
"source": [
"## Split the Catalogue\n",
"\n",
"This will take the full catalog and split it into one catalog per image and save those into the folder where each of those images is stored. This is what Cata2Data currently expects - lists of images and catalogs with equal length to use to construct a dataloader."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "DOkJxKa4rFsI"
},
"outputs": [],
"source": [
"%%python /content/Cata2Data/examples/lotssdr2/catalog_splitter.py --catalog_path downloaded_data/combined-release-v1.1-LM_opt_mass.fits --image_paths downloaded_data/public/DR2/mosaics/P000+23/"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "sISui8RyxNzY"
},
"source": [
"## Construct the dataset\n",
"\n",
"Running the example cell below will construct a dataset from the data that has been downloaded. The LoTTSDataset class is imported from the [data.py file](https://github.com/mb010/Cata2Data/blob/main/examples/lotssdr2/data.py) before being populated with data from the `downloaded_data` directory. We then plot images for the first ten members of the dataset and print the first ten rows of the corresponding dataframe."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "AnbFg6PyVx21"
},
"outputs": [],
"source": [
"from data import LoTTSDataset\n",
"from torchvision.transforms import v2\n",
"import torch\n",
"\n",
"transforms = v2.Compose(\n",
" [\n",
" v2.ToImage(),\n",
" v2.ToDtype(torch.float32),\n",
" v2.Resize(size=(64, 64)),\n",
" ]\n",
")\n",
"\n",
"data = LoTTSDataset(\n",
" data_folder=\"downloaded_data\", # Change this to where you saved your data\n",
" cutout_scaling=1.5,\n",
" transform=transforms,\n",
")\n",
"\n",
"for i in range(len(data)):\n",
" if i > 10:\n",
" break\n",
" data.plot(\n",
" i,\n",
" contours=True,\n",
" sigma_name=\"Isl_rms\",\n",
" min_sigma=2,\n",
" title=data.df.iloc[i][\"Source_Name\"] + data.df.iloc[i][\"S_Code\"],\n",
" )\n",
"\n",
"data.df.head(10)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "HvmK7UdizYEo"
},
"outputs": [],
"source": []
}
],
"metadata": {
"colab": {
"authorship_tag": "ABX9TyOOak3TSj8ruaDx439hKsct",
"include_colab_link": true,
"provenance": []
},
"kernelspec": {
"display_name": "Python 3",
"name": "python3"
},
"language_info": {
"name": "python"
}
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.9"
}
},
"nbformat": 4,
"nbformat_minor": 5
"nbformat": 4,
"nbformat_minor": 0
}
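Following on from the "Construct the dataset" cell in the notebook above, here is a minimal sketch of batching the resulting dataset with a standard PyTorch `DataLoader`. It is not part of this diff: it reuses the `data` object built in the notebook, assumes `LoTTSDataset` is a map-style dataset (it supports `len()` and indexing, as the plotting loop shows), and assumes indexing returns a transformed cutout that the default collate function can stack. Adjust the unpacking if `data.py` returns extra catalogue fields per sample.

```python
import torch
from torch.utils.data import DataLoader

# Batch the 64x64 cutouts produced by LoTTSDataset; batch_size and
# num_workers are illustrative values, not taken from the notebook.
loader = DataLoader(data, batch_size=32, shuffle=True, num_workers=2)

batch = next(iter(loader))
# If each sample is a single tensor this prints a shape like (32, C, 64, 64);
# if samples are tuples or dicts, inspect the batch structure instead.
print(batch.shape if torch.is_tensor(batch) else type(batch))
```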
4 changes: 2 additions & 2 deletions examples/lotssdr2/README.md
@@ -1,7 +1,7 @@
# LOTSS DR2 Data Class

This folder contains the utilities to produce a full dataloader for LOTSS DR2 using [Cata2Data](https://github.com/mb010/Cata2Data).
The dataloader was initially developed for a different project. It serves to highlight how powerfull of a tool [Cata2Data](https://github.com/mb010/Cata2Data) can be.
The dataloader was initially developed for a different project. It serves to highlight how powerful of a tool [Cata2Data](https://github.com/mb010/Cata2Data) can be.

# Quick walkthrough:
To start, create a local clone of this repository and navigate to this directory.
@@ -33,7 +33,7 @@ python catalog_splitter.py --catalog_path PATH_TO_THE_FULL_CATALOG --image_paths
This will take the full catalog and split it into one catalog per image and save those into the folder where each of those images is stored. This is what Cata2Data currently expects - lists of images and catalogs with equal length to use to construct a dataloader.

## Construct the dataset
A number of decisions have been made in the selection of sources etc, but in general everything is in [the data.py file](data.py). To run the code below you can install Jupyter to your environment using `pip install notebook` then open a Jupyter notebook using the command `jupyter notebook Create_LoTTSDataset.ipynb`.
A number of decisions have been made in the selection of sources, etc., but in general everything is in [the data.py file](data.py). To run the code below, open the `Create_LoTTSDataset.ipynb` notebook in Colab.

```python
from data import LoTTSDataset
Expand Down
44 changes: 22 additions & 22 deletions examples/lotssdr2/catalog_splitter.py
@@ -8,9 +8,8 @@

def load_catalogue(catalogue_path):
"""Load the FITS catalogue using astropy Table and convert to a pandas DataFrame."""
table = Table.read(catalogue_path)
df = table.to_pandas()
return df
table = Table.read(catalogue_path, memmap=True)
return table


def get_image_wcs_and_data(image_path):
@@ -21,21 +20,18 @@ def get_image_wcs_and_data(image_path):
return wcs, image_data


def save_sub_catalogue(df, output_path, overwrite=True):
def save_sub_catalogue(table, output_path, overwrite=True):
"""Save the sub-catalogue from a pandas DataFrame to a new FITS file using astropy Table."""
table = Table.from_pandas(df)
table.write(output_path, format="fits", overwrite=overwrite)


def filter_objects_by_central_pixel(df, wcs, image_data):
def filter_objects_by_central_pixel(table, wcs, image_data):
"""Filter objects within the RA and DEC boundaries and valid data regions using pandas."""
df = df.dropna(subset=["RA", "DEC"])

# Convert RA and DEC to pixel coordinates
x, y = wcs.wcs_world2pix(df["RA"].values, df["DEC"].values, 0)
x, y = wcs.wcs_world2pix(table["RA"], table["DEC"], 0)

# Initialize mask with all True values
valid_mask = np.ones(len(df), dtype=bool)
valid_mask = np.ones(len(table), dtype=bool)

# Check bounds
valid_mask &= (
@@ -49,21 +45,25 @@ def filter_objects_by_central_pixel(df, wcs, image_data):
)

# Filter DataFrame
valid_df = df[valid_mask]
print(f"STRONG FILTER: samples: {valid_df.shape[0]}")
valid_table = table[valid_mask]
print(f"STRONG FILTER: samples: {len(valid_table)}")

return valid_df
return valid_table


def filter_by_mosaic_id(df, image_path):
df = df.dropna(subset=["RA", "DEC"])
def filter_by_mosaic_id(table, image_path):
    # Build one NaN mask over both coordinate columns, then filter once so the
    # mask length always matches the table length.
    has_nan = np.zeros(len(table), dtype=bool)
    for col in [table["RA"], table["DEC"]]:
        if col.info.dtype.kind == "f":
            has_nan |= np.isnan(col)
    table = table[~has_nan]
field_name = os.path.dirname(image_path).split("/")[-1]
original_sample_count = df.shape[0]
df = df.loc[df["Mosaic_ID"] == field_name.encode("UTF-8")]
original_sample_count = len(table)
table = table[table["Mosaic_ID"] == field_name.encode("UTF-8")]
print(
f"SIMPLE FILTER: field_name: {field_name}; samples: {df.shape[0]}; original_sample_count: {original_sample_count}; sample_estimate: {int(1/841*original_sample_count)}"
f"SIMPLE FILTER: field_name: {field_name}; samples: {len(table)}; original_sample_count: {original_sample_count}; sample_estimate: {int(1/841*original_sample_count)}"
)
return df
return table


def main(catalogue_path, image_paths):
@@ -87,8 +87,8 @@ def main(catalogue_path, image_paths):
argparser = argparse.ArgumentParser()
argparser.add_argument(
"-c",
"--catalogue_path",
help="Path to the FITS catalogue.",
"--catalog_path",
help="Path to the FITS catalog.",
default="data/lotssdr2/combined-release-v1.1-LM_opt_mass.fits",
)
argparser.add_argument(
@@ -111,4 +111,4 @@
else:
image_paths = args.image_paths.split(",")
print(image_paths)
main(args.catalogue_path, image_paths)
main(args.catalog_path, image_paths)
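For reference, a small self-contained sketch of the astropy `Table` filtering pattern that the rewritten `filter_by_mosaic_id` relies on: build one NaN mask across the coordinate columns, filter once, then select rows by mosaic ID. The three-row table below is invented purely for illustration.

```python
import numpy as np
from astropy.table import Table

# Toy catalogue: one valid source in P000+23, plus two rows with NaN coordinates.
t = Table(
    {
        "RA": [10.0, np.nan, 30.0],
        "DEC": [0.0, 1.0, np.nan],
        "Mosaic_ID": [b"P000+23", b"P000+23", b"P003+26"],
    }
)

# Accumulate a single mask over all float coordinate columns...
has_nan = np.zeros(len(t), dtype=bool)
for col in (t["RA"], t["DEC"]):
    if col.info.dtype.kind == "f":
        has_nan |= np.isnan(col)

# ...then filter once, so the mask length always matches the table length.
t = t[~has_nan]

# Keep only sources belonging to one mosaic/field.
t = t[t["Mosaic_ID"] == b"P000+23"]
print(len(t))  # 1
```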