From 8e1dbc6abb24fc5646b89439398c3b60d409dea3 Mon Sep 17 00:00:00 2001
From: alex-l-kong <alkong@ucdavis.edu>
Date: Wed, 28 Aug 2024 23:41:29 -0700
Subject: [PATCH 01/16] Update coordinate integration workflow to ensure memory
 efficiency

---
 src/maldi_tools/extraction.py  | 45 +++++++++++++++++-----------------
 src/maldi_tools/plotting.py    | 33 ++++++++++++++++++++-----
 templates/maldi-pipeline.ipynb | 39 ++++-------------------------
 3 files changed, 55 insertions(+), 62 deletions(-)

diff --git a/src/maldi_tools/extraction.py b/src/maldi_tools/extraction.py
index cab87dd..7f34371 100644
--- a/src/maldi_tools/extraction.py
+++ b/src/maldi_tools/extraction.py
@@ -10,11 +10,11 @@
 from functools import partial
 from operator import itemgetter
 from pathlib import Path
-from typing import Dict, Tuple
+from typing import Dict, List, Tuple
 
 import numpy as np
 import pandas as pd
-import xarray as xr
+from alpineer import image_utils, io_utils
 from pyimzml.ImzMLParser import ImzMLParser
 from scipy import signal
 from tqdm.notebook import tqdm
@@ -232,21 +232,20 @@ def peak_spectra(
     return panel_df
 
 
-def coordinate_integration(peak_df: pd.DataFrame, imz_data: ImzMLParser) -> xr.DataArray:
+def coordinate_integration(peak_df: pd.DataFrame, imz_data: ImzMLParser, extraction_dir: Path) -> None:
     """Integrates the coordinates with the discovered, post-processed peaks and generates an image for
     each of the peaks using the imzML coordinate data.
 
+    Saves the peak images to specified extraction_dir.
+
     Args:
     ----
         peak_df (pd.DataFrame): The unique peaks from the data.
         imz_data (ImzMLParser): The imzML object.
-
-    Returns:
-    -------
-        xr.DataArray: A data structure which holds all the images for each peak.
+        extraction_dir (Path): The directory to save extracted data (peak images) in.
     """
     unique_peaks = peak_df["peak"].unique()
-    peak_dict = dict(zip(unique_peaks, np.arange((len(unique_peaks)))))
+    dict(zip(unique_peaks, np.arange((len(unique_peaks)))))
 
     imz_coordinates: list = imz_data.coordinates
 
@@ -255,23 +254,23 @@ def coordinate_integration(peak_df: pd.DataFrame, imz_data: ImzMLParser) -> xr.D
 
     image_shape: Tuple[int, int] = (x_size, y_size)
 
-    imgs = np.zeros((len(unique_peaks), *image_shape))
+    os.makedirs(extraction_dir / "float")
+    os.makedirs(extraction_dir / "int")
 
     for idx, (x, y, _) in tqdm(enumerate(imz_data.coordinates), total=len(imz_data.coordinates)):
+        peak_img: np.ndarray = np.zeros((1, *image_shape))
         mzs, intensities = imz_data.getspectrum(idx)
-
         intensity: np.ndarray = intensities[np.isin(mzs, peak_df["m/z"])]
 
         for i_idx, peak in peak_df.loc[peak_df["m/z"].isin(mzs), "peak"].reset_index(drop=True).items():
-            imgs[peak_dict[peak], x - 1, y - 1] += intensity[i_idx]
-
-    img_data = xr.DataArray(
-        data=imgs,
-        coords={"peak": unique_peaks, "x": range(x_size), "y": range(y_size)},
-        dims=["peak", "x", "y"],
-    )
-
-    return img_data
+            peak_img[x - 1, y - 1] += intensity[i_idx]
+            peak_img_float: np.ndarray = peak_img.T
+            peak_img_int: np.ndarray = (peak_img_float * (2**32 - 1) / np.max(peak_img_float)).astype(
+                np.uint32
+            )
+            img_name: str = f"{peak:.4f}".replace(".", "_")
+            image_utils.save_image(fname=extraction_dir / "float" / f"{img_name}.tiff", data=peak_img_float)
+            image_utils.save_image(fname=extraction_dir / "int" / f"{img_name}.tiff", data=peak_img_int)
 
 
 def _matching_vec(obs_mz: pd.Series, library_peak_df: pd.DataFrame, ppm: int) -> pd.Series:
@@ -301,7 +300,6 @@ def _matching_vec(obs_mz: pd.Series, library_peak_df: pd.DataFrame, ppm: int) ->
 
 
 def library_matching(
-    image_xr: xr.DataArray,
     library_peak_df: pd.DataFrame,
     ppm: int,
     extraction_dir: Path,
@@ -312,7 +310,6 @@ def library_matching(
 
     Args:
     ----
-        image_xr (xr.DataArray): A data structure which holds all the images for each peak.
         library_peak_df (pd.DataFrame): The library of interest to match the observed peaks with.
         ppm (int): The ppm for an acceptable mass error range between the observed mass and any target
         mass in the library.
@@ -324,7 +321,11 @@ def library_matching(
         pd.DataFrame: Contains the peak, the library target mass, a boolean stating if a match was found
         or not, the composition name and the mass error if a match was found or not.
     """
-    peak_df = pd.DataFrame({"peak": image_xr.peak.to_numpy()})
+    peak_list: List[float] = [
+        float(p.replace("_", "."))
+        for p in io_utils.remove_file_extensions(io_utils.list_files(extraction_dir / "float"))
+    ]
+    peak_df = pd.DataFrame({"peak": np.array(peak_list)})
     match_fun = partial(_matching_vec, library_peak_df=library_peak_df, ppm=ppm)
 
     peak_df[["lib_mz", "matched", "composition", "mass_error"]] = peak_df["peak"].apply(
diff --git a/src/maldi_tools/plotting.py b/src/maldi_tools/plotting.py
index 88c0d6c..d6ef36a 100644
--- a/src/maldi_tools/plotting.py
+++ b/src/maldi_tools/plotting.py
@@ -11,6 +11,7 @@
 import matplotlib.pyplot as plt
 import numpy as np
 import pandas as pd
+import skimage.io as io
 import xarray as xr
 from alpineer import image_utils
 from tqdm.notebook import tqdm
@@ -152,8 +153,26 @@ def save_peak_images(image_xr: xr.DataArray, extraction_dir: Path) -> None:
         image_utils.save_image(fname=int_dir / f"{img_name}.tiff", data=integer_img)
 
 
+def plot_peak_hist(peak: float, bin_count: int, extraction_dir: Path) -> None:
+    """Plot a histogram of the intensities of a provided peak image.
+
+    Args:
+    ----
+        peak (float): The desired peak to visualize
+        bin_count (int): The number of bins to use for the histogram
+        extraction_dir (Path): The directory the peak images are saved in
+    """
+    # verify that the peak provided exists
+    peak_path = extraction_dir / f"{str(peak).replace('.', '_')}.tiff"
+    if not os.path.exists(peak_path):
+        raise FileNotFoundError(f"Peak {peak} does not have a corresponding peak image in {extraction_dir}")
+
+    # load the peak image in and display histogram
+    peak_img: np.ndarray = io.imread(peak_path)
+    plt.hist(peak_img.values, bins=bin_count)
+
+
 def save_matched_peak_images(
-    image_xr: xr.DataArray,
     matched_peaks_df: pd.DataFrame,
     extraction_dir: Path,
 ) -> None:
@@ -161,7 +180,6 @@ def save_matched_peak_images(
 
     Args:
     ----
-        image_xr (xr.DataArray): A data structure which holds all the images for each peak.
         matched_peaks_df (pd.DataFrame): A dataframe containing the peaks matched with the library.
         extraction_dir (Path): The directory to save extracted data in.
     """
@@ -175,10 +193,13 @@ def save_matched_peak_images(
     matched_peaks_df_filtered: pd.DataFrame = matched_peaks_df.dropna()
 
     for row in tqdm(matched_peaks_df_filtered.itertuples(), total=len(matched_peaks_df_filtered)):
-        image_index = row.Index
-
-        float_img: np.ndarray = image_xr[image_index, ...].values.T
-        integer_img: np.ndarray = (float_img * (2**32 - 1) / np.max(float_img)).astype(np.uint32)
+        # load in the corresponding float and integer images
+        float_img: np.ndarray = io.imread(
+            extraction_dir / "float" / f"{str(row.lib_mz).replace('.', '_')}.tiff"
+        )
+        integer_img: np.ndarray = io.imread(
+            extraction_dir / "int" / f"{str(row.lib_mz).replace('.', '_')}.tiff"
+        )
 
         img_name: str = row.composition
 
diff --git a/templates/maldi-pipeline.ipynb b/templates/maldi-pipeline.ipynb
index 3bffb46..a6d19ce 100644
--- a/templates/maldi-pipeline.ipynb
+++ b/templates/maldi-pipeline.ipynb
@@ -466,18 +466,7 @@
    },
    "outputs": [],
    "source": [
-    "image_data = extraction.coordinate_integration(peak_df=peak_df, imz_data=imz_data)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "tags": []
-   },
-   "outputs": [],
-   "source": [
-    "image_data"
+    "extraction.coordinate_integration(peak_df=peak_df, imz_data=imz_data)"
    ]
   },
   {
@@ -509,18 +498,7 @@
    },
    "outputs": [],
    "source": [
-    "image_data.sel(peak=[desired_peak_hist], method=\"nearest\").plot.hist(bins=bin_count)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "tags": []
-   },
-   "outputs": [],
-   "source": [
-    "plotting.save_peak_images(image_xr=image_data, extraction_dir=extraction_dir)"
+    "plotting.plot_peak_hist(peak=desired_peak_hist, bin_count=bin_count)"
    ]
   },
   {
@@ -557,7 +535,7 @@
    "outputs": [],
    "source": [
     "matched_peaks_df = extraction.library_matching(\n",
-    "    image_xr=image_data, library_peak_df=library_peak_df, ppm=ppm, extraction_dir=extraction_dir\n",
+    "    library_peak_df=library_peak_df, ppm=ppm, extraction_dir=extraction_dir\n",
     ")"
    ]
   },
@@ -570,16 +548,9 @@
    "outputs": [],
    "source": [
     "plotting.save_matched_peak_images(\n",
-    "    image_xr=image_data, matched_peaks_df=matched_peaks_df, extraction_dir=extraction_dir\n",
+    "    matched_peaks_df=matched_peaks_df, extraction_dir=extraction_dir\n",
     ")"
    ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": []
   }
  ],
  "metadata": {
@@ -598,7 +569,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.11.1"
+   "version": "3.11.6"
   },
   "vscode": {
    "interpreter": {

From e5029b4680d6cdaedb3ec087863014dcb6476248 Mon Sep 17 00:00:00 2001
From: alex-l-kong <alkong@ucdavis.edu>
Date: Tue, 12 Nov 2024 13:44:49 -0800
Subject: [PATCH 02/16] Fix mypy check issues

---
 src/maldi_tools/extraction.py | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/src/maldi_tools/extraction.py b/src/maldi_tools/extraction.py
index b23841f..2a6dee2 100644
--- a/src/maldi_tools/extraction.py
+++ b/src/maldi_tools/extraction.py
@@ -12,11 +12,12 @@
 from functools import partial
 from operator import itemgetter
 from pathlib import Path
-from typing import Dict, List, Tuple
+from typing import Dict, List, Optional, Tuple
 
 import numpy as np
 import pandas as pd
 import xarray as xr
+from alpineer.image_utils import save_image
 from alpineer.io_utils import list_files, remove_file_extensions, validate_paths
 from alpineer.misc_utils import verify_in_list
 from pyimzml.ImzMLParser import ImzMLParser
@@ -78,7 +79,7 @@ def rolling_window(
         total_mass_df (pd.DataFrame): A dataframe containing all the masses and their
             relative intensities.
         intensity_percentile (int): The intensity for the quantile calculation.
-        window_size (int, optional): The sizve of the window for the rolling window method.
+        window_size (int): The size of the window for the rolling window method.
             Defaults to 5000.
 
     Returns:
@@ -278,8 +279,8 @@ def coordinate_integration(peak_df: pd.DataFrame, imz_data: ImzMLParser, extract
                 np.uint32
             )
             img_name: str = f"{peak:.4f}".replace(".", "_")
-            image_utils.save_image(fname=extraction_dir / "float" / f"{img_name}.tiff", data=peak_img_float)
-            image_utils.save_image(fname=extraction_dir / "int" / f"{img_name}.tiff", data=peak_img_int)
+            save_image(fname=extraction_dir / "float" / f"{img_name}.tiff", data=peak_img_float)
+            save_image(fname=extraction_dir / "int" / f"{img_name}.tiff", data=peak_img_int)
 
 
 def _matching_vec(obs_mz: pd.Series, library_peak_df: pd.DataFrame, ppm: int) -> pd.Series:
@@ -325,7 +326,7 @@ def library_matching(
         ppm (int): The ppm for an acceptable mass error range between the observed mass and any target
         mass in the library.
         extraction_dir (Path): The directory to save extracted data in.
-        adducts (bool, optional): Add adducts together. Defaults to False. (Not implemented feature)
+        adducts (bool): Add adducts together. Defaults to False. (Not implemented feature)
 
     Returns:
     -------
@@ -334,7 +335,7 @@ def library_matching(
     """
     peak_list: List[float] = [
         float(p.replace("_", "."))
-        for p in io_utils.remove_file_extensions(io_utils.list_files(extraction_dir / "float"))
+        for p in remove_file_extensions(list_files(extraction_dir / "float"))
     ]
     peak_df = pd.DataFrame({"peak": np.array(peak_list)})
     match_fun = partial(_matching_vec, library_peak_df=library_peak_df, ppm=ppm)

From 79758f4cf870835a0f22d9b71014eb8a274bde8e Mon Sep 17 00:00:00 2001
From: alex-l-kong <alkong@ucdavis.edu>
Date: Tue, 12 Nov 2024 14:33:00 -0800
Subject: [PATCH 03/16] More linting fixes

---
 src/maldi_tools/extraction.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/src/maldi_tools/extraction.py b/src/maldi_tools/extraction.py
index 2a6dee2..14dbef4 100644
--- a/src/maldi_tools/extraction.py
+++ b/src/maldi_tools/extraction.py
@@ -16,7 +16,6 @@
 
 import numpy as np
 import pandas as pd
-import xarray as xr
 from alpineer.image_utils import save_image
 from alpineer.io_utils import list_files, remove_file_extensions, validate_paths
 from alpineer.misc_utils import verify_in_list
@@ -334,8 +333,7 @@ def library_matching(
         or not, the composition name and the mass error if a match was found or not.
     """
     peak_list: List[float] = [
-        float(p.replace("_", "."))
-        for p in remove_file_extensions(list_files(extraction_dir / "float"))
+        float(p.replace("_", ".")) for p in remove_file_extensions(list_files(extraction_dir / "float"))
     ]
     peak_df = pd.DataFrame({"peak": np.array(peak_list)})
     match_fun = partial(_matching_vec, library_peak_df=library_peak_df, ppm=ppm)

From d16906853a03a3e9ca7d8a50760af344d86c3057 Mon Sep 17 00:00:00 2001
From: alex-l-kong <alkong@ucdavis.edu>
Date: Tue, 12 Nov 2024 15:47:57 -0800
Subject: [PATCH 04/16] Update testing workflow to reflect new peak saving
 scheme

---
 tests/extraction_test.py | 16 +++++++++++-----
 tests/plotting_test.py   |  3 ++-
 2 files changed, 13 insertions(+), 6 deletions(-)

diff --git a/tests/extraction_test.py b/tests/extraction_test.py
index e2bd6f8..fe92ba5 100644
--- a/tests/extraction_test.py
+++ b/tests/extraction_test.py
@@ -14,7 +14,7 @@
 from pytest import TempPathFactory
 from skimage.io import imread
 
-from maldi_tools import extraction
+from maldi_tools import extraction, plotting
 
 
 def test_extract_spectra(imz_data: ImzMLParser) -> None:
@@ -100,11 +100,16 @@ def test_peak_spectra(
         assert os.path.exists(debug_dir / f"{peak.peak:.4f}.png".replace(".", "_", 1))
 
 
-def test_coordinate_integration(imz_data, peak_widths):
+def test_coordinate_integration(imz_data, peak_widths, tmp_path: pathlib.Path):
     peak_df, *_ = peak_widths
-    img_data = extraction.coordinate_integration(peak_df=peak_df, imz_data=imz_data)
+    extraction_dir = tmp_path / "extraction_dir"
+    extraction.coordinate_integration(
+        peak_df=peak_df, imz_data=imz_data, extraction_dir=extraction_dir
+    )
 
     # Make sure the shape of any given image is correct.
+    test_peak_img = list_files(extraction_dir)[0]
+    img_data = imread(extraction_dir / test_peak_img)
     assert img_data.shape[1:] == (10, 10)
 
 
@@ -124,12 +129,13 @@ def test__matching_vec(library: pd.DataFrame, obs_mz: int, true_values: pd.Serie
 
 
 @pytest.mark.parametrize(argnames="_ppm", argvalues=[99])
-def test_library_matching(image_xr: xr.DataArray, library: pd.DataFrame, _ppm: int, tmp_path: pathlib.Path):
+def test_library_matching(library: pd.DataFrame, _ppm: int, tmp_path: pathlib.Path):
     extraction_dir = tmp_path / "extraction_dir"
     extraction_dir.mkdir(parents=True, exist_ok=True)
+    plotting.save_peak_images(image_xr, extraction_dir)
 
     peak_df: pd.DataFrame = extraction.library_matching(
-        image_xr=image_xr, library_peak_df=library, ppm=_ppm, extraction_dir=extraction_dir
+        library_peak_df=library, ppm=_ppm, extraction_dir=extraction_dir
     )
 
     for idx, row in enumerate(peak_df.itertuples()):
diff --git a/tests/plotting_test.py b/tests/plotting_test.py
index e6ea52f..7fcb0ee 100644
--- a/tests/plotting_test.py
+++ b/tests/plotting_test.py
@@ -85,11 +85,12 @@ def test_save_peak_images(image_xr: xr.DataArray, tmp_path: pathlib.Path):
 def test_save_matched_peak_images(rng: np.random.Generator, image_xr: xr.DataArray, tmp_path: pathlib.Path):
     extraction_dir = tmp_path / "extraction_dir"
     extraction_dir.mkdir(parents=True, exist_ok=True)
+    plotting.save_peak_images(image_xr, extraction_dir)
 
     matched_peaks_df = pd.DataFrame(data={"composition": rng.random(size=(3,))})
 
     plotting.save_matched_peak_images(
-        image_xr=image_xr, matched_peaks_df=matched_peaks_df, extraction_dir=extraction_dir
+        matched_peaks_df=matched_peaks_df, extraction_dir=extraction_dir
     )
 
     for peak in matched_peaks_df.itertuples():

From 0b0975f92d29e446a665f1ba0564ccacd1f71522 Mon Sep 17 00:00:00 2001
From: alex-l-kong <alkong@ucdavis.edu>
Date: Tue, 12 Nov 2024 15:51:59 -0800
Subject: [PATCH 05/16] Fix more tests

---
 tests/extraction_test.py | 8 ++++----
 tests/plotting_test.py   | 4 +---
 2 files changed, 5 insertions(+), 7 deletions(-)

diff --git a/tests/extraction_test.py b/tests/extraction_test.py
index fe92ba5..c8dadd5 100644
--- a/tests/extraction_test.py
+++ b/tests/extraction_test.py
@@ -103,9 +103,7 @@ def test_peak_spectra(
 def test_coordinate_integration(imz_data, peak_widths, tmp_path: pathlib.Path):
     peak_df, *_ = peak_widths
     extraction_dir = tmp_path / "extraction_dir"
-    extraction.coordinate_integration(
-        peak_df=peak_df, imz_data=imz_data, extraction_dir=extraction_dir
-    )
+    extraction.coordinate_integration(peak_df=peak_df, imz_data=imz_data, extraction_dir=extraction_dir)
 
     # Make sure the shape of any given image is correct.
     test_peak_img = list_files(extraction_dir)[0]
@@ -129,7 +127,9 @@ def test__matching_vec(library: pd.DataFrame, obs_mz: int, true_values: pd.Serie
 
 
 @pytest.mark.parametrize(argnames="_ppm", argvalues=[99])
-def test_library_matching(library: pd.DataFrame, _ppm: int, tmp_path: pathlib.Path):
+def test_library_matching(
+    library: pd.DataFrame, _ppm: image_xr: xr.DataArray, int, tmp_path: pathlib.Path
+):
     extraction_dir = tmp_path / "extraction_dir"
     extraction_dir.mkdir(parents=True, exist_ok=True)
     plotting.save_peak_images(image_xr, extraction_dir)
diff --git a/tests/plotting_test.py b/tests/plotting_test.py
index 7fcb0ee..2893b77 100644
--- a/tests/plotting_test.py
+++ b/tests/plotting_test.py
@@ -89,9 +89,7 @@ def test_save_matched_peak_images(rng: np.random.Generator, image_xr: xr.DataArr
 
     matched_peaks_df = pd.DataFrame(data={"composition": rng.random(size=(3,))})
 
-    plotting.save_matched_peak_images(
-        matched_peaks_df=matched_peaks_df, extraction_dir=extraction_dir
-    )
+    plotting.save_matched_peak_images(matched_peaks_df=matched_peaks_df, extraction_dir=extraction_dir)
 
     for peak in matched_peaks_df.itertuples():
         # Assert that the float and integer images are saved.

From dec2c208a33f987a04ff7da5f41c0aa8163302f2 Mon Sep 17 00:00:00 2001
From: alex-l-kong <alkong@ucdavis.edu>
Date: Wed, 13 Nov 2024 09:48:12 -0800
Subject: [PATCH 06/16] Fix test syntax

---
 tests/extraction_test.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/extraction_test.py b/tests/extraction_test.py
index c8dadd5..7d0f5a2 100644
--- a/tests/extraction_test.py
+++ b/tests/extraction_test.py
@@ -128,7 +128,7 @@ def test__matching_vec(library: pd.DataFrame, obs_mz: int, true_values: pd.Serie
 
 @pytest.mark.parametrize(argnames="_ppm", argvalues=[99])
 def test_library_matching(
-    library: pd.DataFrame, _ppm: image_xr: xr.DataArray, int, tmp_path: pathlib.Path
+    library: pd.DataFrame, image_xr: xr.DataArray, _ppm: int, tmp_path: pathlib.Path
 ):
     extraction_dir = tmp_path / "extraction_dir"
     extraction_dir.mkdir(parents=True, exist_ok=True)

From 0fec7436521004ac58df2437d36fb0bf48658ba9 Mon Sep 17 00:00:00 2001
From: alex-l-kong <alkong@ucdavis.edu>
Date: Wed, 13 Nov 2024 10:03:13 -0800
Subject: [PATCH 07/16] Formatting of extraction test

---
 tests/extraction_test.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/tests/extraction_test.py b/tests/extraction_test.py
index 7d0f5a2..be365cc 100644
--- a/tests/extraction_test.py
+++ b/tests/extraction_test.py
@@ -127,9 +127,7 @@ def test__matching_vec(library: pd.DataFrame, obs_mz: int, true_values: pd.Serie
 
 
 @pytest.mark.parametrize(argnames="_ppm", argvalues=[99])
-def test_library_matching(
-    library: pd.DataFrame, image_xr: xr.DataArray, _ppm: int, tmp_path: pathlib.Path
-):
+def test_library_matching(library: pd.DataFrame, image_xr: xr.DataArray, _ppm: int, tmp_path: pathlib.Path):
     extraction_dir = tmp_path / "extraction_dir"
     extraction_dir.mkdir(parents=True, exist_ok=True)
     plotting.save_peak_images(image_xr, extraction_dir)

From 29c2b230c97cfb824257ba98dfabbeae7ee4adf7 Mon Sep 17 00:00:00 2001
From: Alex Kong <alkong@stanford.edu>
Date: Wed, 20 Nov 2024 12:33:24 -0800
Subject: [PATCH 08/16] Update coordinate integration test to verify actual
 peak finding and saving

---
 conftest.py                   | 55 +++++++++++++++++++++++++++++++----
 src/maldi_tools/extraction.py |  9 ++++--
 tests/extraction_test.py      | 45 ++++++++++++++++++----------
 3 files changed, 86 insertions(+), 23 deletions(-)

diff --git a/conftest.py b/conftest.py
index 484208c..8a94385 100644
--- a/conftest.py
+++ b/conftest.py
@@ -3,7 +3,7 @@
 import json
 import os
 from pathlib import Path
-from typing import Generator, List
+from typing import Generator, List, Tuple
 
 import numpy as np
 import pandas as pd
@@ -53,11 +53,44 @@ def imz_data(tmp_path_factory: TempPathFactory, rng: np.random.Generator) -> Imz
     yield ImzMLParser(filename=output_file_name)
 
 
+@pytest.fixture(scope="session")
+def imz_data_coord_int(tmp_path_factory: TempPathFactory, rng: np.random.Generator) -> ImzMLParser:
+    # Simplify the previous process for a single coordinate image (1x1)
+    img_dim: int = 1
+
+    # Generate random integers n for each coordinate (1 x 1). These will be used for creating
+    # random m/z and intensity values of length n.
+    # Lengths n are distributed along the standard gamma.
+    ns: np.ndarray = np.rint(rng.standard_gamma(shape=2.5, size=(img_dim**2)) * 100).astype(int)
+
+    # Generate random masses and sample different amounts of them, so we get duplicates
+    total_mzs: np.ndarray = (10000 - 100) * rng.random(size=img_dim**2 * 2) + 100
+
+    coords = [(x, y, 1) for x in range(1, img_dim + 1) for y in range(1, img_dim + 1)]
+
+    output_file_name: Path = tmp_path_factory.mktemp("data") / "test_data.imzML"
+
+    with ImzMLWriter(output_filename=output_file_name, mode="processed") as imzml:
+        for coord, n in zip(coords, ns):
+            # Masses: 100 <= mz < 10000, of length n, sampled randomly
+            mzs = rng.choice(a=total_mzs, size=n)
+
+            # Intensities: 0 <= int < 1e8, of length n
+            ints: np.ndarray = rng.exponential(size=n)
+
+            imzml.addSpectrum(mzs=mzs, intensities=ints, coords=coord)
+
+    yield ImzMLParser(filename=output_file_name)
+
+
 @pytest.fixture(scope="session")
 def total_mass_df(rng: np.random.Generator) -> pd.DataFrame:
     mz_count: int = 10000
     df = pd.DataFrame(
-        data={"m/z": np.linspace(start=1, stop=101, num=mz_count), "intensity": rng.random(size=mz_count)}
+        data={
+            "m/z": np.linspace(start=1, stop=101, num=mz_count),
+            "intensity": rng.random(size=mz_count)
+        }
     )
     yield df
 
@@ -75,8 +108,8 @@ def percentile_intensities(
 
 @pytest.fixture(scope="session")
 def peak_idx_candidates(
-    total_mass_df: pd.DataFrame, percentile_intensities: tuple[np.ndarray, np.ndarray]
-) -> Generator[tuple[np.ndarray, np.ndarray], None, None]:
+    total_mass_df: pd.DataFrame, percentile_intensities: Tuple[np.ndarray, np.ndarray]
+) -> Generator[Tuple[np.ndarray, np.ndarray], None, None]:
     _, log_int_percentile = percentile_intensities
 
     peak_candidate_indexes, peak_candidates = extraction.signal_extraction(
@@ -87,8 +120,8 @@ def peak_idx_candidates(
 
 @pytest.fixture(scope="session")
 def peak_widths(
-    total_mass_df, peak_idx_candidates
-) -> Generator[tuple[pd.DataFrame, np.ndarray, np.ndarray, np.ndarray], None, None]:
+    total_mass_df: pd.DataFrame, peak_idx_candidates: Tuple[np.ndarray, np.ndarray]
+) -> Generator[Tuple[pd.DataFrame, np.ndarray, np.ndarray, np.ndarray], None, None]:
     peak_candidate_idxs, peak_candidates = peak_idx_candidates
     peak_df, l_ips_r, r_ips_r, peak_widths_height = extraction.get_peak_widths(
         total_mass_df=total_mass_df,
@@ -100,6 +133,16 @@ def peak_widths(
     yield (peak_df, l_ips_r, r_ips_r, peak_widths_height)
 
 
+@pytest.fixture(scope="session")
+def peak_widths_coord_int(imz_data_coord_int: ImzMLParser):
+    mzs, intensities = imz_data_coord_int.getspectrum(0)
+    peak_df = pd.DataFrame({"m/z": mzs, "intensity": intensities})
+    peak_df["peak"] = (peak_df["m/z"] + 0.04).copy()
+    peak_df["peak_height"] = 0.001
+
+    yield peak_df
+
+
 @pytest.fixture(scope="session")
 def library() -> Generator[pd.DataFrame, None, None]:
     lib = pd.DataFrame(data={"mz": [30, 45], "composition": ["A", "B"]})
diff --git a/src/maldi_tools/extraction.py b/src/maldi_tools/extraction.py
index 14dbef4..7e73f5c 100644
--- a/src/maldi_tools/extraction.py
+++ b/src/maldi_tools/extraction.py
@@ -254,7 +254,7 @@ def coordinate_integration(peak_df: pd.DataFrame, imz_data: ImzMLParser, extract
         extraction_dir (Path): The directory to save extracted data (peak images) in.
     """
     unique_peaks = peak_df["peak"].unique()
-    dict(zip(unique_peaks, np.arange((len(unique_peaks)))))
+    peak_dict = dict(zip(unique_peaks, np.arange((len(unique_peaks)))))
 
     imz_coordinates: list = imz_data.coordinates
 
@@ -267,11 +267,16 @@ def coordinate_integration(peak_df: pd.DataFrame, imz_data: ImzMLParser, extract
     os.makedirs(extraction_dir / "int")
 
     for idx, (x, y, _) in tqdm(enumerate(imz_data.coordinates), total=len(imz_data.coordinates)):
-        peak_img: np.ndarray = np.zeros((1, *image_shape))
         mzs, intensities = imz_data.getspectrum(idx)
         intensity: np.ndarray = intensities[np.isin(mzs, peak_df["m/z"])]
 
         for i_idx, peak in peak_df.loc[peak_df["m/z"].isin(mzs), "peak"].reset_index(drop=True).items():
+            img_name: str = f"{peak:.4f}".replace(".", "_")
+            if os.path.exists(extraction_dir / "float" / f"{img_name}.tiff"):
+                peak_img = imread(extraction_dir / "float" / f"{img_name}.tiff")
+            else:
+                peak_img: np.ndarray = np.zeros(image_shape)
+
             peak_img[x - 1, y - 1] += intensity[i_idx]
             peak_img_float: np.ndarray = peak_img.T
             peak_img_int: np.ndarray = (peak_img_float * (2**32 - 1) / np.max(peak_img_float)).astype(
diff --git a/tests/extraction_test.py b/tests/extraction_test.py
index be365cc..d30a033 100644
--- a/tests/extraction_test.py
+++ b/tests/extraction_test.py
@@ -2,13 +2,13 @@
 
 import os
 import pathlib
-from typing import List
+from typing import List, Tuple
 
 import numpy as np
 import pandas as pd
 import pytest
 import xarray as xr
-from alpineer.io_utils import list_files, remove_file_extensions
+from alpineer.io_utils import list_files, list_folders, remove_file_extensions
 from alpineer.misc_utils import verify_same_elements
 from pyimzml.ImzMLParser import ImzMLParser
 from pytest import TempPathFactory
@@ -41,7 +41,7 @@ def test_rolling_window(total_mass_df: pd.DataFrame) -> None:
 
 
 def test_signal_extraction(
-    total_mass_df: pd.DataFrame, percentile_intensities: tuple[np.ndarray, np.ndarray]
+    total_mass_df: pd.DataFrame, percentile_intensities: Tuple[np.ndarray, np.ndarray]
 ) -> None:
     _, log_int_percentile = percentile_intensities
     peak_candidate_indexes, peak_candidates = extraction.signal_extraction(
@@ -56,7 +56,7 @@ def test_signal_extraction(
     assert np.all(peak_candidates[1:] >= peak_candidates[:-1])
 
 
-def test_get_peak_widths(total_mass_df: pd.DataFrame, peak_idx_candidates: tuple[np.ndarray, np.ndarray]):
+def test_get_peak_widths(total_mass_df: pd.DataFrame, peak_idx_candidates: Tuple[np.ndarray, np.ndarray]):
     peak_candidate_idxs, peak_candidates = peak_idx_candidates
     peak_df, l_ips_r, r_ips_r, peak_widths_height = extraction.get_peak_widths(
         total_mass_df=total_mass_df,
@@ -73,8 +73,8 @@ def test_get_peak_widths(total_mass_df: pd.DataFrame, peak_idx_candidates: tuple
 
 def test_peak_spectra(
     total_mass_df: pd.DataFrame,
-    peak_idx_candidates: tuple[np.ndarray, np.ndarray],
-    peak_widths: tuple[pd.DataFrame, np.ndarray, np.ndarray, np.ndarray],
+    peak_idx_candidates: Tuple[np.ndarray, np.ndarray],
+    peak_widths: Tuple[pd.DataFrame, np.ndarray, np.ndarray, np.ndarray],
     tmp_path: pathlib.Path,
 ):
     debug_dir = tmp_path / "debug_dir"
@@ -100,15 +100,29 @@ def test_peak_spectra(
         assert os.path.exists(debug_dir / f"{peak.peak:.4f}.png".replace(".", "_", 1))
 
 
-def test_coordinate_integration(imz_data, peak_widths, tmp_path: pathlib.Path):
-    peak_df, *_ = peak_widths
+def test_coordinate_integration(
+    imz_data_coord_int: ImzMLParser,
+    peak_widths_coord_int: pd.DataFrame,
+    image_xr: xr.DataArray,
+    tmp_path: pathlib.Path
+):
+    # peak_df, *_ = peak_widths
     extraction_dir = tmp_path / "extraction_dir"
-    extraction.coordinate_integration(peak_df=peak_df, imz_data=imz_data, extraction_dir=extraction_dir)
 
-    # Make sure the shape of any given image is correct.
-    test_peak_img = list_files(extraction_dir)[0]
-    img_data = imread(extraction_dir / test_peak_img)
-    assert img_data.shape[1:] == (10, 10)
+    extraction.coordinate_integration(
+        peak_df=peak_widths_coord_int,
+        imz_data=imz_data_coord_int,
+        extraction_dir=extraction_dir
+    )
+
+    # Make sure the shape of any given image is correct for both float and int
+    test_float_peak_img = list_files(extraction_dir / "float")[0]
+    float_img_data = imread(extraction_dir / "float" / test_float_peak_img)
+    assert float_img_data.shape == (1, 1)
+
+    test_int_peak_img = list_files(extraction_dir / "int")[0]
+    int_img_data = imread(extraction_dir / "int" / test_int_peak_img)
+    assert int_img_data.shape == (1, 1)
 
 
 @pytest.mark.parametrize(
@@ -137,15 +151,16 @@ def test_library_matching(library: pd.DataFrame, image_xr: xr.DataArray, _ppm: i
     )
 
     for idx, row in enumerate(peak_df.itertuples()):
-        if idx < 4:
+        if row.peak not in {30.0, 45.0}:
             assert row.matched is False
             assert np.isnan(row.composition)
             assert np.isnan(row.mass_error)
             assert np.isnan(row.lib_mz)
         else:
+            assert row.matched is True
             assert row.mass_error == 0
             assert row.composition in {"A", "B"}
-            assert row.peak in {30, 45}
+            assert row.lib_mz in {30.0, 45.0}
 
 
 def test_generate_glycan_mask(

From a79c0d871ede665a16a02f508a88eaa42e6baa71 Mon Sep 17 00:00:00 2001
From: Alex Kong <alkong@stanford.edu>
Date: Wed, 20 Nov 2024 12:52:21 -0800
Subject: [PATCH 09/16] Ensure float image gets re-transposed if loading back

---
 conftest.py                   |  5 +----
 src/maldi_tools/extraction.py | 16 ++++++----------
 tests/extraction_test.py      |  8 +++-----
 3 files changed, 10 insertions(+), 19 deletions(-)

diff --git a/conftest.py b/conftest.py
index 8a94385..fa50c69 100644
--- a/conftest.py
+++ b/conftest.py
@@ -87,10 +87,7 @@ def imz_data_coord_int(tmp_path_factory: TempPathFactory, rng: np.random.Generat
 def total_mass_df(rng: np.random.Generator) -> pd.DataFrame:
     mz_count: int = 10000
     df = pd.DataFrame(
-        data={
-            "m/z": np.linspace(start=1, stop=101, num=mz_count),
-            "intensity": rng.random(size=mz_count)
-        }
+        data={"m/z": np.linspace(start=1, stop=101, num=mz_count), "intensity": rng.random(size=mz_count)}
     )
     yield df
 
diff --git a/src/maldi_tools/extraction.py b/src/maldi_tools/extraction.py
index 7e73f5c..e849f89 100644
--- a/src/maldi_tools/extraction.py
+++ b/src/maldi_tools/extraction.py
@@ -253,9 +253,6 @@ def coordinate_integration(peak_df: pd.DataFrame, imz_data: ImzMLParser, extract
         imz_data (ImzMLParser): The imzML object.
         extraction_dir (Path): The directory to save extracted data (peak images) in.
     """
-    unique_peaks = peak_df["peak"].unique()
-    peak_dict = dict(zip(unique_peaks, np.arange((len(unique_peaks)))))
-
     imz_coordinates: list = imz_data.coordinates
 
     x_size: int = max(imz_coordinates, key=itemgetter(0))[0]
@@ -272,19 +269,18 @@ def coordinate_integration(peak_df: pd.DataFrame, imz_data: ImzMLParser, extract
 
         for i_idx, peak in peak_df.loc[peak_df["m/z"].isin(mzs), "peak"].reset_index(drop=True).items():
             img_name: str = f"{peak:.4f}".replace(".", "_")
-            if os.path.exists(extraction_dir / "float" / f"{img_name}.tiff"):
-                peak_img = imread(extraction_dir / "float" / f"{img_name}.tiff")
-            else:
-                peak_img: np.ndarray = np.zeros(image_shape)
+            float_peak_path: Path = extraction_dir / "float" / f"{img_name}.tiff"
+            int_peak_path: Path = extraction_dir / "int" / f"{img_name}.tiff"
+            peak_exists: bool = os.path.exists(float_peak_path)
+            peak_img: np.ndarray = imread(float_peak_path).T if peak_exists else np.zeros(image_shape)
 
             peak_img[x - 1, y - 1] += intensity[i_idx]
             peak_img_float: np.ndarray = peak_img.T
             peak_img_int: np.ndarray = (peak_img_float * (2**32 - 1) / np.max(peak_img_float)).astype(
                 np.uint32
             )
-            img_name: str = f"{peak:.4f}".replace(".", "_")
-            save_image(fname=extraction_dir / "float" / f"{img_name}.tiff", data=peak_img_float)
-            save_image(fname=extraction_dir / "int" / f"{img_name}.tiff", data=peak_img_int)
+            save_image(fname=float_peak_path, data=peak_img_float)
+            save_image(fname=int_peak_path, data=peak_img_int)
 
 
 def _matching_vec(obs_mz: pd.Series, library_peak_df: pd.DataFrame, ppm: int) -> pd.Series:
diff --git a/tests/extraction_test.py b/tests/extraction_test.py
index d30a033..e703a28 100644
--- a/tests/extraction_test.py
+++ b/tests/extraction_test.py
@@ -8,7 +8,7 @@
 import pandas as pd
 import pytest
 import xarray as xr
-from alpineer.io_utils import list_files, list_folders, remove_file_extensions
+from alpineer.io_utils import list_files, remove_file_extensions
 from alpineer.misc_utils import verify_same_elements
 from pyimzml.ImzMLParser import ImzMLParser
 from pytest import TempPathFactory
@@ -104,15 +104,13 @@ def test_coordinate_integration(
     imz_data_coord_int: ImzMLParser,
     peak_widths_coord_int: pd.DataFrame,
     image_xr: xr.DataArray,
-    tmp_path: pathlib.Path
+    tmp_path: pathlib.Path,
 ):
     # peak_df, *_ = peak_widths
     extraction_dir = tmp_path / "extraction_dir"
 
     extraction.coordinate_integration(
-        peak_df=peak_widths_coord_int,
-        imz_data=imz_data_coord_int,
-        extraction_dir=extraction_dir
+        peak_df=peak_widths_coord_int, imz_data=imz_data_coord_int, extraction_dir=extraction_dir
     )
 
     # Make sure the shape of any given image is correct for both float and int

From 2e73b8d513b55e5886731410d21b7844fa800484 Mon Sep 17 00:00:00 2001
From: Alex Kong <alkong@ucdavis.edu>
Date: Wed, 20 Nov 2024 14:59:52 -0800
Subject: [PATCH 10/16] Add more robust testing for library matching and saving
 out matched peak images

---
 src/maldi_tools/plotting.py | 22 ++++++++++-----------
 tests/plotting_test.py      | 38 +++++++++++++++++++++++++++++++++----
 2 files changed, 44 insertions(+), 16 deletions(-)

diff --git a/src/maldi_tools/plotting.py b/src/maldi_tools/plotting.py
index d6ef36a..7dee2ec 100644
--- a/src/maldi_tools/plotting.py
+++ b/src/maldi_tools/plotting.py
@@ -193,18 +193,16 @@ def save_matched_peak_images(
     matched_peaks_df_filtered: pd.DataFrame = matched_peaks_df.dropna()
 
     for row in tqdm(matched_peaks_df_filtered.itertuples(), total=len(matched_peaks_df_filtered)):
-        # load in the corresponding float and integer images
-        float_img: np.ndarray = io.imread(
-            extraction_dir / "float" / f"{str(row.lib_mz).replace('.', '_')}.tiff"
-        )
-        integer_img: np.ndarray = io.imread(
-            extraction_dir / "int" / f"{str(row.lib_mz).replace('.', '_')}.tiff"
-        )
+        if row.matched is True:
+            peak_file_name: str = f"{row.lib_mz:.4f}".replace(".", "_") + ".tiff"
+            # load in the corresponding float and integer images
+            float_img: np.ndarray = io.imread(extraction_dir / "float" / peak_file_name)
+            integer_img: np.ndarray = io.imread(extraction_dir / "int" / peak_file_name)
 
-        img_name: str = row.composition
+            img_name: str = row.composition
 
-        # save floating point image
-        image_utils.save_image(fname=float_dir / f"{img_name}.tiff", data=float_img)
+            # save floating point image
+            image_utils.save_image(fname=float_dir / f"{img_name}.tiff", data=float_img)
 
-        # save integer image
-        image_utils.save_image(fname=int_dir / f"{img_name}.tiff", data=integer_img)
+            # save integer image
+            image_utils.save_image(fname=int_dir / f"{img_name}.tiff", data=integer_img)
diff --git a/tests/plotting_test.py b/tests/plotting_test.py
index 2893b77..279f37f 100644
--- a/tests/plotting_test.py
+++ b/tests/plotting_test.py
@@ -87,11 +87,41 @@ def test_save_matched_peak_images(rng: np.random.Generator, image_xr: xr.DataArr
     extraction_dir.mkdir(parents=True, exist_ok=True)
     plotting.save_peak_images(image_xr, extraction_dir)
 
-    matched_peaks_df = pd.DataFrame(data={"composition": rng.random(size=(3,))})
+    peaks = image_xr.peak.values
+    img_shape = (image_xr.shape[1], image_xr.shape[2])
+    matched = [False] * len(peaks)
+    matched[-1] = True
+    composition = [np.nan] * len(peaks)
+    composition[-1] = rng.random(size=(1,))
+    mass_error = [np.nan] * len(peaks)
+    mass_error[-1] = rng.random(size=(1,))
+    matched_peaks_df = pd.DataFrame(
+        data={
+            "lib_mz": peaks,
+            "matched": matched,
+            "composition": composition,
+            "mass_error": mass_error,
+        }
+    )
 
     plotting.save_matched_peak_images(matched_peaks_df=matched_peaks_df, extraction_dir=extraction_dir)
 
     for peak in matched_peaks_df.itertuples():
-        # Assert that the float and integer images are saved.
-        assert os.path.exists(extraction_dir / "library_matched" / "float" / f"{peak.composition}.tiff")
-        assert os.path.exists(extraction_dir / "library_matched" / "int" / f"{peak.composition}.tiff")
+        float_peak_path = extraction_dir / "library_matched" / "float" / f"{peak.composition}.tiff"
+        int_peak_path = extraction_dir / "library_matched" / "int" / f"{peak.composition}.tiff"
+
+        # Assert that the float and integer images are saved for all matched peaks.
+        # Check that the peak images match the desired shape
+        if peak.matched is True:
+            assert os.path.exists(float_peak_path)
+            assert os.path.exists(int_peak_path)
+
+            matched_float_img = imread(float_peak_path)
+            matched_int_img = imread(int_peak_path)
+
+            assert matched_float_img.shape == img_shape
+            assert matched_int_img.shape == img_shape
+        # Otherwise, ensure peaks are not saved
+        else:
+            assert not os.path.exists(float_peak_path)
+            assert not os.path.exists(int_peak_path)

From 31de1a23458cd67f9d2ce1e3d9b91de7dc0cf2ea Mon Sep 17 00:00:00 2001
From: Alex Kong <alkong@ucdavis.edu>
Date: Wed, 20 Nov 2024 15:05:08 -0800
Subject: [PATCH 11/16] Update documentation and function call to
 coordinate_integration to match changes to workflow

---
 templates/maldi-pipeline.ipynb | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/templates/maldi-pipeline.ipynb b/templates/maldi-pipeline.ipynb
index 31be7e0..27e2318 100644
--- a/templates/maldi-pipeline.ipynb
+++ b/templates/maldi-pipeline.ipynb
@@ -433,7 +433,7 @@
    "source": [
     "## Integrate Coordinates\n",
     "\n",
-    "Generate the images and save them in an *xarray*, where the dimensions are: Image (indexed by peak value), $x$, and $y$."
+    "Generate the images and save them as TIFFs in the `float` and `int` subfolders of `extraction_dir`. Each file is named after its corresponding peak m/z value, rounded to 4 decimal places, with the decimal point replaced by an underscore. The dimensions of each image correspond to the maximum x- and y-coordinates extracted from the slide."
    ]
   },
   {
@@ -444,7 +444,7 @@
    },
    "outputs": [],
    "source": [
-    "extraction.coordinate_integration(peak_df=peak_df, imz_data=imz_data)"
+    "extraction.coordinate_integration(peak_df=peak_df, imz_data=imz_data, extraction_dir=extraction_dir)"
    ]
   },
   {
@@ -490,7 +490,7 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "### Constants"
+    "Set a value for the maximum ppm tolerance between a peak and its corresponding match in the specified `library_peak_df`. Matched peak images are saved as TIFFs to the `library_matched` subfolder inside `extraction_dir` and are named after the composition of their matched library peak."
    ]
   },
   {
@@ -677,7 +677,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.11.6"
+   "version": "3.11.10"
   },
   "vscode": {
    "interpreter": {

From 873fed59bf40c34cab7b13cae25bb06a09aafde7 Mon Sep 17 00:00:00 2001
From: Alex Kong <alkong@ucdavis.edu>
Date: Tue, 26 Nov 2024 10:23:58 -0800
Subject: [PATCH 12/16] Ensure extraction_dir is absolutely converted to a Path
 type

---
 src/maldi_tools/extraction.py | 10 ++++++----
 src/maldi_tools/plotting.py   |  8 ++++----
 2 files changed, 10 insertions(+), 8 deletions(-)

diff --git a/src/maldi_tools/extraction.py b/src/maldi_tools/extraction.py
index e849f89..b5c426e 100644
--- a/src/maldi_tools/extraction.py
+++ b/src/maldi_tools/extraction.py
@@ -260,8 +260,10 @@ def coordinate_integration(peak_df: pd.DataFrame, imz_data: ImzMLParser, extract
 
     image_shape: Tuple[int, int] = (x_size, y_size)
 
-    os.makedirs(extraction_dir / "float")
-    os.makedirs(extraction_dir / "int")
+    float_peak_dir: Path = Path(extraction_dir) / "float"
+    int_peak_dir: Path = Path(extraction_dir) / "int"
+    os.makedirs(float_peak_dir)
+    os.makedirs(int_peak_dir)
 
     for idx, (x, y, _) in tqdm(enumerate(imz_data.coordinates), total=len(imz_data.coordinates)):
         mzs, intensities = imz_data.getspectrum(idx)
@@ -269,8 +271,8 @@ def coordinate_integration(peak_df: pd.DataFrame, imz_data: ImzMLParser, extract
 
         for i_idx, peak in peak_df.loc[peak_df["m/z"].isin(mzs), "peak"].reset_index(drop=True).items():
             img_name: str = f"{peak:.4f}".replace(".", "_")
-            float_peak_path: Path = extraction_dir / "float" / f"{img_name}.tiff"
-            int_peak_path: Path = extraction_dir / "int" / f"{img_name}.tiff"
+            float_peak_path: Path = float_peak_dir / f"{img_name}.tiff"
+            int_peak_path: Path = int_peak_dir / f"{img_name}.tiff"
             peak_exists: bool = os.path.exists(float_peak_path)
             peak_img: np.ndarray = imread(float_peak_path).T if peak_exists else np.zeros(image_shape)
 
diff --git a/src/maldi_tools/plotting.py b/src/maldi_tools/plotting.py
index 7dee2ec..07706ea 100644
--- a/src/maldi_tools/plotting.py
+++ b/src/maldi_tools/plotting.py
@@ -184,8 +184,8 @@ def save_matched_peak_images(
         extraction_dir (Path): The directory to save extracted data in.
     """
     # Create image directories if they do not exist
-    float_dir: Path = extraction_dir / "library_matched" / "float"
-    int_dir: Path = extraction_dir / "library_matched" / "int"
+    float_dir: Path = Path(extraction_dir) / "library_matched" / "float"
+    int_dir: Path = Path(extraction_dir) / "library_matched" / "int"
     for img_dir in [float_dir, int_dir]:
         if not os.path.exists(img_dir):
             img_dir.mkdir(parents=True, exist_ok=True)
@@ -196,8 +196,8 @@ def save_matched_peak_images(
         if row.matched is True:
             peak_file_name: str = f"{row.lib_mz:.4f}".replace(".", "_") + ".tiff"
             # load in the corresponding float and integer images
-            float_img: np.ndarray = io.imread(extraction_dir / "float" / peak_file_name)
-            integer_img: np.ndarray = io.imread(extraction_dir / "int" / peak_file_name)
+            float_img: np.ndarray = io.imread(Path(extraction_dir) / "float" / peak_file_name)
+            integer_img: np.ndarray = io.imread(Path(extraction_dir) / "int" / peak_file_name)
 
             img_name: str = row.composition
 

From d240478c1cecee14dede4f32296312814b0f458b Mon Sep 17 00:00:00 2001
From: Alex Kong <alkong@ucdavis.edu>
Date: Tue, 26 Nov 2024 10:41:10 -0800
Subject: [PATCH 13/16] Ensure peaks can be visualized properly

---
 src/maldi_tools/plotting.py | 10 +++++++---
 tests/plotting_test.py      | 18 ++++++++++++++++++
 2 files changed, 25 insertions(+), 3 deletions(-)

diff --git a/src/maldi_tools/plotting.py b/src/maldi_tools/plotting.py
index 07706ea..51bd5dc 100644
--- a/src/maldi_tools/plotting.py
+++ b/src/maldi_tools/plotting.py
@@ -163,13 +163,17 @@ def plot_peak_hist(peak: float, bin_count: int, extraction_dir: Path) -> None:
         extraction_dir (Path): The directory the peak images are saved in
     """
     # verify that the peak provided exists
-    peak_path = extraction_dir / f"{str(peak).replace('.', '_')}.tiff"
+    peak_file: str = f"{peak:.4f}".replace(".", "_")
+    peak_file = peak_file + ".tiff"
+    peak_path = Path(extraction_dir) / "float" / peak_file
     if not os.path.exists(peak_path):
-        raise FileNotFoundError(f"Peak {peak} does not have a corresponding peak image in {extraction_dir}")
+        raise FileNotFoundError(
+            f"Peak {peak:.4f} does not have a corresponding peak image in {extraction_dir}"
+        )
 
     # load the peak image in and display histogram
     peak_img: np.ndarray = io.imread(peak_path)
-    plt.hist(peak_img.values, bins=bin_count)
+    plt.hist(peak_img, bins=bin_count)
 
 
 def save_matched_peak_images(
diff --git a/tests/plotting_test.py b/tests/plotting_test.py
index 279f37f..7ce0a64 100644
--- a/tests/plotting_test.py
+++ b/tests/plotting_test.py
@@ -5,6 +5,7 @@
 
 import numpy as np
 import pandas as pd
+import pytest
 import xarray as xr
 from skimage.io import imread
 
@@ -82,6 +83,23 @@ def test_save_peak_images(image_xr: xr.DataArray, tmp_path: pathlib.Path):
         assert os.path.exists(iname)
 
 
+def test_plot_peak_hist(image_xr: xr.DataArray, tmp_path: pathlib.Path):
+    extraction_dir = tmp_path / "extraction_dir"
+    extraction_dir.mkdir(parents=True, exist_ok=True)
+
+    # use arbitrary float peaks so the 4-decimal-place file naming is actually exercised
+    image_xr = image_xr.assign_coords(peak=np.random.rand(6) * 100)
+
+    plotting.save_peak_images(image_xr=image_xr, extraction_dir=extraction_dir)
+
+    # this test should run to completion, since the peak can be loaded
+    plotting.plot_peak_hist(peak=image_xr.peak.values[0], bin_count=30, extraction_dir=extraction_dir)
+
+    # this test should fail since the peak does not exist
+    with pytest.raises(FileNotFoundError):
+        plotting.plot_peak_hist(peak=50.0123, bin_count=30, extraction_dir=extraction_dir)
+
+
 def test_save_matched_peak_images(rng: np.random.Generator, image_xr: xr.DataArray, tmp_path: pathlib.Path):
     extraction_dir = tmp_path / "extraction_dir"
     extraction_dir.mkdir(parents=True, exist_ok=True)

From bcd217a72103eb35788261681062b613b2ef95a6 Mon Sep 17 00:00:00 2001
From: Alex Kong <alkong@ucdavis.edu>
Date: Tue, 26 Nov 2024 10:43:46 -0800
Subject: [PATCH 14/16] Ensure no failures if coordinate_integration run
 multiple times

---
 src/maldi_tools/extraction.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/src/maldi_tools/extraction.py b/src/maldi_tools/extraction.py
index b5c426e..47cea7c 100644
--- a/src/maldi_tools/extraction.py
+++ b/src/maldi_tools/extraction.py
@@ -262,8 +262,10 @@ def coordinate_integration(peak_df: pd.DataFrame, imz_data: ImzMLParser, extract
 
     float_peak_dir: Path = Path(extraction_dir) / "float"
     int_peak_dir: Path = Path(extraction_dir) / "int"
-    os.makedirs(float_peak_dir)
-    os.makedirs(int_peak_dir)
+    if not os.path.exists(float_peak_dir):
+        os.makedirs(float_peak_dir)
+    if not os.path.exists(int_peak_dir):
+        os.makedirs(int_peak_dir)
 
     for idx, (x, y, _) in tqdm(enumerate(imz_data.coordinates), total=len(imz_data.coordinates)):
         mzs, intensities = imz_data.getspectrum(idx)

From 42490353fe7700942e4c2ea8f1ec352739ac7bc0 Mon Sep 17 00:00:00 2001
From: Alex Kong <alkong@ucdavis.edu>
Date: Tue, 26 Nov 2024 10:44:31 -0800
Subject: [PATCH 15/16] Bring directory creation scheme in line with
 save_matched_peak_images

---
 src/maldi_tools/extraction.py | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/src/maldi_tools/extraction.py b/src/maldi_tools/extraction.py
index 47cea7c..f4d320a 100644
--- a/src/maldi_tools/extraction.py
+++ b/src/maldi_tools/extraction.py
@@ -262,10 +262,9 @@ def coordinate_integration(peak_df: pd.DataFrame, imz_data: ImzMLParser, extract
 
     float_peak_dir: Path = Path(extraction_dir) / "float"
     int_peak_dir: Path = Path(extraction_dir) / "int"
-    if not os.path.exists(float_peak_dir):
-        os.makedirs(float_peak_dir)
-    if not os.path.exists(int_peak_dir):
-        os.makedirs(int_peak_dir)
+    for img_dir in [float_peak_dir, int_peak_dir]:
+        if not os.path.exists(img_dir):
+            img_dir.mkdir(parents=True, exist_ok=True)
 
     for idx, (x, y, _) in tqdm(enumerate(imz_data.coordinates), total=len(imz_data.coordinates)):
         mzs, intensities = imz_data.getspectrum(idx)

From 25074c421e4d6a8ced2cc97c0862f1064fc75fe9 Mon Sep 17 00:00:00 2001
From: Alex Kong <alkong@ucdavis.edu>
Date: Tue, 26 Nov 2024 10:59:37 -0800
Subject: [PATCH 16/16] Make sure library matching loading uses Path correctly

---
 src/maldi_tools/extraction.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/maldi_tools/extraction.py b/src/maldi_tools/extraction.py
index f4d320a..8b65910 100644
--- a/src/maldi_tools/extraction.py
+++ b/src/maldi_tools/extraction.py
@@ -337,7 +337,7 @@ def library_matching(
         or not, the composition name and the mass error if a match was found or not.
     """
     peak_list: List[float] = [
-        float(p.replace("_", ".")) for p in remove_file_extensions(list_files(extraction_dir / "float"))
+        float(p.replace("_", ".")) for p in remove_file_extensions(list_files(Path(extraction_dir) / "float"))
     ]
     peak_df = pd.DataFrame({"peak": np.array(peak_list)})
     match_fun = partial(_matching_vec, library_peak_df=library_peak_df, ppm=ppm)