From 8e1dbc6abb24fc5646b89439398c3b60d409dea3 Mon Sep 17 00:00:00 2001 From: alex-l-kong Date: Wed, 28 Aug 2024 23:41:29 -0700 Subject: [PATCH 01/16] Update coordinate integration workflow to ensure memory efficiency --- src/maldi_tools/extraction.py | 45 +++++++++++++++++----------------- src/maldi_tools/plotting.py | 33 ++++++++++++++++++++----- templates/maldi-pipeline.ipynb | 39 ++++------------------------- 3 files changed, 55 insertions(+), 62 deletions(-) diff --git a/src/maldi_tools/extraction.py b/src/maldi_tools/extraction.py index cab87dd..7f34371 100644 --- a/src/maldi_tools/extraction.py +++ b/src/maldi_tools/extraction.py @@ -10,11 +10,11 @@ from functools import partial from operator import itemgetter from pathlib import Path -from typing import Dict, Tuple +from typing import Dict, List, Tuple import numpy as np import pandas as pd -import xarray as xr +from alpineer import image_utils, io_utils from pyimzml.ImzMLParser import ImzMLParser from scipy import signal from tqdm.notebook import tqdm @@ -232,21 +232,20 @@ def peak_spectra( return panel_df -def coordinate_integration(peak_df: pd.DataFrame, imz_data: ImzMLParser) -> xr.DataArray: +def coordinate_integration(peak_df: pd.DataFrame, imz_data: ImzMLParser, extraction_dir: Path) -> None: """Integrates the coordinates with the discovered, post-processed peaks and generates an image for each of the peaks using the imzML coordinate data. + Saves the peak images to specified extraction_dir. + Args: ---- peak_df (pd.DataFrame): The unique peaks from the data. imz_data (ImzMLParser): The imzML object. - - Returns: - ------- - xr.DataArray: A data structure which holds all the images for each peak. + extraction_dir (Path): The directory to save extracted data (peak images) in. 
""" unique_peaks = peak_df["peak"].unique() - peak_dict = dict(zip(unique_peaks, np.arange((len(unique_peaks))))) + dict(zip(unique_peaks, np.arange((len(unique_peaks))))) imz_coordinates: list = imz_data.coordinates @@ -255,23 +254,23 @@ def coordinate_integration(peak_df: pd.DataFrame, imz_data: ImzMLParser) -> xr.D image_shape: Tuple[int, int] = (x_size, y_size) - imgs = np.zeros((len(unique_peaks), *image_shape)) + os.makedirs(extraction_dir / "float") + os.makedirs(extraction_dir / "int") for idx, (x, y, _) in tqdm(enumerate(imz_data.coordinates), total=len(imz_data.coordinates)): + peak_img: np.ndarray = np.zeros((1, *image_shape)) mzs, intensities = imz_data.getspectrum(idx) - intensity: np.ndarray = intensities[np.isin(mzs, peak_df["m/z"])] for i_idx, peak in peak_df.loc[peak_df["m/z"].isin(mzs), "peak"].reset_index(drop=True).items(): - imgs[peak_dict[peak], x - 1, y - 1] += intensity[i_idx] - - img_data = xr.DataArray( - data=imgs, - coords={"peak": unique_peaks, "x": range(x_size), "y": range(y_size)}, - dims=["peak", "x", "y"], - ) - - return img_data + peak_img[x - 1, y - 1] += intensity[i_idx] + peak_img_float: np.ndarray = peak_img.T + peak_img_int: np.ndarray = (peak_img_float * (2**32 - 1) / np.max(peak_img_float)).astype( + np.uint32 + ) + img_name: str = f"{peak:.4f}".replace(".", "_") + image_utils.save_image(fname=extraction_dir / "float" / f"{img_name}.tiff", data=peak_img_float) + image_utils.save_image(fname=extraction_dir / "int" / f"{img_name}.tiff", data=peak_img_int) def _matching_vec(obs_mz: pd.Series, library_peak_df: pd.DataFrame, ppm: int) -> pd.Series: @@ -301,7 +300,6 @@ def _matching_vec(obs_mz: pd.Series, library_peak_df: pd.DataFrame, ppm: int) -> def library_matching( - image_xr: xr.DataArray, library_peak_df: pd.DataFrame, ppm: int, extraction_dir: Path, @@ -312,7 +310,6 @@ def library_matching( Args: ---- - image_xr (xr.DataArray): A data structure which holds all the images for each peak. 
library_peak_df (pd.DataFrame): The library of interest to match the observed peaks with. ppm (int): The ppm for an acceptable mass error range between the observed mass and any target mass in the library. @@ -324,7 +321,11 @@ def library_matching( pd.DataFrame: Contains the peak, the library target mass, a boolean stating if a match was found or not, the composition name and the mass error if a match was found or not. """ - peak_df = pd.DataFrame({"peak": image_xr.peak.to_numpy()}) + peak_list: List[float] = [ + float(p.replace("_", ".")) + for p in io_utils.remove_file_extensions(io_utils.list_files(extraction_dir / "float")) + ] + peak_df = pd.DataFrame({"peak": np.array(peak_list)}) match_fun = partial(_matching_vec, library_peak_df=library_peak_df, ppm=ppm) peak_df[["lib_mz", "matched", "composition", "mass_error"]] = peak_df["peak"].apply( diff --git a/src/maldi_tools/plotting.py b/src/maldi_tools/plotting.py index 88c0d6c..d6ef36a 100644 --- a/src/maldi_tools/plotting.py +++ b/src/maldi_tools/plotting.py @@ -11,6 +11,7 @@ import matplotlib.pyplot as plt import numpy as np import pandas as pd +import skimage.io as io import xarray as xr from alpineer import image_utils from tqdm.notebook import tqdm @@ -152,8 +153,26 @@ def save_peak_images(image_xr: xr.DataArray, extraction_dir: Path) -> None: image_utils.save_image(fname=int_dir / f"{img_name}.tiff", data=integer_img) +def plot_peak_hist(peak: float, bin_count: int, extraction_dir: Path) -> None: + """Plot a histogram of the intensities of a provided peak image. 
+ + Args: + ---- + peak (float): The desired peak to visualize + bin_count (int): The bin size to use for the histogram + extraction_dir (Path): The directory the peak images are saved in + """ + # verify that the peak provided exists + peak_path = extraction_dir / f"{str(peak).replace('.', '_')}.tiff" + if not os.path.exists(peak_path): + raise FileNotFoundError(f"Peak {peak} does not have a corresponding peak image in {extraction_dir}") + + # load the peak image in and display histogram + peak_img: np.ndarray = io.imread(peak_path) + plt.hist(peak_img.values, bins=bin_count) + + def save_matched_peak_images( - image_xr: xr.DataArray, matched_peaks_df: pd.DataFrame, extraction_dir: Path, ) -> None: @@ -161,7 +180,6 @@ def save_matched_peak_images( Args: ---- - image_xr (xr.DataArray): A data structure which holds all the images for each peak. matched_peaks_df (pd.DataFrame): A dataframe containing the peaks matched with the library. extraction_dir (Path): The directory to save extracted data in. 
""" @@ -175,10 +193,13 @@ def save_matched_peak_images( matched_peaks_df_filtered: pd.DataFrame = matched_peaks_df.dropna() for row in tqdm(matched_peaks_df_filtered.itertuples(), total=len(matched_peaks_df_filtered)): - image_index = row.Index - - float_img: np.ndarray = image_xr[image_index, ...].values.T - integer_img: np.ndarray = (float_img * (2**32 - 1) / np.max(float_img)).astype(np.uint32) + # load in the corresponding float and integer images + float_img: np.ndarray = io.imread( + extraction_dir / "float" / f"{str(row.lib_mz).replace('.', '_')}.tiff" + ) + integer_img: np.ndarray = io.imread( + extraction_dir / "int" / f"{str(row.lib_mz).replace('.', '_')}.tiff" + ) img_name: str = row.composition diff --git a/templates/maldi-pipeline.ipynb b/templates/maldi-pipeline.ipynb index 3bffb46..a6d19ce 100644 --- a/templates/maldi-pipeline.ipynb +++ b/templates/maldi-pipeline.ipynb @@ -466,18 +466,7 @@ }, "outputs": [], "source": [ - "image_data = extraction.coordinate_integration(peak_df=peak_df, imz_data=imz_data)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "image_data" + "extraction.coordinate_integration(peak_df=peak_df, imz_data=imz_data)" ] }, { @@ -509,18 +498,7 @@ }, "outputs": [], "source": [ - "image_data.sel(peak=[desired_peak_hist], method=\"nearest\").plot.hist(bins=bin_count)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "plotting.save_peak_images(image_xr=image_data, extraction_dir=extraction_dir)" + "plotting.plot_peak_hist(peak=desired_peak_hist, bin_count=bin_count)" ] }, { @@ -557,7 +535,7 @@ "outputs": [], "source": [ "matched_peaks_df = extraction.library_matching(\n", - " image_xr=image_data, library_peak_df=library_peak_df, ppm=ppm, extraction_dir=extraction_dir\n", + " library_peak_df=library_peak_df, ppm=ppm, extraction_dir=extraction_dir\n", ")" ] }, @@ -570,16 +548,9 
@@ "outputs": [], "source": [ "plotting.save_matched_peak_images(\n", - " image_xr=image_data, matched_peaks_df=matched_peaks_df, extraction_dir=extraction_dir\n", + " matched_peaks_df=matched_peaks_df, extraction_dir=extraction_dir\n", ")" ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] } ], "metadata": { @@ -598,7 +569,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.1" + "version": "3.11.6" }, "vscode": { "interpreter": { From e5029b4680d6cdaedb3ec087863014dcb6476248 Mon Sep 17 00:00:00 2001 From: alex-l-kong Date: Tue, 12 Nov 2024 13:44:49 -0800 Subject: [PATCH 02/16] Fix mypy check issues --- src/maldi_tools/extraction.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/src/maldi_tools/extraction.py b/src/maldi_tools/extraction.py index b23841f..2a6dee2 100644 --- a/src/maldi_tools/extraction.py +++ b/src/maldi_tools/extraction.py @@ -12,11 +12,12 @@ from functools import partial from operator import itemgetter from pathlib import Path -from typing import Dict, List, Tuple +from typing import Dict, List, Optional, Tuple import numpy as np import pandas as pd import xarray as xr +from alpineer.image_utils import save_image from alpineer.io_utils import list_files, remove_file_extensions, validate_paths from alpineer.misc_utils import verify_in_list from pyimzml.ImzMLParser import ImzMLParser @@ -78,7 +79,7 @@ def rolling_window( total_mass_df (pd.DataFrame): A dataframe containing all the masses and their relative intensities. intensity_percentile (int): The intensity for the quantile calculation. - window_size (int, optional): The sizve of the window for the rolling window method. + window_size (int): The sizve of the window for the rolling window method. Defaults to 5000. 
Returns: @@ -278,8 +279,8 @@ def coordinate_integration(peak_df: pd.DataFrame, imz_data: ImzMLParser, extract np.uint32 ) img_name: str = f"{peak:.4f}".replace(".", "_") - image_utils.save_image(fname=extraction_dir / "float" / f"{img_name}.tiff", data=peak_img_float) - image_utils.save_image(fname=extraction_dir / "int" / f"{img_name}.tiff", data=peak_img_int) + save_image(fname=extraction_dir / "float" / f"{img_name}.tiff", data=peak_img_float) + save_image(fname=extraction_dir / "int" / f"{img_name}.tiff", data=peak_img_int) def _matching_vec(obs_mz: pd.Series, library_peak_df: pd.DataFrame, ppm: int) -> pd.Series: @@ -325,7 +326,7 @@ def library_matching( ppm (int): The ppm for an acceptable mass error range between the observed mass and any target mass in the library. extraction_dir (Path): The directory to save extracted data in. - adducts (bool, optional): Add adducts together. Defaults to False. (Not implemented feature) + adducts (bool): Add adducts together. Defaults to False. 
(Not implemented feature) Returns: ------- @@ -334,7 +335,7 @@ def library_matching( """ peak_list: List[float] = [ float(p.replace("_", ".")) - for p in io_utils.remove_file_extensions(io_utils.list_files(extraction_dir / "float")) + for p in remove_file_extensions(list_files(extraction_dir / "float")) ] peak_df = pd.DataFrame({"peak": np.array(peak_list)}) match_fun = partial(_matching_vec, library_peak_df=library_peak_df, ppm=ppm) From 79758f4cf870835a0f22d9b71014eb8a274bde8e Mon Sep 17 00:00:00 2001 From: alex-l-kong Date: Tue, 12 Nov 2024 14:33:00 -0800 Subject: [PATCH 03/16] More linting fixes --- src/maldi_tools/extraction.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/maldi_tools/extraction.py b/src/maldi_tools/extraction.py index 2a6dee2..14dbef4 100644 --- a/src/maldi_tools/extraction.py +++ b/src/maldi_tools/extraction.py @@ -16,7 +16,6 @@ import numpy as np import pandas as pd -import xarray as xr from alpineer.image_utils import save_image from alpineer.io_utils import list_files, remove_file_extensions, validate_paths from alpineer.misc_utils import verify_in_list @@ -334,8 +333,7 @@ def library_matching( or not, the composition name and the mass error if a match was found or not. 
""" peak_list: List[float] = [ - float(p.replace("_", ".")) - for p in remove_file_extensions(list_files(extraction_dir / "float")) + float(p.replace("_", ".")) for p in remove_file_extensions(list_files(extraction_dir / "float")) ] peak_df = pd.DataFrame({"peak": np.array(peak_list)}) match_fun = partial(_matching_vec, library_peak_df=library_peak_df, ppm=ppm) From d16906853a03a3e9ca7d8a50760af344d86c3057 Mon Sep 17 00:00:00 2001 From: alex-l-kong Date: Tue, 12 Nov 2024 15:47:57 -0800 Subject: [PATCH 04/16] Update testing workflow to reflect new peak saving scheme --- tests/extraction_test.py | 16 +++++++++++----- tests/plotting_test.py | 3 ++- 2 files changed, 13 insertions(+), 6 deletions(-) diff --git a/tests/extraction_test.py b/tests/extraction_test.py index e2bd6f8..fe92ba5 100644 --- a/tests/extraction_test.py +++ b/tests/extraction_test.py @@ -14,7 +14,7 @@ from pytest import TempPathFactory from skimage.io import imread -from maldi_tools import extraction +from maldi_tools import extraction, plotting def test_extract_spectra(imz_data: ImzMLParser) -> None: @@ -100,11 +100,16 @@ def test_peak_spectra( assert os.path.exists(debug_dir / f"{peak.peak:.4f}.png".replace(".", "_", 1)) -def test_coordinate_integration(imz_data, peak_widths): +def test_coordinate_integration(imz_data, peak_widths, tmp_path: pathlib.Path): peak_df, *_ = peak_widths - img_data = extraction.coordinate_integration(peak_df=peak_df, imz_data=imz_data) + extraction_dir = tmp_path / "extraction_dir" + extraction.coordinate_integration( + peak_df=peak_df, imz_data=imz_data, extraction_dir=extraction_dir + ) # Make sure the shape of any given image is correct. 
+ test_peak_img = list_files(extraction_dir)[0] + img_data = imread(extraction_dir / test_peak_img) assert img_data.shape[1:] == (10, 10) @@ -124,12 +129,13 @@ def test__matching_vec(library: pd.DataFrame, obs_mz: int, true_values: pd.Serie @pytest.mark.parametrize(argnames="_ppm", argvalues=[99]) -def test_library_matching(image_xr: xr.DataArray, library: pd.DataFrame, _ppm: int, tmp_path: pathlib.Path): +def test_library_matching(library: pd.DataFrame, _ppm: int, tmp_path: pathlib.Path): extraction_dir = tmp_path / "extraction_dir" extraction_dir.mkdir(parents=True, exist_ok=True) + plotting.save_peak_images(image_xr, extraction_dir) peak_df: pd.DataFrame = extraction.library_matching( - image_xr=image_xr, library_peak_df=library, ppm=_ppm, extraction_dir=extraction_dir + library_peak_df=library, ppm=_ppm, extraction_dir=extraction_dir ) for idx, row in enumerate(peak_df.itertuples()): diff --git a/tests/plotting_test.py b/tests/plotting_test.py index e6ea52f..7fcb0ee 100644 --- a/tests/plotting_test.py +++ b/tests/plotting_test.py @@ -85,11 +85,12 @@ def test_save_peak_images(image_xr: xr.DataArray, tmp_path: pathlib.Path): def test_save_matched_peak_images(rng: np.random.Generator, image_xr: xr.DataArray, tmp_path: pathlib.Path): extraction_dir = tmp_path / "extraction_dir" extraction_dir.mkdir(parents=True, exist_ok=True) + plotting.save_peak_images(image_xr, extraction_dir) matched_peaks_df = pd.DataFrame(data={"composition": rng.random(size=(3,))}) plotting.save_matched_peak_images( - image_xr=image_xr, matched_peaks_df=matched_peaks_df, extraction_dir=extraction_dir + matched_peaks_df=matched_peaks_df, extraction_dir=extraction_dir ) for peak in matched_peaks_df.itertuples(): From 0b0975f92d29e446a665f1ba0564ccacd1f71522 Mon Sep 17 00:00:00 2001 From: alex-l-kong Date: Tue, 12 Nov 2024 15:51:59 -0800 Subject: [PATCH 05/16] Fix more tests --- tests/extraction_test.py | 8 ++++---- tests/plotting_test.py | 4 +--- 2 files changed, 5 insertions(+), 7 
deletions(-) diff --git a/tests/extraction_test.py b/tests/extraction_test.py index fe92ba5..c8dadd5 100644 --- a/tests/extraction_test.py +++ b/tests/extraction_test.py @@ -103,9 +103,7 @@ def test_peak_spectra( def test_coordinate_integration(imz_data, peak_widths, tmp_path: pathlib.Path): peak_df, *_ = peak_widths extraction_dir = tmp_path / "extraction_dir" - extraction.coordinate_integration( - peak_df=peak_df, imz_data=imz_data, extraction_dir=extraction_dir - ) + extraction.coordinate_integration(peak_df=peak_df, imz_data=imz_data, extraction_dir=extraction_dir) # Make sure the shape of any given image is correct. test_peak_img = list_files(extraction_dir)[0] @@ -129,7 +127,9 @@ def test__matching_vec(library: pd.DataFrame, obs_mz: int, true_values: pd.Serie @pytest.mark.parametrize(argnames="_ppm", argvalues=[99]) -def test_library_matching(library: pd.DataFrame, _ppm: int, tmp_path: pathlib.Path): +def test_library_matching( + library: pd.DataFrame, _ppm: image_xr: xr.DataArray, int, tmp_path: pathlib.Path +): extraction_dir = tmp_path / "extraction_dir" extraction_dir.mkdir(parents=True, exist_ok=True) plotting.save_peak_images(image_xr, extraction_dir) diff --git a/tests/plotting_test.py b/tests/plotting_test.py index 7fcb0ee..2893b77 100644 --- a/tests/plotting_test.py +++ b/tests/plotting_test.py @@ -89,9 +89,7 @@ def test_save_matched_peak_images(rng: np.random.Generator, image_xr: xr.DataArr matched_peaks_df = pd.DataFrame(data={"composition": rng.random(size=(3,))}) - plotting.save_matched_peak_images( - matched_peaks_df=matched_peaks_df, extraction_dir=extraction_dir - ) + plotting.save_matched_peak_images(matched_peaks_df=matched_peaks_df, extraction_dir=extraction_dir) for peak in matched_peaks_df.itertuples(): # Assert that the float and integer images are saved. 
From dec2c208a33f987a04ff7da5f41c0aa8163302f2 Mon Sep 17 00:00:00 2001 From: alex-l-kong Date: Wed, 13 Nov 2024 09:48:12 -0800 Subject: [PATCH 06/16] Fix test syntax --- tests/extraction_test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/extraction_test.py b/tests/extraction_test.py index c8dadd5..7d0f5a2 100644 --- a/tests/extraction_test.py +++ b/tests/extraction_test.py @@ -128,7 +128,7 @@ def test__matching_vec(library: pd.DataFrame, obs_mz: int, true_values: pd.Serie @pytest.mark.parametrize(argnames="_ppm", argvalues=[99]) def test_library_matching( - library: pd.DataFrame, _ppm: image_xr: xr.DataArray, int, tmp_path: pathlib.Path + library: pd.DataFrame, image_xr: xr.DataArray, _ppm: int, tmp_path: pathlib.Path ): extraction_dir = tmp_path / "extraction_dir" extraction_dir.mkdir(parents=True, exist_ok=True) From 0fec7436521004ac58df2437d36fb0bf48658ba9 Mon Sep 17 00:00:00 2001 From: alex-l-kong Date: Wed, 13 Nov 2024 10:03:13 -0800 Subject: [PATCH 07/16] Formatting of extraction test --- tests/extraction_test.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/tests/extraction_test.py b/tests/extraction_test.py index 7d0f5a2..be365cc 100644 --- a/tests/extraction_test.py +++ b/tests/extraction_test.py @@ -127,9 +127,7 @@ def test__matching_vec(library: pd.DataFrame, obs_mz: int, true_values: pd.Serie @pytest.mark.parametrize(argnames="_ppm", argvalues=[99]) -def test_library_matching( - library: pd.DataFrame, image_xr: xr.DataArray, _ppm: int, tmp_path: pathlib.Path -): +def test_library_matching(library: pd.DataFrame, image_xr: xr.DataArray, _ppm: int, tmp_path: pathlib.Path): extraction_dir = tmp_path / "extraction_dir" extraction_dir.mkdir(parents=True, exist_ok=True) plotting.save_peak_images(image_xr, extraction_dir) From 29c2b230c97cfb824257ba98dfabbeae7ee4adf7 Mon Sep 17 00:00:00 2001 From: Alex Kong Date: Wed, 20 Nov 2024 12:33:24 -0800 Subject: [PATCH 08/16] Update coordinate integration test to 
verify actual peak finding and saving --- conftest.py | 55 +++++++++++++++++++++++++++++++---- src/maldi_tools/extraction.py | 9 ++++-- tests/extraction_test.py | 45 ++++++++++++++++++---------- 3 files changed, 86 insertions(+), 23 deletions(-) diff --git a/conftest.py b/conftest.py index 484208c..8a94385 100644 --- a/conftest.py +++ b/conftest.py @@ -3,7 +3,7 @@ import json import os from pathlib import Path -from typing import Generator, List +from typing import Generator, List, Tuple import numpy as np import pandas as pd @@ -53,11 +53,44 @@ def imz_data(tmp_path_factory: TempPathFactory, rng: np.random.Generator) -> Imz yield ImzMLParser(filename=output_file_name) +@pytest.fixture(scope="session") +def imz_data_coord_int(tmp_path_factory: TempPathFactory, rng: np.random.Generator) -> ImzMLParser: + # Simplify the previous process for a single coordinate image (1x1) + img_dim: int = 1 + + # Generate random integers n for each coordinate (1 x 1). These will be used for creating + # random m/z and intensity values of length n. + # Lengths n are distributed along the standard gamma. 
+ ns: np.ndarray = np.rint(rng.standard_gamma(shape=2.5, size=(img_dim**2)) * 100).astype(int) + + # Generate random masses and sample different amounts of them, so we get duplicates + total_mzs: np.ndarray = (10000 - 100) * rng.random(size=img_dim**2 * 2) + 100 + + coords = [(x, y, 1) for x in range(1, img_dim + 1) for y in range(1, img_dim + 1)] + + output_file_name: Path = tmp_path_factory.mktemp("data") / "test_data.imzML" + + with ImzMLWriter(output_filename=output_file_name, mode="processed") as imzml: + for coord, n in zip(coords, ns): + # Masses: 100 <= mz < 10000, of length n, sampled randomly + mzs = rng.choice(a=total_mzs, size=n) + + # Intensities: 0 <= int < 1e8, of length n + ints: np.ndarray = rng.exponential(size=n) + + imzml.addSpectrum(mzs=mzs, intensities=ints, coords=coord) + + yield ImzMLParser(filename=output_file_name) + + @pytest.fixture(scope="session") def total_mass_df(rng: np.random.Generator) -> pd.DataFrame: mz_count: int = 10000 df = pd.DataFrame( - data={"m/z": np.linspace(start=1, stop=101, num=mz_count), "intensity": rng.random(size=mz_count)} + data={ + "m/z": np.linspace(start=1, stop=101, num=mz_count), + "intensity": rng.random(size=mz_count) + } ) yield df @@ -75,8 +108,8 @@ def percentile_intensities( @pytest.fixture(scope="session") def peak_idx_candidates( - total_mass_df: pd.DataFrame, percentile_intensities: tuple[np.ndarray, np.ndarray] -) -> Generator[tuple[np.ndarray, np.ndarray], None, None]: + total_mass_df: pd.DataFrame, percentile_intensities: Tuple[np.ndarray, np.ndarray] +) -> Generator[Tuple[np.ndarray, np.ndarray], None, None]: _, log_int_percentile = percentile_intensities peak_candidate_indexes, peak_candidates = extraction.signal_extraction( @@ -87,8 +120,8 @@ def peak_idx_candidates( @pytest.fixture(scope="session") def peak_widths( - total_mass_df, peak_idx_candidates -) -> Generator[tuple[pd.DataFrame, np.ndarray, np.ndarray, np.ndarray], None, None]: + total_mass_df: pd.DataFrame, peak_idx_candidates: 
Tuple[np.ndarray, np.ndarray] +) -> Generator[Tuple[pd.DataFrame, np.ndarray, np.ndarray, np.ndarray], None, None]: peak_candidate_idxs, peak_candidates = peak_idx_candidates peak_df, l_ips_r, r_ips_r, peak_widths_height = extraction.get_peak_widths( total_mass_df=total_mass_df, @@ -100,6 +133,16 @@ def peak_widths( yield (peak_df, l_ips_r, r_ips_r, peak_widths_height) +@pytest.fixture(scope="session") +def peak_widths_coord_int(imz_data_coord_int: ImzMLParser): + mzs, intensities = imz_data_coord_int.getspectrum(0) + peak_df = pd.DataFrame({"m/z": mzs, "intensity": intensities}) + peak_df["peak"] = (peak_df["m/z"] + 0.04).copy() + peak_df["peak_height"] = 0.001 + + yield peak_df + + @pytest.fixture(scope="session") def library() -> Generator[pd.DataFrame, None, None]: lib = pd.DataFrame(data={"mz": [30, 45], "composition": ["A", "B"]}) diff --git a/src/maldi_tools/extraction.py b/src/maldi_tools/extraction.py index 14dbef4..7e73f5c 100644 --- a/src/maldi_tools/extraction.py +++ b/src/maldi_tools/extraction.py @@ -254,7 +254,7 @@ def coordinate_integration(peak_df: pd.DataFrame, imz_data: ImzMLParser, extract extraction_dir (Path): The directory to save extracted data (peak images) in. 
""" unique_peaks = peak_df["peak"].unique() - dict(zip(unique_peaks, np.arange((len(unique_peaks))))) + peak_dict = dict(zip(unique_peaks, np.arange((len(unique_peaks))))) imz_coordinates: list = imz_data.coordinates @@ -267,11 +267,16 @@ def coordinate_integration(peak_df: pd.DataFrame, imz_data: ImzMLParser, extract os.makedirs(extraction_dir / "int") for idx, (x, y, _) in tqdm(enumerate(imz_data.coordinates), total=len(imz_data.coordinates)): - peak_img: np.ndarray = np.zeros((1, *image_shape)) mzs, intensities = imz_data.getspectrum(idx) intensity: np.ndarray = intensities[np.isin(mzs, peak_df["m/z"])] for i_idx, peak in peak_df.loc[peak_df["m/z"].isin(mzs), "peak"].reset_index(drop=True).items(): + img_name: str = f"{peak:.4f}".replace(".", "_") + if os.path.exists(extraction_dir / "float" / f"{img_name}.tiff"): + peak_img = imread(extraction_dir / "float" / f"{img_name}.tiff") + else: + peak_img: np.ndarray = np.zeros(image_shape) + peak_img[x - 1, y - 1] += intensity[i_idx] peak_img_float: np.ndarray = peak_img.T peak_img_int: np.ndarray = (peak_img_float * (2**32 - 1) / np.max(peak_img_float)).astype( diff --git a/tests/extraction_test.py b/tests/extraction_test.py index be365cc..d30a033 100644 --- a/tests/extraction_test.py +++ b/tests/extraction_test.py @@ -2,13 +2,13 @@ import os import pathlib -from typing import List +from typing import List, Tuple import numpy as np import pandas as pd import pytest import xarray as xr -from alpineer.io_utils import list_files, remove_file_extensions +from alpineer.io_utils import list_files, list_folders, remove_file_extensions from alpineer.misc_utils import verify_same_elements from pyimzml.ImzMLParser import ImzMLParser from pytest import TempPathFactory @@ -41,7 +41,7 @@ def test_rolling_window(total_mass_df: pd.DataFrame) -> None: def test_signal_extraction( - total_mass_df: pd.DataFrame, percentile_intensities: tuple[np.ndarray, np.ndarray] + total_mass_df: pd.DataFrame, percentile_intensities: 
Tuple[np.ndarray, np.ndarray] ) -> None: _, log_int_percentile = percentile_intensities peak_candidate_indexes, peak_candidates = extraction.signal_extraction( @@ -56,7 +56,7 @@ def test_signal_extraction( assert np.all(peak_candidates[1:] >= peak_candidates[:-1]) -def test_get_peak_widths(total_mass_df: pd.DataFrame, peak_idx_candidates: tuple[np.ndarray, np.ndarray]): +def test_get_peak_widths(total_mass_df: pd.DataFrame, peak_idx_candidates: Tuple[np.ndarray, np.ndarray]): peak_candidate_idxs, peak_candidates = peak_idx_candidates peak_df, l_ips_r, r_ips_r, peak_widths_height = extraction.get_peak_widths( total_mass_df=total_mass_df, @@ -73,8 +73,8 @@ def test_get_peak_widths(total_mass_df: pd.DataFrame, peak_idx_candidates: tuple def test_peak_spectra( total_mass_df: pd.DataFrame, - peak_idx_candidates: tuple[np.ndarray, np.ndarray], - peak_widths: tuple[pd.DataFrame, np.ndarray, np.ndarray, np.ndarray], + peak_idx_candidates: Tuple[np.ndarray, np.ndarray], + peak_widths: Tuple[pd.DataFrame, np.ndarray, np.ndarray, np.ndarray], tmp_path: pathlib.Path, ): debug_dir = tmp_path / "debug_dir" @@ -100,15 +100,29 @@ def test_peak_spectra( assert os.path.exists(debug_dir / f"{peak.peak:.4f}.png".replace(".", "_", 1)) -def test_coordinate_integration(imz_data, peak_widths, tmp_path: pathlib.Path): - peak_df, *_ = peak_widths +def test_coordinate_integration( + imz_data_coord_int: ImzMLParser, + peak_widths_coord_int: pd.DataFrame, + image_xr: xr.DataArray, + tmp_path: pathlib.Path +): + # peak_df, *_ = peak_widths extraction_dir = tmp_path / "extraction_dir" - extraction.coordinate_integration(peak_df=peak_df, imz_data=imz_data, extraction_dir=extraction_dir) - # Make sure the shape of any given image is correct. 
- test_peak_img = list_files(extraction_dir)[0] - img_data = imread(extraction_dir / test_peak_img) - assert img_data.shape[1:] == (10, 10) + extraction.coordinate_integration( + peak_df=peak_widths_coord_int, + imz_data=imz_data_coord_int, + extraction_dir=extraction_dir + ) + + # Make sure the shape of any given image is correct for both float and int + test_float_peak_img = list_files(extraction_dir / "float")[0] + float_img_data = imread(extraction_dir / "float" / test_float_peak_img) + assert float_img_data.shape == (1, 1) + + test_int_peak_img = list_files(extraction_dir / "int")[0] + int_img_data = imread(extraction_dir / "int" / test_int_peak_img) + assert int_img_data.shape == (1, 1) @pytest.mark.parametrize( @@ -137,15 +151,16 @@ def test_library_matching(library: pd.DataFrame, image_xr: xr.DataArray, _ppm: i ) for idx, row in enumerate(peak_df.itertuples()): - if idx < 4: + if row.peak not in {30.0, 45.0}: assert row.matched is False assert np.isnan(row.composition) assert np.isnan(row.mass_error) assert np.isnan(row.lib_mz) else: + assert row.matched is True assert row.mass_error == 0 assert row.composition in {"A", "B"} - assert row.peak in {30, 45} + assert row.lib_mz in {30.0, 45.0} def test_generate_glycan_mask( From a79c0d871ede665a16a02f508a88eaa42e6baa71 Mon Sep 17 00:00:00 2001 From: Alex Kong Date: Wed, 20 Nov 2024 12:52:21 -0800 Subject: [PATCH 09/16] Ensure float image gets re-transposed if loading back --- conftest.py | 5 +---- src/maldi_tools/extraction.py | 16 ++++++---------- tests/extraction_test.py | 8 +++----- 3 files changed, 10 insertions(+), 19 deletions(-) diff --git a/conftest.py b/conftest.py index 8a94385..fa50c69 100644 --- a/conftest.py +++ b/conftest.py @@ -87,10 +87,7 @@ def imz_data_coord_int(tmp_path_factory: TempPathFactory, rng: np.random.Generat def total_mass_df(rng: np.random.Generator) -> pd.DataFrame: mz_count: int = 10000 df = pd.DataFrame( - data={ - "m/z": np.linspace(start=1, stop=101, num=mz_count), - 
"intensity": rng.random(size=mz_count) - } + data={"m/z": np.linspace(start=1, stop=101, num=mz_count), "intensity": rng.random(size=mz_count)} ) yield df diff --git a/src/maldi_tools/extraction.py b/src/maldi_tools/extraction.py index 7e73f5c..e849f89 100644 --- a/src/maldi_tools/extraction.py +++ b/src/maldi_tools/extraction.py @@ -253,9 +253,6 @@ def coordinate_integration(peak_df: pd.DataFrame, imz_data: ImzMLParser, extract imz_data (ImzMLParser): The imzML object. extraction_dir (Path): The directory to save extracted data (peak images) in. """ - unique_peaks = peak_df["peak"].unique() - peak_dict = dict(zip(unique_peaks, np.arange((len(unique_peaks))))) - imz_coordinates: list = imz_data.coordinates x_size: int = max(imz_coordinates, key=itemgetter(0))[0] @@ -272,19 +269,18 @@ def coordinate_integration(peak_df: pd.DataFrame, imz_data: ImzMLParser, extract for i_idx, peak in peak_df.loc[peak_df["m/z"].isin(mzs), "peak"].reset_index(drop=True).items(): img_name: str = f"{peak:.4f}".replace(".", "_") - if os.path.exists(extraction_dir / "float" / f"{img_name}.tiff"): - peak_img = imread(extraction_dir / "float" / f"{img_name}.tiff") - else: - peak_img: np.ndarray = np.zeros(image_shape) + float_peak_path: Path = extraction_dir / "float" / f"{img_name}.tiff" + int_peak_path: Path = extraction_dir / "int" / f"{img_name}.tiff" + peak_exists: bool = os.path.exists(float_peak_path) + peak_img: np.ndarray = imread(float_peak_path).T if peak_exists else np.zeros(image_shape) peak_img[x - 1, y - 1] += intensity[i_idx] peak_img_float: np.ndarray = peak_img.T peak_img_int: np.ndarray = (peak_img_float * (2**32 - 1) / np.max(peak_img_float)).astype( np.uint32 ) - img_name: str = f"{peak:.4f}".replace(".", "_") - save_image(fname=extraction_dir / "float" / f"{img_name}.tiff", data=peak_img_float) - save_image(fname=extraction_dir / "int" / f"{img_name}.tiff", data=peak_img_int) + save_image(fname=float_peak_path, data=peak_img_float) + save_image(fname=int_peak_path, 
data=peak_img_int) def _matching_vec(obs_mz: pd.Series, library_peak_df: pd.DataFrame, ppm: int) -> pd.Series: diff --git a/tests/extraction_test.py b/tests/extraction_test.py index d30a033..e703a28 100644 --- a/tests/extraction_test.py +++ b/tests/extraction_test.py @@ -8,7 +8,7 @@ import pandas as pd import pytest import xarray as xr -from alpineer.io_utils import list_files, list_folders, remove_file_extensions +from alpineer.io_utils import list_files, remove_file_extensions from alpineer.misc_utils import verify_same_elements from pyimzml.ImzMLParser import ImzMLParser from pytest import TempPathFactory @@ -104,15 +104,13 @@ def test_coordinate_integration( imz_data_coord_int: ImzMLParser, peak_widths_coord_int: pd.DataFrame, image_xr: xr.DataArray, - tmp_path: pathlib.Path + tmp_path: pathlib.Path, ): # peak_df, *_ = peak_widths extraction_dir = tmp_path / "extraction_dir" extraction.coordinate_integration( - peak_df=peak_widths_coord_int, - imz_data=imz_data_coord_int, - extraction_dir=extraction_dir + peak_df=peak_widths_coord_int, imz_data=imz_data_coord_int, extraction_dir=extraction_dir ) # Make sure the shape of any given image is correct for both float and int From 2e73b8d513b55e5886731410d21b7844fa800484 Mon Sep 17 00:00:00 2001 From: Alex Kong Date: Wed, 20 Nov 2024 14:59:52 -0800 Subject: [PATCH 10/16] Add more robust testing for library matching and saving out matched peak images --- src/maldi_tools/plotting.py | 22 ++++++++++----------- tests/plotting_test.py | 38 +++++++++++++++++++++++++++++++++---- 2 files changed, 44 insertions(+), 16 deletions(-) diff --git a/src/maldi_tools/plotting.py b/src/maldi_tools/plotting.py index d6ef36a..7dee2ec 100644 --- a/src/maldi_tools/plotting.py +++ b/src/maldi_tools/plotting.py @@ -193,18 +193,16 @@ def save_matched_peak_images( matched_peaks_df_filtered: pd.DataFrame = matched_peaks_df.dropna() for row in tqdm(matched_peaks_df_filtered.itertuples(), total=len(matched_peaks_df_filtered)): - # load in the 
corresponding float and integer images - float_img: np.ndarray = io.imread( - extraction_dir / "float" / f"{str(row.lib_mz).replace('.', '_')}.tiff" - ) - integer_img: np.ndarray = io.imread( - extraction_dir / "int" / f"{str(row.lib_mz).replace('.', '_')}.tiff" - ) + if row.matched is True: + peak_file_name: str = f"{row.lib_mz:.4f}".replace(".", "_") + ".tiff" + # load in the corresponding float and integer images + float_img: np.ndarray = io.imread(extraction_dir / "float" / peak_file_name) + integer_img: np.ndarray = io.imread(extraction_dir / "int" / peak_file_name) - img_name: str = row.composition + img_name: str = row.composition - # save floating point image - image_utils.save_image(fname=float_dir / f"{img_name}.tiff", data=float_img) + # save floating point image + image_utils.save_image(fname=float_dir / f"{img_name}.tiff", data=float_img) - # save integer image - image_utils.save_image(fname=int_dir / f"{img_name}.tiff", data=integer_img) + # save integer image + image_utils.save_image(fname=int_dir / f"{img_name}.tiff", data=integer_img) diff --git a/tests/plotting_test.py b/tests/plotting_test.py index 2893b77..279f37f 100644 --- a/tests/plotting_test.py +++ b/tests/plotting_test.py @@ -87,11 +87,41 @@ def test_save_matched_peak_images(rng: np.random.Generator, image_xr: xr.DataArr extraction_dir.mkdir(parents=True, exist_ok=True) plotting.save_peak_images(image_xr, extraction_dir) - matched_peaks_df = pd.DataFrame(data={"composition": rng.random(size=(3,))}) + peaks = image_xr.peak.values + img_shape = (image_xr.shape[1], image_xr.shape[2]) + matched = [False] * len(peaks) + matched[-1] = True + composition = [np.nan] * len(peaks) + composition[-1] = rng.random(size=(1,)) + mass_error = [np.nan] * len(peaks) + mass_error[-1] = rng.random(size=(1,)) + matched_peaks_df = pd.DataFrame( + data={ + "lib_mz": peaks, + "matched": matched, + "composition": composition, + "mass_error": mass_error, + } + ) 
plotting.save_matched_peak_images(matched_peaks_df=matched_peaks_df, extraction_dir=extraction_dir) for peak in matched_peaks_df.itertuples(): - # Assert that the float and integer images are saved. - assert os.path.exists(extraction_dir / "library_matched" / "float" / f"{peak.composition}.tiff") - assert os.path.exists(extraction_dir / "library_matched" / "int" / f"{peak.composition}.tiff") + float_peak_path = extraction_dir / "library_matched" / "float" / f"{peak.composition}.tiff" + int_peak_path = extraction_dir / "library_matched" / "int" / f"{peak.composition}.tiff" + + # Assert that the float and integer images are saved for all matched peaks. + # Check that the peak images match the desired shape + if peak.matched is True: + assert os.path.exists(float_peak_path) + assert os.path.exists(int_peak_path) + + matched_float_img = imread(float_peak_path) + matched_int_img = imread(int_peak_path) + + assert matched_float_img.shape == img_shape + assert matched_int_img.shape == img_shape + # Otherwise, ensure peaks are not saved + else: + assert not os.path.exists(float_peak_path) + assert not os.path.exists(int_peak_path) From 31de1a23458cd67f9d2ce1e3d9b91de7dc0cf2ea Mon Sep 17 00:00:00 2001 From: Alex Kong Date: Wed, 20 Nov 2024 15:05:08 -0800 Subject: [PATCH 11/16] Update documentation and function call to coordinate_integration to match changes to workflow --- templates/maldi-pipeline.ipynb | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/templates/maldi-pipeline.ipynb b/templates/maldi-pipeline.ipynb index 31be7e0..27e2318 100644 --- a/templates/maldi-pipeline.ipynb +++ b/templates/maldi-pipeline.ipynb @@ -433,7 +433,7 @@ "source": [ "## Integrate Coordinates\n", "\n", - "Generate the images and save them in an *xarray*, where the dimensions are: Image (indexed by peak value), $x$, and $y$." + "Generate the images and save them as TIFFs in `extraction_dir`. 
Each file is named after its corresponding peak m/z value, rounded to 4 decimal places. The dimensions of each image correspond to the maximum x- and y-coordinates extracted from the slide." ] }, { @@ -444,7 +444,7 @@ }, "outputs": [], "source": [ - "extraction.coordinate_integration(peak_df=peak_df, imz_data=imz_data)" + "extraction.coordinate_integration(peak_df=peak_df, imz_data=imz_data, extraction_dir=extraction_dir)" ] }, { @@ -490,7 +490,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "### Constants" + "Set a value for the maximum ppm tolerance between a peak and its corresponding match in the `library_peak_df` specified. Matched peak images are saved as TIFFs to the `library_matched` subfolder inside `extraction_dir` and are named after the composition of their matched library peak." ] }, { @@ -677,7 +677,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.6" + "version": "3.11.10" }, "vscode": { "interpreter": { From 873fed59bf40c34cab7b13cae25bb06a09aafde7 Mon Sep 17 00:00:00 2001 From: Alex Kong Date: Tue, 26 Nov 2024 10:23:58 -0800 Subject: [PATCH 12/16] Ensure extraction_dir is absolutely converted to a Path type --- src/maldi_tools/extraction.py | 10 ++++++---- src/maldi_tools/plotting.py | 8 ++++---- 2 files changed, 10 insertions(+), 8 deletions(-) diff --git a/src/maldi_tools/extraction.py b/src/maldi_tools/extraction.py index e849f89..b5c426e 100644 --- a/src/maldi_tools/extraction.py +++ b/src/maldi_tools/extraction.py @@ -260,8 +260,10 @@ def coordinate_integration(peak_df: pd.DataFrame, imz_data: ImzMLParser, extract image_shape: Tuple[int, int] = (x_size, y_size) - os.makedirs(extraction_dir / "float") - os.makedirs(extraction_dir / "int") + float_peak_dir: Path = Path(extraction_dir) / "float" + int_peak_dir: Path = Path(extraction_dir) / "int" + os.makedirs(float_peak_dir) + os.makedirs(int_peak_dir) for idx, (x, y, _) in tqdm(enumerate(imz_data.coordinates), 
total=len(imz_data.coordinates)): mzs, intensities = imz_data.getspectrum(idx) @@ -269,8 +271,8 @@ def coordinate_integration(peak_df: pd.DataFrame, imz_data: ImzMLParser, extract for i_idx, peak in peak_df.loc[peak_df["m/z"].isin(mzs), "peak"].reset_index(drop=True).items(): img_name: str = f"{peak:.4f}".replace(".", "_") - float_peak_path: Path = extraction_dir / "float" / f"{img_name}.tiff" - int_peak_path: Path = extraction_dir / "int" / f"{img_name}.tiff" + float_peak_path: Path = float_peak_dir / f"{img_name}.tiff" + int_peak_path: Path = int_peak_dir / f"{img_name}.tiff" peak_exists: bool = os.path.exists(float_peak_path) peak_img: np.ndarray = imread(float_peak_path).T if peak_exists else np.zeros(image_shape) diff --git a/src/maldi_tools/plotting.py b/src/maldi_tools/plotting.py index 7dee2ec..07706ea 100644 --- a/src/maldi_tools/plotting.py +++ b/src/maldi_tools/plotting.py @@ -184,8 +184,8 @@ def save_matched_peak_images( extraction_dir (Path): The directory to save extracted data in. 
""" # Create image directories if they do not exist - float_dir: Path = extraction_dir / "library_matched" / "float" - int_dir: Path = extraction_dir / "library_matched" / "int" + float_dir: Path = Path(extraction_dir) / "library_matched" / "float" + int_dir: Path = Path(extraction_dir) / "library_matched" / "int" for img_dir in [float_dir, int_dir]: if not os.path.exists(img_dir): img_dir.mkdir(parents=True, exist_ok=True) @@ -196,8 +196,8 @@ def save_matched_peak_images( if row.matched is True: peak_file_name: str = f"{row.lib_mz:.4f}".replace(".", "_") + ".tiff" # load in the corresponding float and integer images - float_img: np.ndarray = io.imread(extraction_dir / "float" / peak_file_name) - integer_img: np.ndarray = io.imread(extraction_dir / "int" / peak_file_name) + float_img: np.ndarray = io.imread(Path(extraction_dir) / "float" / peak_file_name) + integer_img: np.ndarray = io.imread(Path(extraction_dir) / "int" / peak_file_name) img_name: str = row.composition From d240478c1cecee14dede4f32296312814b0f458b Mon Sep 17 00:00:00 2001 From: Alex Kong Date: Tue, 26 Nov 2024 10:41:10 -0800 Subject: [PATCH 13/16] Ensure peaks can be visualized properly --- src/maldi_tools/plotting.py | 10 +++++++--- tests/plotting_test.py | 18 ++++++++++++++++++ 2 files changed, 25 insertions(+), 3 deletions(-) diff --git a/src/maldi_tools/plotting.py b/src/maldi_tools/plotting.py index 07706ea..51bd5dc 100644 --- a/src/maldi_tools/plotting.py +++ b/src/maldi_tools/plotting.py @@ -163,13 +163,17 @@ def plot_peak_hist(peak: float, bin_count: int, extraction_dir: Path) -> None: extraction_dir (Path): The directory the peak images are saved in """ # verify that the peak provided exists - peak_path = extraction_dir / f"{str(peak).replace('.', '_')}.tiff" + peak_file: str = f"{peak:.4f}".replace(".", "_") + peak_file = peak_file + ".tiff" + peak_path = Path(extraction_dir) / "float" / peak_file if not os.path.exists(peak_path): - raise FileNotFoundError(f"Peak {peak} does not have a 
corresponding peak image in {extraction_dir}") + raise FileNotFoundError( + f"Peak {peak:.4f} does not have a corresponding peak image in {extraction_dir}" + ) # load the peak image in and display histogram peak_img: np.ndarray = io.imread(peak_path) - plt.hist(peak_img.values, bins=bin_count) + plt.hist(peak_img, bins=bin_count) def save_matched_peak_images( diff --git a/tests/plotting_test.py b/tests/plotting_test.py index 279f37f..7ce0a64 100644 --- a/tests/plotting_test.py +++ b/tests/plotting_test.py @@ -5,6 +5,7 @@ import numpy as np import pandas as pd +import pytest import xarray as xr from skimage.io import imread @@ -82,6 +83,23 @@ def test_save_peak_images(image_xr: xr.DataArray, tmp_path: pathlib.Path): assert os.path.exists(iname) +def test_plot_peak_hist(image_xr: xr.DataArray, tmp_path: pathlib.Path): + extraction_dir = tmp_path / "extraction_dir" + extraction_dir.mkdir(parents=True, exist_ok=True) + + # ensure the test actually truncates to 4 digits correctly + image_xr = image_xr.assign_coords(peak=np.random.rand(6) * 100) + + plotting.save_peak_images(image_xr=image_xr, extraction_dir=extraction_dir) + + # this test should run to completion, since the peak can be loaded + plotting.plot_peak_hist(peak=image_xr.peak.values[0], bin_count=30, extraction_dir=extraction_dir) + + # this test should fail since the peak does not exist + with pytest.raises(FileNotFoundError): + plotting.plot_peak_hist(peak=50.0123, bin_count=30, extraction_dir=extraction_dir) + + def test_save_matched_peak_images(rng: np.random.Generator, image_xr: xr.DataArray, tmp_path: pathlib.Path): extraction_dir = tmp_path / "extraction_dir" extraction_dir.mkdir(parents=True, exist_ok=True) From bcd217a72103eb35788261681062b613b2ef95a6 Mon Sep 17 00:00:00 2001 From: Alex Kong Date: Tue, 26 Nov 2024 10:43:46 -0800 Subject: [PATCH 14/16] Ensure no failures if coordinate_integration run multiple times --- src/maldi_tools/extraction.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 
deletions(-) diff --git a/src/maldi_tools/extraction.py b/src/maldi_tools/extraction.py index b5c426e..47cea7c 100644 --- a/src/maldi_tools/extraction.py +++ b/src/maldi_tools/extraction.py @@ -262,8 +262,10 @@ def coordinate_integration(peak_df: pd.DataFrame, imz_data: ImzMLParser, extract float_peak_dir: Path = Path(extraction_dir) / "float" int_peak_dir: Path = Path(extraction_dir) / "int" - os.makedirs(float_peak_dir) - os.makedirs(int_peak_dir) + if not os.path.exists(float_peak_dir): + os.makedirs(float_peak_dir) + if not os.path.exists(int_peak_dir): + os.makedirs(int_peak_dir) for idx, (x, y, _) in tqdm(enumerate(imz_data.coordinates), total=len(imz_data.coordinates)): mzs, intensities = imz_data.getspectrum(idx) From 42490353fe7700942e4c2ea8f1ec352739ac7bc0 Mon Sep 17 00:00:00 2001 From: Alex Kong Date: Tue, 26 Nov 2024 10:44:31 -0800 Subject: [PATCH 15/16] Bring directory creation scheme in line with save_matched_peak_images --- src/maldi_tools/extraction.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/src/maldi_tools/extraction.py b/src/maldi_tools/extraction.py index 47cea7c..f4d320a 100644 --- a/src/maldi_tools/extraction.py +++ b/src/maldi_tools/extraction.py @@ -262,10 +262,9 @@ def coordinate_integration(peak_df: pd.DataFrame, imz_data: ImzMLParser, extract float_peak_dir: Path = Path(extraction_dir) / "float" int_peak_dir: Path = Path(extraction_dir) / "int" - if not os.path.exists(float_peak_dir): - os.makedirs(float_peak_dir) - if not os.path.exists(int_peak_dir): - os.makedirs(int_peak_dir) + for img_dir in [float_peak_dir, int_peak_dir]: + if not os.path.exists(img_dir): + img_dir.mkdir(parents=True, exist_ok=True) for idx, (x, y, _) in tqdm(enumerate(imz_data.coordinates), total=len(imz_data.coordinates)): mzs, intensities = imz_data.getspectrum(idx) From 25074c421e4d6a8ced2cc97c0862f1064fc75fe9 Mon Sep 17 00:00:00 2001 From: Alex Kong Date: Tue, 26 Nov 2024 10:59:37 -0800 Subject: [PATCH 16/16] Make sure library 
matching loading uses Path correctly --- src/maldi_tools/extraction.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/maldi_tools/extraction.py b/src/maldi_tools/extraction.py index f4d320a..8b65910 100644 --- a/src/maldi_tools/extraction.py +++ b/src/maldi_tools/extraction.py @@ -337,7 +337,7 @@ def library_matching( or not, the composition name and the mass error if a match was found or not. """ peak_list: List[float] = [ - float(p.replace("_", ".")) for p in remove_file_extensions(list_files(extraction_dir / "float")) + float(p.replace("_", ".")) for p in remove_file_extensions(list_files(Path(extraction_dir) / "float")) ] peak_df = pd.DataFrame({"peak": np.array(peak_list)}) match_fun = partial(_matching_vec, library_peak_df=library_peak_df, ppm=ppm)