angelolab · alex-l-kong · Aug 29, 2024 · Nov 12, 2024 · Nov 12, 2024 · Nov 12, 2024
diff --git a/conftest.py b/conftest.py
@@ -3,7 +3,7 @@
 import json
 import os
 from pathlib import Path
-from typing import Generator, List
+from typing import Generator, List, Tuple
 
 import numpy as np
 import pandas as pd
@@ -53,6 +53,36 @@ def imz_data(tmp_path_factory: TempPathFactory, rng: np.random.Generator) -> Imz
     yield ImzMLParser(filename=output_file_name)
 
 
+@pytest.fixture(scope="session")
+def imz_data_coord_int(tmp_path_factory: TempPathFactory, rng: np.random.Generator) -> ImzMLParser:
+    # Simplify the previous process for a single coordinate image (1x1)
+    img_dim: int = 1
+
+    # Generate random integers n for each coordinate (1 x 1). These will be used for creating
+    # random m/z and intensity values of length n.
+    # Lengths n are distributed along the standard gamma.
+    ns: np.ndarray = np.rint(rng.standard_gamma(shape=2.5, size=(img_dim**2)) * 100).astype(int)
+
+    # Generate random masses and sample different amounts of them, so we get duplicates
+    total_mzs: np.ndarray = (10000 - 100) * rng.random(size=img_dim**2 * 2) + 100
+
+    coords = [(x, y, 1) for x in range(1, img_dim + 1) for y in range(1, img_dim + 1)]
+
+    output_file_name: Path = tmp_path_factory.mktemp("data") / "test_data.imzML"
+
+    with ImzMLWriter(output_filename=output_file_name, mode="processed") as imzml:
+        for coord, n in zip(coords, ns):
+            # Masses: 100 <= mz < 10000, of length n, sampled randomly
+            mzs = rng.choice(a=total_mzs, size=n)
+
+            # Intensities: 0 <= int < 1e8, of length n
+            ints: np.ndarray = rng.exponential(size=n)
+
+            imzml.addSpectrum(mzs=mzs, intensities=ints, coords=coord)
+
+    yield ImzMLParser(filename=output_file_name)
+
+
 @pytest.fixture(scope="session")
 def total_mass_df(rng: np.random.Generator) -> pd.DataFrame:
     mz_count: int = 10000
@@ -75,8 +105,8 @@ def percentile_intensities(
 
 @pytest.fixture(scope="session")
 def peak_idx_candidates(
-    total_mass_df: pd.DataFrame, percentile_intensities: tuple[np.ndarray, np.ndarray]
-) -> Generator[tuple[np.ndarray, np.ndarray], None, None]:
+    total_mass_df: pd.DataFrame, percentile_intensities: Tuple[np.ndarray, np.ndarray]
+) -> Generator[Tuple[np.ndarray, np.ndarray], None, None]:
     _, log_int_percentile = percentile_intensities
 
     peak_candidate_indexes, peak_candidates = extraction.signal_extraction(
@@ -87,8 +117,8 @@ def peak_idx_candidates(
 
 @pytest.fixture(scope="session")
 def peak_widths(
-    total_mass_df, peak_idx_candidates
-) -> Generator[tuple[pd.DataFrame, np.ndarray, np.ndarray, np.ndarray], None, None]:
+    total_mass_df: pd.DataFrame, peak_idx_candidates: Tuple[np.ndarray, np.ndarray]
+) -> Generator[Tuple[pd.DataFrame, np.ndarray, np.ndarray, np.ndarray], None, None]:
     peak_candidate_idxs, peak_candidates = peak_idx_candidates
     peak_df, l_ips_r, r_ips_r, peak_widths_height = extraction.get_peak_widths(
         total_mass_df=total_mass_df,
@@ -100,6 +130,16 @@ def peak_widths(
     yield (peak_df, l_ips_r, r_ips_r, peak_widths_height)
 
 
+@pytest.fixture(scope="session")
+def peak_widths_coord_int(imz_data_coord_int: ImzMLParser):
+    mzs, intensities = imz_data_coord_int.getspectrum(0)
+    peak_df = pd.DataFrame({"m/z": mzs, "intensity": intensities})
+    peak_df["peak"] = (peak_df["m/z"] + 0.04).copy()
+    peak_df["peak_height"] = 0.001
+
+    yield peak_df
+
+
 @pytest.fixture(scope="session")
 def library() -> Generator[pd.DataFrame, None, None]:
     lib = pd.DataFrame(data={"mz": [30, 45], "composition": ["A", "B"]})

diff --git a/src/maldi_tools/extraction.py b/src/maldi_tools/extraction.py
@@ -16,7 +16,7 @@
 
 import numpy as np
 import pandas as pd
-import xarray as xr
+from alpineer.image_utils import save_image
 from alpineer.io_utils import list_files, remove_file_extensions, validate_paths
 from alpineer.misc_utils import verify_in_list
 from pyimzml.ImzMLParser import ImzMLParser
@@ -27,7 +27,7 @@
 from maldi_tools import plotting
 
 
-def extract_spectra(imz_data: ImzMLParser, intensity_percentile: int) -> tuple[pd.DataFrame, np.ndarray]:
+def extract_spectra(imz_data: ImzMLParser, intensity_percentile: int) -> Tuple[pd.DataFrame, np.ndarray]:
     """Iterates over all coordinates after opening the `imzML` data and extracts all masses,
     and sums the intensities for all masses. Creates an intensity image, thresholded on
     `intensity_percentile` with `np.percentile`. The masses are then sorted.
@@ -39,7 +39,7 @@ def extract_spectra(imz_data: ImzMLParser, intensity_percentile: int) -> tuple[p
 
     Returns:
     -------
-        tuple[pd.DataFrame, np.ndarray]: A tuple where the first element is the dataframe containing
+        Tuple[pd.DataFrame, np.ndarray]: A tuple where the first element is the dataframe containing
         the total masses and their intensities, and the second element is the thresholds matrix
         of the image.
     """
@@ -70,20 +70,20 @@ def extract_spectra(imz_data: ImzMLParser, intensity_percentile: int) -> tuple[p
 
 def rolling_window(
     total_mass_df: pd.DataFrame, intensity_percentile: int, window_size: int = 5000
-) -> tuple[np.ndarray, np.ndarray]:
+) -> Tuple[np.ndarray, np.ndarray]:
     """Computes the rolling window log intensities and the rolling window log intensity percentiles.
 
     Args:
     ----
         total_mass_df (pd.DataFrame): A dataframe containing all the masses and their
             relative intensities.
         intensity_percentile (int): The intensity for the quantile calculation.
-        window_size (int, optional): The sizve of the window for the rolling window method.
+        window_size (int): The sizve of the window for the rolling window method.
             Defaults to 5000.
 
     Returns:
     -------
-        tuple[np.ndarray, np.ndarray]: A tuple where the first element is the log intensities,
+        Tuple[np.ndarray, np.ndarray]: A tuple where the first element is the log intensities,
             and the second element is the log intensity percentiles.
     """
     plt_range_min_ind: int = 0
@@ -241,46 +241,49 @@ def peak_spectra(
     return panel_df
 
 
-def coordinate_integration(peak_df: pd.DataFrame, imz_data: ImzMLParser) -> xr.DataArray:
+def coordinate_integration(peak_df: pd.DataFrame, imz_data: ImzMLParser, extraction_dir: Path) -> None:
     """Integrates the coordinates with the discovered, post-processed peaks and generates an image for
     each of the peaks using the imzML coordinate data.
 
+    Saves the peak images to specified extraction_dir.
+
     Args:
     ----
         peak_df (pd.DataFrame): The unique peaks from the data.
         imz_data (ImzMLParser): The imzML object.
-
-    Returns:
-    -------
-        xr.DataArray: A data structure which holds all the images for each peak.
+        extraction_dir (Path): The directory to save extracted data (peak images) in.
     """
-    unique_peaks = peak_df["peak"].unique()
-    peak_dict = dict(zip(unique_peaks, np.arange((len(unique_peaks)))))
-
     imz_coordinates: list = imz_data.coordinates
 
     x_size: int = max(imz_coordinates, key=itemgetter(0))[0]
     y_size: int = max(imz_coordinates, key=itemgetter(1))[1]
 
     image_shape: Tuple[int, int] = (x_size, y_size)
 
-    imgs = np.zeros((len(unique_peaks), *image_shape))
+    float_peak_dir: Path = Path(extraction_dir) / "float"
+    int_peak_dir: Path = Path(extraction_dir) / "int"
+    for img_dir in [float_peak_dir, int_peak_dir]:
+        if not os.path.exists(img_dir):
+            img_dir.mkdir(parents=True, exist_ok=True)
 
     for idx, (x, y, _) in tqdm(enumerate(imz_data.coordinates), total=len(imz_data.coordinates)):
         mzs, intensities = imz_data.getspectrum(idx)
-
         intensity: np.ndarray = intensities[np.isin(mzs, peak_df["m/z"])]
 
         for i_idx, peak in peak_df.loc[peak_df["m/z"].isin(mzs), "peak"].reset_index(drop=True).items():
-            imgs[peak_dict[peak], x - 1, y - 1] += intensity[i_idx]
-
-    img_data = xr.DataArray(
-        data=imgs,
-        coords={"peak": unique_peaks, "x": range(x_size), "y": range(y_size)},
-        dims=["peak", "x", "y"],
-    )
-
-    return img_data
+            img_name: str = f"{peak:.4f}".replace(".", "_")
+            float_peak_path: Path = float_peak_dir / f"{img_name}.tiff"
+            int_peak_path: Path = int_peak_dir / f"{img_name}.tiff"
+            peak_exists: bool = os.path.exists(float_peak_path)
+            peak_img: np.ndarray = imread(float_peak_path).T if peak_exists else np.zeros(image_shape)
+
+            peak_img[x - 1, y - 1] += intensity[i_idx]
+            peak_img_float: np.ndarray = peak_img.T
+            peak_img_int: np.ndarray = (peak_img_float * (2**32 - 1) / np.max(peak_img_float)).astype(
+                np.uint32
+            )
+            save_image(fname=float_peak_path, data=peak_img_float)
+            save_image(fname=int_peak_path, data=peak_img_int)
 
 
 def _matching_vec(obs_mz: pd.Series, library_peak_df: pd.DataFrame, ppm: int) -> pd.Series:
@@ -312,7 +315,6 @@ def _matching_vec(obs_mz: pd.Series, library_peak_df: pd.DataFrame, ppm: int) ->
 
 
 def library_matching(
-    image_xr: xr.DataArray,
     library_peak_df: pd.DataFrame,
     ppm: int,
     extraction_dir: Path,
@@ -323,19 +325,21 @@ def library_matching(
 
     Args:
     ----
-        image_xr (xr.DataArray): A data structure which holds all the images for each peak.
         library_peak_df (pd.DataFrame): The library of interest to match the observed peaks with.
         ppm (int): The ppm for an acceptable mass error range between the observed mass and any target
         mass in the library.
         extraction_dir (Path): The directory to save extracted data in.
-        adducts (bool, optional): Add adducts together. Defaults to False. (Not implemented feature)
+        adducts (bool): Add adducts together. Defaults to False. (Not implemented feature)
 
     Returns:
     -------
         pd.DataFrame: Contains the peak, the library target mass, a boolean stating if a match was found
         or not, the composition name and the mass error if a match was found or not.
     """
-    peak_df = pd.DataFrame({"peak": image_xr.peak.to_numpy()})
+    peak_list: List[float] = [
+        float(p.replace("_", ".")) for p in remove_file_extensions(list_files(Path(extraction_dir) / "float"))
+    ]
+    peak_df = pd.DataFrame({"peak": np.array(peak_list)})
     match_fun = partial(_matching_vec, library_peak_df=library_peak_df, ppm=ppm)
 
     peak_df[["lib_mz", "matched", "composition", "mass_error"]] = peak_df["peak"].apply(

diff --git a/src/maldi_tools/plotting.py b/src/maldi_tools/plotting.py
@@ -11,6 +11,7 @@
 import matplotlib.pyplot as plt
 import numpy as np
 import pandas as pd
+import skimage.io as io
 import xarray as xr
 from alpineer import image_utils
 from tqdm.notebook import tqdm
@@ -152,38 +153,60 @@ def save_peak_images(image_xr: xr.DataArray, extraction_dir: Path) -> None:
         image_utils.save_image(fname=int_dir / f"{img_name}.tiff", data=integer_img)
 
 
+def plot_peak_hist(peak: float, bin_count: int, extraction_dir: Path) -> None:
+    """Plot a histogram of the intensities of a provided peak image.
+
+    Args:
+    ----
+        peak (float): The desired peak to visualize
+        bin_count (int): The bin size to use for the histogram
+        extraction_dir (Path): The directory the peak images are saved in
+    """
+    # verify that the peak provided exists
+    peak_file: str = f"{peak:.4f}".replace(".", "_")
+    peak_file = peak_file + ".tiff"
+    peak_path = Path(extraction_dir) / "float" / peak_file
+    if not os.path.exists(peak_path):
+        raise FileNotFoundError(
+            f"Peak {peak:.4f} does not have a corresponding peak image in {extraction_dir}"
+        )
+
+    # load the peak image in and display histogram
+    peak_img: np.ndarray = io.imread(peak_path)
+    plt.hist(peak_img, bins=bin_count)
+
+
 def save_matched_peak_images(
-    image_xr: xr.DataArray,
     matched_peaks_df: pd.DataFrame,
     extraction_dir: Path,
 ) -> None:
     """Saves the images which were matched with the library.
 
     Args:
     ----
-        image_xr (xr.DataArray): A data structure which holds all the images for each peak.
         matched_peaks_df (pd.DataFrame): A dataframe containing the peaks matched with the library.
         extraction_dir (Path): The directory to save extracted data in.
     """
     # Create image directories if they do not exist
-    float_dir: Path = extraction_dir / "library_matched" / "float"
-    int_dir: Path = extraction_dir / "library_matched" / "int"
+    float_dir: Path = Path(extraction_dir) / "library_matched" / "float"
+    int_dir: Path = Path(extraction_dir) / "library_matched" / "int"
     for img_dir in [float_dir, int_dir]:
         if not os.path.exists(img_dir):
             img_dir.mkdir(parents=True, exist_ok=True)
 
     matched_peaks_df_filtered: pd.DataFrame = matched_peaks_df.dropna()
 
     for row in tqdm(matched_peaks_df_filtered.itertuples(), total=len(matched_peaks_df_filtered)):
-        image_index = row.Index
+        if row.matched is True:
+            peak_file_name: str = f"{row.lib_mz:.4f}".replace(".", "_") + ".tiff"
+            # load in the corresponding float and integer images
+            float_img: np.ndarray = io.imread(Path(extraction_dir) / "float" / peak_file_name)
+            integer_img: np.ndarray = io.imread(Path(extraction_dir) / "int" / peak_file_name)
 
-        float_img: np.ndarray = image_xr[image_index, ...].values.T
-        integer_img: np.ndarray = (float_img * (2**32 - 1) / np.max(float_img)).astype(np.uint32)
+            img_name: str = row.composition
 
-        img_name: str = row.composition
+            # save floating point image
+            image_utils.save_image(fname=float_dir / f"{img_name}.tiff", data=float_img)
 
-        # save floating point image
-        image_utils.save_image(fname=float_dir / f"{img_name}.tiff", data=float_img)
-
-        # save integer image
-        image_utils.save_image(fname=int_dir / f"{img_name}.tiff", data=integer_img)
+            # save integer image
+            image_utils.save_image(fname=int_dir / f"{img_name}.tiff", data=integer_img)