Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Ensure memory efficiency when working with peak images #25

Open
wants to merge 17 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
50 changes: 45 additions & 5 deletions conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
import json
import os
from pathlib import Path
from typing import Generator, List
from typing import Generator, List, Tuple

import numpy as np
import pandas as pd
Expand Down Expand Up @@ -53,6 +53,36 @@ def imz_data(tmp_path_factory: TempPathFactory, rng: np.random.Generator) -> Imz
yield ImzMLParser(filename=output_file_name)


@pytest.fixture(scope="session")
def imz_data_coord_int(tmp_path_factory: TempPathFactory, rng: np.random.Generator) -> ImzMLParser:
# Simplify the previous process for a single coordinate image (1x1)
img_dim: int = 1

# Generate random integers n for each coordinate (1 x 1). These will be used for creating
# random m/z and intensity values of length n.
# Lengths n are distributed along the standard gamma.
ns: np.ndarray = np.rint(rng.standard_gamma(shape=2.5, size=(img_dim**2)) * 100).astype(int)

# Generate random masses and sample different amounts of them, so we get duplicates
total_mzs: np.ndarray = (10000 - 100) * rng.random(size=img_dim**2 * 2) + 100

coords = [(x, y, 1) for x in range(1, img_dim + 1) for y in range(1, img_dim + 1)]

output_file_name: Path = tmp_path_factory.mktemp("data") / "test_data.imzML"

with ImzMLWriter(output_filename=output_file_name, mode="processed") as imzml:
for coord, n in zip(coords, ns):
# Masses: 100 <= mz < 10000, of length n, sampled randomly
mzs = rng.choice(a=total_mzs, size=n)

# Intensities: 0 <= int < 1e8, of length n
ints: np.ndarray = rng.exponential(size=n)

imzml.addSpectrum(mzs=mzs, intensities=ints, coords=coord)

yield ImzMLParser(filename=output_file_name)


@pytest.fixture(scope="session")
def total_mass_df(rng: np.random.Generator) -> pd.DataFrame:
mz_count: int = 10000
Expand All @@ -75,8 +105,8 @@ def percentile_intensities(

@pytest.fixture(scope="session")
def peak_idx_candidates(
total_mass_df: pd.DataFrame, percentile_intensities: tuple[np.ndarray, np.ndarray]
) -> Generator[tuple[np.ndarray, np.ndarray], None, None]:
total_mass_df: pd.DataFrame, percentile_intensities: Tuple[np.ndarray, np.ndarray]
) -> Generator[Tuple[np.ndarray, np.ndarray], None, None]:
_, log_int_percentile = percentile_intensities

peak_candidate_indexes, peak_candidates = extraction.signal_extraction(
Expand All @@ -87,8 +117,8 @@ def peak_idx_candidates(

@pytest.fixture(scope="session")
def peak_widths(
total_mass_df, peak_idx_candidates
) -> Generator[tuple[pd.DataFrame, np.ndarray, np.ndarray, np.ndarray], None, None]:
total_mass_df: pd.DataFrame, peak_idx_candidates: Tuple[np.ndarray, np.ndarray]
) -> Generator[Tuple[pd.DataFrame, np.ndarray, np.ndarray, np.ndarray], None, None]:
peak_candidate_idxs, peak_candidates = peak_idx_candidates
peak_df, l_ips_r, r_ips_r, peak_widths_height = extraction.get_peak_widths(
total_mass_df=total_mass_df,
Expand All @@ -100,6 +130,16 @@ def peak_widths(
yield (peak_df, l_ips_r, r_ips_r, peak_widths_height)


@pytest.fixture(scope="session")
def peak_widths_coord_int(imz_data_coord_int: ImzMLParser):
mzs, intensities = imz_data_coord_int.getspectrum(0)
peak_df = pd.DataFrame({"m/z": mzs, "intensity": intensities})
peak_df["peak"] = (peak_df["m/z"] + 0.04).copy()
peak_df["peak_height"] = 0.001

yield peak_df


@pytest.fixture(scope="session")
def library() -> Generator[pd.DataFrame, None, None]:
lib = pd.DataFrame(data={"mz": [30, 45], "composition": ["A", "B"]})
Expand Down
62 changes: 33 additions & 29 deletions src/maldi_tools/extraction.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@

import numpy as np
import pandas as pd
import xarray as xr
from alpineer.image_utils import save_image
from alpineer.io_utils import list_files, remove_file_extensions, validate_paths
from alpineer.misc_utils import verify_in_list
from pyimzml.ImzMLParser import ImzMLParser
Expand All @@ -27,7 +27,7 @@
from maldi_tools import plotting


def extract_spectra(imz_data: ImzMLParser, intensity_percentile: int) -> tuple[pd.DataFrame, np.ndarray]:
def extract_spectra(imz_data: ImzMLParser, intensity_percentile: int) -> Tuple[pd.DataFrame, np.ndarray]:
"""Iterates over all coordinates after opening the `imzML` data and extracts all masses,
and sums the intensities for all masses. Creates an intensity image, thresholded on
`intensity_percentile` with `np.percentile`. The masses are then sorted.
Expand All @@ -39,7 +39,7 @@ def extract_spectra(imz_data: ImzMLParser, intensity_percentile: int) -> tuple[p

Returns:
-------
tuple[pd.DataFrame, np.ndarray]: A tuple where the first element is the dataframe containing
Tuple[pd.DataFrame, np.ndarray]: A tuple where the first element is the dataframe containing
the total masses and their intensities, and the second element is the thresholds matrix
of the image.
"""
Expand Down Expand Up @@ -70,20 +70,20 @@ def extract_spectra(imz_data: ImzMLParser, intensity_percentile: int) -> tuple[p

def rolling_window(
total_mass_df: pd.DataFrame, intensity_percentile: int, window_size: int = 5000
) -> tuple[np.ndarray, np.ndarray]:
) -> Tuple[np.ndarray, np.ndarray]:
"""Computes the rolling window log intensities and the rolling window log intensity percentiles.

Args:
----
total_mass_df (pd.DataFrame): A dataframe containing all the masses and their
relative intensities.
intensity_percentile (int): The intensity for the quantile calculation.
window_size (int, optional): The sizve of the window for the rolling window method.
window_size (int): The sizve of the window for the rolling window method.
Defaults to 5000.

Returns:
-------
tuple[np.ndarray, np.ndarray]: A tuple where the first element is the log intensities,
Tuple[np.ndarray, np.ndarray]: A tuple where the first element is the log intensities,
and the second element is the log intensity percentiles.
"""
plt_range_min_ind: int = 0
Expand Down Expand Up @@ -241,46 +241,49 @@ def peak_spectra(
return panel_df


def coordinate_integration(peak_df: pd.DataFrame, imz_data: ImzMLParser) -> xr.DataArray:
def coordinate_integration(peak_df: pd.DataFrame, imz_data: ImzMLParser, extraction_dir: Path) -> None:
"""Integrates the coordinates with the discovered, post-processed peaks and generates an image for
each of the peaks using the imzML coordinate data.

Saves the peak images to specified extraction_dir.

Args:
----
peak_df (pd.DataFrame): The unique peaks from the data.
imz_data (ImzMLParser): The imzML object.

Returns:
-------
xr.DataArray: A data structure which holds all the images for each peak.
extraction_dir (Path): The directory to save extracted data (peak images) in.
"""
unique_peaks = peak_df["peak"].unique()
peak_dict = dict(zip(unique_peaks, np.arange((len(unique_peaks)))))

imz_coordinates: list = imz_data.coordinates

x_size: int = max(imz_coordinates, key=itemgetter(0))[0]
y_size: int = max(imz_coordinates, key=itemgetter(1))[1]

image_shape: Tuple[int, int] = (x_size, y_size)

imgs = np.zeros((len(unique_peaks), *image_shape))
float_peak_dir: Path = Path(extraction_dir) / "float"
int_peak_dir: Path = Path(extraction_dir) / "int"
for img_dir in [float_peak_dir, int_peak_dir]:
if not os.path.exists(img_dir):
img_dir.mkdir(parents=True, exist_ok=True)

for idx, (x, y, _) in tqdm(enumerate(imz_data.coordinates), total=len(imz_data.coordinates)):
mzs, intensities = imz_data.getspectrum(idx)

intensity: np.ndarray = intensities[np.isin(mzs, peak_df["m/z"])]

for i_idx, peak in peak_df.loc[peak_df["m/z"].isin(mzs), "peak"].reset_index(drop=True).items():
imgs[peak_dict[peak], x - 1, y - 1] += intensity[i_idx]

img_data = xr.DataArray(
data=imgs,
coords={"peak": unique_peaks, "x": range(x_size), "y": range(y_size)},
dims=["peak", "x", "y"],
)

return img_data
img_name: str = f"{peak:.4f}".replace(".", "_")
float_peak_path: Path = float_peak_dir / f"{img_name}.tiff"
int_peak_path: Path = int_peak_dir / f"{img_name}.tiff"
peak_exists: bool = os.path.exists(float_peak_path)
peak_img: np.ndarray = imread(float_peak_path).T if peak_exists else np.zeros(image_shape)

peak_img[x - 1, y - 1] += intensity[i_idx]
peak_img_float: np.ndarray = peak_img.T
peak_img_int: np.ndarray = (peak_img_float * (2**32 - 1) / np.max(peak_img_float)).astype(
np.uint32
)
save_image(fname=float_peak_path, data=peak_img_float)
save_image(fname=int_peak_path, data=peak_img_int)


def _matching_vec(obs_mz: pd.Series, library_peak_df: pd.DataFrame, ppm: int) -> pd.Series:
Expand Down Expand Up @@ -312,7 +315,6 @@ def _matching_vec(obs_mz: pd.Series, library_peak_df: pd.DataFrame, ppm: int) ->


def library_matching(
image_xr: xr.DataArray,
library_peak_df: pd.DataFrame,
ppm: int,
extraction_dir: Path,
Expand All @@ -323,19 +325,21 @@ def library_matching(

Args:
----
image_xr (xr.DataArray): A data structure which holds all the images for each peak.
library_peak_df (pd.DataFrame): The library of interest to match the observed peaks with.
ppm (int): The ppm for an acceptable mass error range between the observed mass and any target
mass in the library.
extraction_dir (Path): The directory to save extracted data in.
adducts (bool, optional): Add adducts together. Defaults to False. (Not implemented feature)
adducts (bool): Add adducts together. Defaults to False. (Not implemented feature)

Returns:
-------
pd.DataFrame: Contains the peak, the library target mass, a boolean stating if a match was found
or not, the composition name and the mass error if a match was found or not.
"""
peak_df = pd.DataFrame({"peak": image_xr.peak.to_numpy()})
peak_list: List[float] = [
float(p.replace("_", ".")) for p in remove_file_extensions(list_files(Path(extraction_dir) / "float"))
]
peak_df = pd.DataFrame({"peak": np.array(peak_list)})
match_fun = partial(_matching_vec, library_peak_df=library_peak_df, ppm=ppm)

peak_df[["lib_mz", "matched", "composition", "mass_error"]] = peak_df["peak"].apply(
Expand Down
49 changes: 36 additions & 13 deletions src/maldi_tools/plotting.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import skimage.io as io
import xarray as xr
from alpineer import image_utils
from tqdm.notebook import tqdm
Expand Down Expand Up @@ -152,38 +153,60 @@ def save_peak_images(image_xr: xr.DataArray, extraction_dir: Path) -> None:
image_utils.save_image(fname=int_dir / f"{img_name}.tiff", data=integer_img)


def plot_peak_hist(peak: float, bin_count: int, extraction_dir: Path) -> None:
"""Plot a histogram of the intensities of a provided peak image.

Args:
----
peak (float): The desired peak to visualize
bin_count (int): The bin size to use for the histogram
extraction_dir (Path): The directory the peak images are saved in
"""
# verify that the peak provided exists
peak_file: str = f"{peak:.4f}".replace(".", "_")
peak_file = peak_file + ".tiff"
peak_path = Path(extraction_dir) / "float" / peak_file
if not os.path.exists(peak_path):
raise FileNotFoundError(
f"Peak {peak:.4f} does not have a corresponding peak image in {extraction_dir}"
)

# load the peak image in and display histogram
peak_img: np.ndarray = io.imread(peak_path)
plt.hist(peak_img, bins=bin_count)


def save_matched_peak_images(
image_xr: xr.DataArray,
matched_peaks_df: pd.DataFrame,
extraction_dir: Path,
) -> None:
"""Saves the images which were matched with the library.

Args:
----
image_xr (xr.DataArray): A data structure which holds all the images for each peak.
matched_peaks_df (pd.DataFrame): A dataframe containing the peaks matched with the library.
extraction_dir (Path): The directory to save extracted data in.
"""
# Create image directories if they do not exist
float_dir: Path = extraction_dir / "library_matched" / "float"
int_dir: Path = extraction_dir / "library_matched" / "int"
float_dir: Path = Path(extraction_dir) / "library_matched" / "float"
int_dir: Path = Path(extraction_dir) / "library_matched" / "int"
for img_dir in [float_dir, int_dir]:
if not os.path.exists(img_dir):
img_dir.mkdir(parents=True, exist_ok=True)

matched_peaks_df_filtered: pd.DataFrame = matched_peaks_df.dropna()

for row in tqdm(matched_peaks_df_filtered.itertuples(), total=len(matched_peaks_df_filtered)):
image_index = row.Index
if row.matched is True:
peak_file_name: str = f"{row.lib_mz:.4f}".replace(".", "_") + ".tiff"
# load in the corresponding float and integer images
float_img: np.ndarray = io.imread(Path(extraction_dir) / "float" / peak_file_name)
integer_img: np.ndarray = io.imread(Path(extraction_dir) / "int" / peak_file_name)

float_img: np.ndarray = image_xr[image_index, ...].values.T
integer_img: np.ndarray = (float_img * (2**32 - 1) / np.max(float_img)).astype(np.uint32)
img_name: str = row.composition

img_name: str = row.composition
# save floating point image
image_utils.save_image(fname=float_dir / f"{img_name}.tiff", data=float_img)

# save floating point image
image_utils.save_image(fname=float_dir / f"{img_name}.tiff", data=float_img)

# save integer image
image_utils.save_image(fname=int_dir / f"{img_name}.tiff", data=integer_img)
# save integer image
image_utils.save_image(fname=int_dir / f"{img_name}.tiff", data=integer_img)
Loading
Loading