From 5d022e1fbdddc4df50352b87e45183bc7fca7980 Mon Sep 17 00:00:00 2001 From: AliceJoubert <158147135+AliceJoubert@users.noreply.github.com> Date: Fri, 30 Aug 2024 13:44:44 +0200 Subject: [PATCH] [CONV] Implementing new converter for IXI dataset (#1239) * wip1 * wip2 * add todos * wip2 * add dti * Fix concat img * print1 * Sessions/scans.tsv * Testing1 * Function description * Testing2 * fix * Add docstrings * Changes as suggested * add func subj_from_file * Add test func * Add participants.tsv * Fix prints * cleanup * unittesting1 * Changes according to suggestions * unittesting2 * unittesting 3 * add mapping class * more unittesting * Suggestions + testing * Finish unit tests * Add test for check_modalities * Changes according to suggestions * Change check_mod * use ClinicalDataMapping enum --- clinica/iotools/bids_utils.py | 28 + .../converters/adni_to_bids/adni_utils.py | 1 - clinica/iotools/converters/cli.py | 2 + clinica/iotools/converters/factory.py | 4 + .../converters/ixi_to_bids/__init__.py | 3 + .../converters/ixi_to_bids/ixi_to_bids.py | 82 +++ .../converters/ixi_to_bids/ixi_to_bids_cli.py | 27 + .../ixi_to_bids/ixi_to_bids_utils.py | 433 ++++++++++++ .../ixi_to_bids/test_ixi_to_bids_utils.py | 633 ++++++++++++++++++ test/unittests/iotools/test_bids_utils.py | 53 ++ 10 files changed, 1265 insertions(+), 1 deletion(-) create mode 100644 clinica/iotools/converters/ixi_to_bids/__init__.py create mode 100644 clinica/iotools/converters/ixi_to_bids/ixi_to_bids.py create mode 100644 clinica/iotools/converters/ixi_to_bids/ixi_to_bids_cli.py create mode 100644 clinica/iotools/converters/ixi_to_bids/ixi_to_bids_utils.py create mode 100644 test/unittests/iotools/converters/ixi_to_bids/test_ixi_to_bids_utils.py diff --git a/clinica/iotools/bids_utils.py b/clinica/iotools/bids_utils.py index 449a3e1aa..de47302ad 100644 --- a/clinica/iotools/bids_utils.py +++ b/clinica/iotools/bids_utils.py @@ -25,6 +25,7 @@ class StudyName(str, Enum): OASIS = "OASIS" OASIS3 = 
"OASIS3" UKB = "UKB" + IXI = "IXI" BIDS_VALIDATOR_CONFIG = { @@ -90,6 +91,8 @@ def bids_id_factory(study: StudyName) -> Type[BIDSSubjectID]: return OASIS3BIDSSubjectID if study == StudyName.HABS: return HABSBIDSSubjectID + if study == StudyName.IXI: + return IXIBIDSSubjectID class ADNIBIDSSubjectID(BIDSSubjectID): @@ -292,6 +295,31 @@ def to_original_study_id(self) -> str: return str(self.replace("sub-HABS", "P_")) +class IXIBIDSSubjectID(BIDSSubjectID): + """Implementation for IXI of the BIDSSubjectIDClass, allowing to go from the source id IXI### + to a bids id sub-IXI### and reciprocally.""" + + def validate(self, value: str) -> str: + if re.fullmatch(r"sub-IXI\d{3}", value): + return value + raise ValueError( + f"BIDS IXI subject ID {value} is not properly formatted. " + "Expecting a 'sub-IXIXXX' format." + ) + + @classmethod + def from_original_study_id(cls, study_id: str) -> str: + if re.fullmatch(r"IXI\d{3}", study_id): + return f"sub-{study_id}" + raise ValueError( + f"Raw IXI subject ID {study_id} is not properly formatted. " + "Expecting a 'IXIXXX' format." + ) + + def to_original_study_id(self) -> str: + return str(self.replace("sub-", "")) + + # -- Methods for the clinical data -- def create_participants_df( study_name: StudyName, diff --git a/clinica/iotools/converters/adni_to_bids/adni_utils.py b/clinica/iotools/converters/adni_to_bids/adni_utils.py index 308ec7ff6..2ac56a99d 100644 --- a/clinica/iotools/converters/adni_to_bids/adni_utils.py +++ b/clinica/iotools/converters/adni_to_bids/adni_utils.py @@ -33,7 +33,6 @@ def _define_subjects_list( source_dir: Path, subjs_list_path: Optional[Path] = None, ) -> List[str]: - # todo : here or in utils for all converters ? 
import re from clinica.utils.stream import cprint diff --git a/clinica/iotools/converters/cli.py b/clinica/iotools/converters/cli.py index 82fde5e15..5c10c7b54 100644 --- a/clinica/iotools/converters/cli.py +++ b/clinica/iotools/converters/cli.py @@ -4,6 +4,7 @@ from .aibl_to_bids import aibl_to_bids_cli from .genfi_to_bids import genfi_to_bids_cli from .habs_to_bids import habs_to_bids_cli +from .ixi_to_bids import ixi_to_bids_cli from .nifd_to_bids import nifd_to_bids_cli from .oasis3_to_bids import oasis3_to_bids_cli from .oasis_to_bids import oasis_to_bids_cli @@ -24,6 +25,7 @@ def cli() -> None: cli.add_command(oasis3_to_bids_cli.cli) cli.add_command(ukb_to_bids_cli.cli) cli.add_command(genfi_to_bids_cli.cli) +cli.add_command(ixi_to_bids_cli.cli) if __name__ == "__main__": cli() diff --git a/clinica/iotools/converters/factory.py b/clinica/iotools/converters/factory.py index 2e9a74d6b..d7e1baf99 100644 --- a/clinica/iotools/converters/factory.py +++ b/clinica/iotools/converters/factory.py @@ -38,6 +38,8 @@ def get_converter_name(study: Union[str, StudyName]) -> str: return "Oasis3ToBids" if study == StudyName.UKB: return "UkbToBids" + if study == StudyName.IXI: + return "IxiToBids" def converter_factory(study: Union[str, StudyName]) -> Callable: @@ -58,4 +60,6 @@ def converter_factory(study: Union[str, StudyName]) -> Callable: from .oasis3_to_bids import convert if study == StudyName.UKB: from .ukb_to_bids import convert + if study == StudyName.IXI: + from .ixi_to_bids import convert return convert diff --git a/clinica/iotools/converters/ixi_to_bids/__init__.py b/clinica/iotools/converters/ixi_to_bids/__init__.py new file mode 100644 index 000000000..13f533353 --- /dev/null +++ b/clinica/iotools/converters/ixi_to_bids/__init__.py @@ -0,0 +1,3 @@ +from .ixi_to_bids import convert + +__all__ = ["convert"] diff --git a/clinica/iotools/converters/ixi_to_bids/ixi_to_bids.py b/clinica/iotools/converters/ixi_to_bids/ixi_to_bids.py new file mode 100644 index 
000000000..5b5176487 --- /dev/null +++ b/clinica/iotools/converters/ixi_to_bids/ixi_to_bids.py @@ -0,0 +1,82 @@ +"""Convert IXI dataset (https://brain-development.org/ixi-dataset/) to BIDS.""" + +from pathlib import Path +from typing import Optional + +import nibabel as nb +import numpy as np + +from clinica.iotools.bids_utils import write_modality_agnostic_files +from clinica.iotools.converters.ixi_to_bids.ixi_to_bids_utils import ( + check_modalities, + define_participants, + read_clinical_data, + write_participants, + write_scans, + write_sessions, + write_subject_data, +) +from clinica.utils.filemanip import UserProvidedPath + +__all__ = ["convert"] + + +def convert( + path_to_dataset: UserProvidedPath, + bids_dir: UserProvidedPath, + path_to_clinical: UserProvidedPath, + subjects: Optional[UserProvidedPath] = None, + n_procs: Optional[int] = 1, + **kwargs, +): + from clinica.iotools.bids_utils import StudyName + from clinica.iotools.converters.factory import get_converter_name + from clinica.utils.stream import cprint + + from ..utils import validate_input_path + + path_to_dataset = validate_input_path(path_to_dataset) + bids_dir = validate_input_path(bids_dir, check_exist=False) + path_to_clinical = validate_input_path(path_to_clinical) + if subjects: + subjects = validate_input_path(subjects) + + if n_procs != 1: + cprint( + f"{get_converter_name(StudyName.IXI)} converter does not support multiprocessing yet. 
n_procs set to 1.", + lvl="warning", + ) + + clinical_data = read_clinical_data(path_to_clinical) + participants = define_participants(path_to_dataset, subjects) + check_modalities(data_directory=path_to_dataset, participants=participants) + + write_participants( + bids_dir=bids_dir, clinical_data=clinical_data, participants=participants + ) + + for participant in participants: + cprint(f"Converting IXI subject {participant} to BIDS", lvl="debug") + write_subject_data( + bids_dir=bids_dir, participant=participant, path_to_dataset=path_to_dataset + ) + write_sessions( + bids_dir=bids_dir, participant=participant, clinical_data=clinical_data + ) + write_scans(bids_dir=bids_dir, participant=participant) + + readme_data = { + "link": "https://brain-development.org/ixi-dataset/", + "desc": ( + "IXI is the nickname for the Information eXtraction from Images project, " + "which issued a dataset of nearly 600 images from healthy subjects. The MR " + "acquisition protocol includes T1, T2, PD weighted, MRA and diffusion-weighted " + "images. Three hospitals in London were involved in data collection." 
+ ), + } + + write_modality_agnostic_files( + study_name=StudyName.IXI, readme_data=readme_data, bids_dir=bids_dir + ) + + cprint("Conversion to BIDS finished.", lvl="info") diff --git a/clinica/iotools/converters/ixi_to_bids/ixi_to_bids_cli.py b/clinica/iotools/converters/ixi_to_bids/ixi_to_bids_cli.py new file mode 100644 index 000000000..dfd36074e --- /dev/null +++ b/clinica/iotools/converters/ixi_to_bids/ixi_to_bids_cli.py @@ -0,0 +1,27 @@ +from os import PathLike +from typing import Optional + +import click + +from clinica.iotools.converters import cli_param + + +@click.command(name="ixi-to-bids") +@cli_param.dataset_directory +@cli_param.bids_directory +@cli_param.clinical_data_directory +@cli_param.subjects_list +def cli( + dataset_directory: PathLike, + bids_directory: PathLike, + clinical_data_directory: PathLike, + subjects_list: Optional[PathLike] = None, +) -> None: + """IXI to BIDS converter.""" + from .ixi_to_bids import convert + + convert(dataset_directory, bids_directory, clinical_data_directory, subjects_list) + + +if __name__ == "__main__": + cli() diff --git a/clinica/iotools/converters/ixi_to_bids/ixi_to_bids_utils.py b/clinica/iotools/converters/ixi_to_bids/ixi_to_bids_utils.py new file mode 100644 index 000000000..8bebe8643 --- /dev/null +++ b/clinica/iotools/converters/ixi_to_bids/ixi_to_bids_utils.py @@ -0,0 +1,433 @@ +import json +import re +import shutil +from enum import Enum +from pathlib import Path +from typing import List, Optional, Union + +import nibabel as nib +import pandas as pd +from nilearn.image import concat_imgs + +from clinica.iotools.bids_utils import StudyName, bids_id_factory +from clinica.utils.stream import cprint, log_and_raise + +__all__ = [ + "read_clinical_data", + "define_participants", + "write_subject_data", + "write_sessions", + "write_scans", + "write_participants", + "check_modalities", +] + + +def _get_subjects_list_from_data(data_directory: Path) -> List[str]: + return list( + dict.fromkeys( + 
re.match(r"IXI\d{3}", path.name).group(0) + for path in data_directory.rglob(pattern="IXI*.nii.gz") + if re.match(r"IXI\d{3}", path.name) + ) + ) + + +def _get_subjects_list_from_file(subjs_list_path: Path) -> List[str]: + return subjs_list_path.read_text().splitlines() + + +def define_participants( + data_directory: Path, + subjs_list_path: Optional[Path] = None, +) -> List[str]: + """ + Defines the actual list of participants based on the (provided) subjects list filtered + using existing data. + + Parameters + ---------- + data_directory : Path to the raw data directory. + subjs_list_path : [Optional] Path to a text file containing a list of specific subjects to extract. + + Returns + ------- + The list of ids that were either present in the data directory or asked for with a specific text file, provided + associated images actually exist. + + """ + + list_from_data = _get_subjects_list_from_data(data_directory) + if subjs_list_path is None: + return list_from_data + cprint("Loading a subjects list provided by the user...") + list_from_file = _get_subjects_list_from_file(subjs_list_path) + list_filtered = [subject for subject in list_from_file if subject in list_from_data] + invalid_subjects = list(set(list_from_file) - set(list_filtered)) + if invalid_subjects: + cprint( + f"The subjects : {' , '.join(invalid_subjects)} do not have any associated data inside the directory {data_directory}" + f" and can not be converted." 
+ ) + return list_filtered + + +def _rename_clinical_data_to_bids(column: str) -> str: + if column == "SEX_ID (1=m, 2=f)": + return "sex" + if column == "ETHNIC_ID": + return "ethnicity" + if column == "MARITAL_ID": + return "marital status" + if column == "OCCUPATION_ID": + return "occupation" + if column == "QUALIFICATION_ID": + return "qualification" + if column == "IXI_ID": + return "source_id" + if column == "DOB": + return "date of birth" + if column == "STUDY_DATE": + return "acq_time" + else: + return column.lower() + + +def _get_sex_mapping() -> pd.DataFrame: + return pd.DataFrame({"SEX": ["male", "female"]}, index=[1, 2])["SEX"] + + +class ClinicalDataMapping(str, Enum): + ETHNIC = "Ethnicity" + MARITAL = "Marital Status" + OCCUPATION = "Occupation" + QUALIFICATION = "Qualification" + + +def _get_mapping(clinical_data_path: Path, map: ClinicalDataMapping) -> pd.Series: + try: + return pd.read_excel( + clinical_data_path / "IXI.xls", sheet_name=map.value + ).set_index("ID")[map.name] + except FileNotFoundError: + log_and_raise( + f"Clinical data stored in the folder {clinical_data_path} is expected to be an excel file named 'IXI.xls'. " + f"In case the file downloaded from the IXI website changed format, please do not hesitate to report to us !", + FileNotFoundError, + ) + except (ValueError, KeyError): + log_and_raise( + f"{map.value} mapping is expected to be contained in a sheet called {map.value} coming from the clinical data excel. " + f"Possibilities are supposed to be described in a {map.name} column associated to keys from the 'ID' column. 
" + f"In case the file downloaded from the IXI website changed format, please do not hesitate to report to us !", + ValueError, + ) + + +def _padding_source_id(source_id: Union[str, int]) -> str: + if len(str(source_id)) > 3: + log_and_raise( + f"The source id {source_id} has more than 3 digits while IXI" + f"source ids are expected to be between 1 and 3 digits.", + ValueError, + ) + return f"IXI{'0'* (3 - len(str(source_id))) + str(source_id)}" + + +def read_clinical_data(clinical_data_path: Path) -> pd.DataFrame: + """ + Reads and formats IXI clinical data. + + Parameters + ---------- + clinical_data_path : Path to the directory where the clinical data .xls is stored. + + Returns + ------- + A dataframe containing the clinical data, with some modifications: padded study id and added session id. + + """ + try: + clinical_data = pd.read_excel(clinical_data_path / "IXI.xls") + except FileNotFoundError: + log_and_raise( + f"Clinical data stored in the folder {clinical_data_path} is expected to be an excel file named 'IXI.xls'. 
" + f"In case the file downloaded from the IXI website changed format, please do not hesitate to report to us !", + FileNotFoundError, + ) + else: + if "DATE_AVAILABLE" in clinical_data.columns: + clinical_data.drop("DATE_AVAILABLE", axis=1, inplace=True) + clinical_data["SEX_ID (1=m, 2=f)"] = clinical_data["SEX_ID (1=m, 2=f)"].map( + _get_sex_mapping() + ) + for mapping in ClinicalDataMapping: + clinical_data[f"{mapping.name}_ID"] = clinical_data[ + f"{mapping.name}_ID" + ].map(_get_mapping(clinical_data_path, mapping)) + + clinical_data["IXI_ID"] = clinical_data.IXI_ID.apply( + lambda x: _padding_source_id(x) + ) + clinical_data.rename( + lambda x: _rename_clinical_data_to_bids(x), axis=1, inplace=True + ) + clinical_data.fillna("n/a", inplace=True) + clinical_data["session_id"] = "ses-M000" + return clinical_data + + +def _rename_modalities(input_mod: str) -> str: + if input_mod == "T1": + return "T1w" + if input_mod == "T2": + return "T2w" + if input_mod == "MRA": + return "angio" + if input_mod == "PD": + return "PDw" + if input_mod == "DTI": + return "dti" + raise ValueError(f"The modality {input_mod} is not recognized in the IXI dataset.") + + +def _define_magnetic_field(hospital: str) -> str: + if hospital in ("Guys", "IOP"): + return "1.5" + if hospital == "HH": + return "3" + raise ValueError(f"The hospital {hospital} was not recognized.") + + +def _get_img_data(data_directory: Path) -> pd.DataFrame: + """Finds paths for all images that are not DTI data and processes the info contained in their names""" + df = pd.DataFrame( + { + "img_path": [ + path + for path in data_directory.rglob(pattern="IXI*.nii.gz") + if re.search(r"IXI\d{3}(-\w*){3}.nii.gz$", str(path)) + ] + } + ) + df = ( + df.assign(img_name=lambda df: df.img_path.apply(lambda x: x.name)) + .assign(img_name_no_ext=lambda df: df.img_name.apply(lambda x: x.split(".")[0])) + .assign(subject=lambda df: df.img_name_no_ext.apply(lambda x: x.split("-")[0])) + .assign( + participant_id=lambda df: 
df.subject.apply( + lambda x: bids_id_factory(StudyName.IXI).from_original_study_id(x) + ) + ) + .assign(hospital=lambda df: df.img_name_no_ext.apply(lambda x: x.split("-")[1])) + .assign( + modality=lambda df: df.img_name_no_ext.apply( + lambda x: _rename_modalities(x.split("-")[3]) + ) + ) + .assign(field=lambda df: df.hospital.apply(lambda x: _define_magnetic_field(x))) + .assign(session="ses-M000") + ) + return df + + +def _get_bids_filename_from_image_data(img: pd.Series) -> str: + return f"{img['participant_id']}_{img['session']}_{img['modality']}" + + +def write_subject_data(bids_dir: Path, participant: str, path_to_dataset: Path) -> None: + """ + Writes the data of the IXI subject in the BIDS directory following BIDS specifications. + + Parameters + ---------- + bids_dir : Path to the output BIDS directory. + participant : Current converted subject study id (str). + path_to_dataset : Path to the raw dataset directory. + """ + data_df = _get_img_data(path_to_dataset) + _write_subject_no_dti(data_df[data_df["subject"] == participant], bids_dir) + _write_subject_dti_if_exists(bids_dir, participant, path_to_dataset) + + +def _write_json_image(writing_path: Path, hospital: str, field: str) -> None: + """ + Writes a json associated to one IXI image. + + Parameters + ---------- + writing_path : Path indicating under what name to write the json. + hospital : identifier for the hospital responsible for the acquisition, determined from the image filename. + field : magnetic field used for the acquisition of the image, determined from the hospital. 
+ """ + with open(writing_path, "w") as f: + json.dump( + { + "InstitutionName": hospital, + "MagneticFieldStrength (T)": field, + }, + f, + indent=4, + ) + + +def _write_subject_no_dti(subject_df: pd.DataFrame, bids_path: Path) -> None: + """Copies all subject data but DTI""" + for _, row in subject_df.iterrows(): + cprint( + f"Converting modality {row['modality']} for subject {row['subject']}.", + lvl="debug", + ) + filename = _get_bids_filename_from_image_data(row) + data_path = bids_path / row["participant_id"] / row["session"] / "anat" + data_path.mkdir(parents=True, exist_ok=True) + shutil.copy2(row["img_path"], f"{data_path}/{filename}.nii.gz") + _write_json_image( + (data_path / filename).with_suffix(".json"), row["hospital"], row["field"] + ) + + +def _write_subject_dti_if_exists( + bids_path: Path, subject: str, data_directory: Path +) -> None: + """Processes DTI data if found for a subject""" + if dti_paths := _find_subject_dti_data(data_directory, subject): + cprint(f"Converting modality DTI for subject {subject}.", lvl="debug") + dti_to_save = _merge_dti(dti_paths) + bids_id = bids_id_factory(StudyName.IXI).from_original_study_id(subject) + data_path = bids_path / bids_id / "ses-M000" / "dwi" + data_path.mkdir(parents=True, exist_ok=True) + filename = Path(f"{bids_id}_ses-M000_dwi") + dti_to_save.to_filename(data_path / filename.with_suffix(".nii.gz")) + hospital = dti_paths[0].name.split("-")[1] + _write_json_image( + data_path / filename.with_suffix(".json"), + hospital, + _define_magnetic_field(hospital), + ) + + +def _find_subject_dti_data(data_directory: Path, subject: str) -> List[Path]: + pattern = subject + r"(-\w*){2}-DTI(-\w*){1}.nii.gz$" + return [ + path + for path in data_directory.rglob(pattern="IXI*.nii.gz") + if re.search(pattern, str(path)) + ] + + +def _merge_dti(dti_images: List[Path]) -> nib.Nifti1Image: + return concat_imgs([nib.load(img) for img in dti_images]) + + +def write_scans(bids_dir: Path, participant: str) -> None: + """ 
+ Write the scans.tsv for the only session (ses-M000) of a IXI subject. + + Parameters + ---------- + bids_dir : Path to the output BIDS directory. + participant : Current converted subject study id (str). + """ + bids_id = bids_id_factory(StudyName.IXI).from_original_study_id(participant) + to_write = pd.DataFrame( + { + "filename": [ + f"{path.parent.name}/{path.name}" + for path in (bids_dir / bids_id).rglob(f"{bids_id}*.nii.gz") + ] + } + ) + to_write.to_csv( + bids_dir / bids_id / "ses-M000" / f"{bids_id}_ses-M000_scans.tsv", + sep="\t", + index=False, + ) + + +def write_sessions( + bids_dir: Path, clinical_data: pd.DataFrame, participant: str +) -> None: + """ + Writes the sessions.tsv for a IXI subject. + + Parameters + ---------- + bids_dir : Path to the output BIDS directory. + clinical_data : Dataframe containing the formatted clinical data of the IXI study. + participant : Current converted subject study id (str). + """ + line = clinical_data[clinical_data["source_id"] == participant] + bids_id = bids_id_factory(StudyName.IXI).from_original_study_id(participant) + line[["source_id", "session_id", "acq_time"]].to_csv( + bids_dir / bids_id / f"{bids_id}_sessions.tsv", sep="\t", index=False + ) + + +def write_participants( + bids_dir: Path, clinical_data: pd.DataFrame, participants: List[str] +) -> None: + """ + Write the participants.tsv at the root of the BIDS directory. + + Parameters + ---------- + bids_dir : Path to the output BIDS directory. + clinical_data : Dataframe containing the formatted clinical data of the IXI study. + participants : List of converted subjects study source ids. 
+ """ + clinical_data.set_index("source_id", inplace=True, drop=False) + clinical_data.assign( + participant_id=clinical_data.source_id.apply( + lambda x: bids_id_factory(StudyName.IXI).from_original_study_id(x) + ) + ) + for participant in participants: + if participant not in clinical_data.index: + clinical_data.loc[participant] = "n/a" + clinical_data.loc[participant, "source_id"] = participant + if not bids_dir.exists(): + bids_dir.mkdir() + clinical_data.loc[participants].drop(["acq_time", "session_id"], axis=1).to_csv( + bids_dir / "participants.tsv", sep="\t", index=False, na_rep="n/a" + ) + clinical_data.reset_index(drop=True, inplace=True) + + +def _identify_expected_modalities(data_directory: Path) -> List[str]: + return [ + p.name.split("-")[1] + for p in data_directory.iterdir() + if p.is_dir() and "IXI-" in str(p) + ] + + +def check_modalities(data_directory: Path, participants: List[str]) -> None: + """ + Verify what modality folders are available in the given data directory and checks if some are missing per participant + + Parameters + ---------- + data_directory : Path to raw dataset. 
+ participants : List of the subject ids of all participants + + """ + expected_modalities = sorted(_identify_expected_modalities(data_directory)) + message = ( + f"Modalities : {' , '.join(_rename_modalities(mod) for mod in expected_modalities)}" + f" were identified inside {data_directory} for conversion.\n" + ) + participants_missing_mod = dict() + for participant in participants: + missing_mods = [] + for mod in expected_modalities: + if not list(data_directory.rglob(f"{participant}*{mod}*.nii.gz")): + missing_mods += [_rename_modalities(mod)] + if missing_mods: + participants_missing_mod[participant] = missing_mods + if participants_missing_mod: + message += f"Some subjects do not have data for the following modalities :\n" + for sub, mod in participants_missing_mod.items(): + message += f"{sub} : {' , '.join(mod)}\n" + + cprint(message) diff --git a/test/unittests/iotools/converters/ixi_to_bids/test_ixi_to_bids_utils.py b/test/unittests/iotools/converters/ixi_to_bids/test_ixi_to_bids_utils.py new file mode 100644 index 000000000..16b274d16 --- /dev/null +++ b/test/unittests/iotools/converters/ixi_to_bids/test_ixi_to_bids_utils.py @@ -0,0 +1,633 @@ +import json +from pathlib import Path +from unittest.mock import patch + +import nibabel +import numpy as np +import pandas as pd +import pytest +from pandas.testing import assert_frame_equal, assert_series_equal + +from clinica.iotools.converters.ixi_to_bids.ixi_to_bids_utils import ( + ClinicalDataMapping, + _define_magnetic_field, + _find_subject_dti_data, + _get_bids_filename_from_image_data, + _get_img_data, + _get_mapping, + _get_subjects_list_from_data, + _get_subjects_list_from_file, + _identify_expected_modalities, + _merge_dti, + _padding_source_id, + _rename_clinical_data_to_bids, + _rename_modalities, + _write_json_image, + _write_subject_dti_if_exists, + _write_subject_no_dti, + check_modalities, + define_participants, + read_clinical_data, + write_participants, + write_scans, + write_sessions, +) + + 
+def test_get_subjects_list_from_data(tmp_path): + for filename in ("IXI1", "IXI123", "IXIaaa", "foo"): + (tmp_path / f"{filename}_T1w.nii.gz").touch() + assert _get_subjects_list_from_data(tmp_path) == ["IXI123"] + + +def test_get_subjects_list_from_file(tmp_path): + with open(tmp_path / "subjects.txt", "w") as f: + f.write("IXI123\nIXI001") + assert _get_subjects_list_from_file(tmp_path / "subjects.txt") == [ + "IXI123", + "IXI001", + ] + + +def test_define_participants_filter(tmp_path): + for filename in ("IXI001", "IXI002", "IXI003", "IXI004"): + (tmp_path / f"{filename}_T1w.nii.gz").touch() + with open(tmp_path / "subjects.txt", "w") as f: + f.write("IXI001\nIXI006") + assert define_participants( + data_directory=tmp_path, subjs_list_path=tmp_path / "subjects.txt" + ) == ["IXI001"] + + +def test_define_participants_optional(tmp_path): + for filename in ("IXI001", "IXI002"): + (tmp_path / f"{filename}_T1w.nii.gz").touch() + assert define_participants(data_directory=tmp_path) == ["IXI001", "IXI002"] + + +@pytest.mark.parametrize( + "input_str, expected", + [ + ("T1", "T1w"), + ("T2", "T2w"), + ("MRA", "angio"), + ("PD", "PDw"), + ("DTI", "dti"), + ], +) +def test_rename_ixi_modalities_success(input_str, expected): + assert _rename_modalities(input_str) == expected + + +@pytest.mark.parametrize("input_str", ["t1", "foo", "T1w"]) +def test_rename_ixi_modalities_error(input_str): + with pytest.raises( + ValueError, + match=f"The modality {input_str} is not recognized in the IXI dataset.", + ): + _rename_modalities(input_str) + + +@pytest.mark.parametrize( + "input_str, expected", + [ + ("SEX_ID (1=m, 2=f)", "sex"), + ("ETHNIC_ID", "ethnicity"), + ("MARITAL_ID", "marital status"), + ("OCCUPATION_ID", "occupation"), + ("QUALIFICATION_ID", "qualification"), + ("IXI_ID", "source_id"), + ("DOB", "date of birth"), + ("STUDY_DATE", "acq_time"), + ("FOO", "foo"), + ], +) +def test_rename_clinical_data_to_bids(input_str, expected): + assert 
_rename_clinical_data_to_bids(input_str) == expected + + +@pytest.mark.parametrize( + "input, expected", + [ + ("1", "IXI001"), + ("12", "IXI012"), + ("123", "IXI123"), + (1, "IXI001"), + ], +) +def test_padding_source_id_success(input, expected): + assert _padding_source_id(input) == expected + + +def test_padding_source_id_error(): + with pytest.raises( + ValueError, + match=f"The source id 1234 has more than 3 digits while IXI" + f"source ids are expected to be between 1 and 3 digits.", + ): + _padding_source_id("1234") + + +@pytest.mark.parametrize("input", ["IXI_name.xls", "IXI_format.csv"]) +def test_read_clinical_data_error(tmp_path, input): + (tmp_path / input).touch() + with pytest.raises( + FileNotFoundError, + match=f"Clinical data stored in the folder {tmp_path} is expected to be an excel file named 'IXI.xls'. " + f"In case the file downloaded from the IXI website changed format, please do not hesitate to report to us !", + ): + read_clinical_data(tmp_path) + + +def clinical_data_builder(tmp_path: Path) -> None: + marital = pd.DataFrame( + { + "ID": [1, 2, 4, 3, 5], + "MARITAL": [ + "Single", + "Married", + "Divorced or Separated", + "Cohabiting", + "Widowed", + ], + } + ) + + ethnic = pd.DataFrame( + { + "ID": [1, 4, 3, 5, 6], + "ETHNIC": [ + "White", + "Black or BlackBritish", + "Asian or Asian British", + "Chinese", + "Other", + ], + } + ) + + occup = pd.DataFrame( + { + "ID": [1, 2, 3, 4, 5, 6, 7, 8], + "OCCUPATION": [ + "Go out to full time employment", + "Go out to part time employment (<25hrs)", + "Study at college or university", + "Full-time housework", + "Retired", + "Unemployed", + "Work for pay at home", + "Other", + ], + } + ) + + qualif = pd.DataFrame( + { + "ID": [1, 2, 3, 4, 5], + "QUALIFICATION": [ + "No qualifications", + "O - levels, GCSEs, or CSEs", + "A - levels", + "Further education e.g.City & Guilds / NVQs", + "University or Polytechnic degree", + ], + } + ) + + subject = pd.DataFrame( + { + "IXI_ID": ["1"], + "STUDY_DATE": 
["2024-08-23"], + "SEX_ID (1=m, 2=f)": [2], + "ETHNIC_ID": [6], + "MARITAL_ID": [1], + "OCCUPATION_ID": [8], + "QUALIFICATION_ID": [3], + "DOB": ["2000-01-01"], + "WEIGHT": [80], + } + ) + with pd.ExcelWriter(tmp_path / "IXI.xls") as writer: + subject.to_excel(writer, index=False) + qualif.to_excel(writer, sheet_name="Qualification", index=False) + occup.to_excel(writer, sheet_name="Occupation", index=False) + ethnic.to_excel(writer, sheet_name="Ethnicity", index=False) + marital.to_excel(writer, sheet_name="Marital Status", index=False) + + +def test_read_clinical_data_success(tmp_path): + clinical_data_builder(tmp_path) + assert ( + read_clinical_data(tmp_path).eq(formatted_clinical_data_builder()).all().all() + ) + + +def test_merge_dti(tmp_path): + im1 = nibabel.Nifti1Image( + np.empty(shape=(256, 156, 256), dtype=np.float64), np.eye(4) + ) + im1.to_filename(tmp_path / "im1.nii.gz") + im2 = nibabel.Nifti1Image( + np.empty(shape=(256, 156, 256), dtype=np.float64), np.eye(4) + ) + im2.to_filename(tmp_path / "im2.nii.gz") + merged = _merge_dti([tmp_path / "im1.nii.gz", tmp_path / "im2.nii.gz"]) + assert type(merged) == nibabel.Nifti1Image + assert merged.shape[-1] == 2 + + +def test_write_dti_success(tmp_path): + im1 = nibabel.Nifti1Image( + np.empty(shape=(256, 156, 256), dtype=np.float64), np.eye(4) + ) + im1.to_filename(tmp_path / "IXI001-Guys-1234-DTI-00.nii.gz") + im2 = nibabel.Nifti1Image( + np.empty(shape=(256, 156, 256), dtype=np.float64), np.eye(4) + ) + im2.to_filename(tmp_path / "IXI001-Guys-1234-DTI-01.nii.gz") + + _write_subject_dti_if_exists( + bids_path=tmp_path, subject="IXI001", data_directory=tmp_path + ) + dti_image = list(tmp_path.rglob(pattern="*dwi.nii.gz")) + dti_json = list(tmp_path.rglob(pattern="*dwi.json")) + + assert ( + len(dti_image) == 1 + and dti_image[0] + == tmp_path + / "sub-IXI001" + / "ses-M000" + / "dwi" + / "sub-IXI001_ses-M000_dwi.nii.gz" + ) + assert ( + len(dti_json) == 1 + and dti_json[0] + == tmp_path / "sub-IXI001" / 
"ses-M000" / "dwi" / "sub-IXI001_ses-M000_dwi.json" + ) + + +def test_write_dti_empty(tmp_path): + _write_subject_dti_if_exists( + bids_path=tmp_path, subject="IXI001", data_directory=tmp_path + ) + dti_files = list(tmp_path.rglob(pattern="*dwi.nii.gz")) + assert not dti_files + + +@pytest.mark.parametrize( + "input_str, expected", + [ + ("Guys", "1.5"), + ("IOP", "1.5"), + ("HH", "3"), + ], +) +def test_define_magnetic_field_success(input_str, expected): + assert _define_magnetic_field(input_str) == expected + + +def test_define_magnetic_field_error(): + with pytest.raises( + ValueError, + match=f"The hospital foo was not recognized.", + ): + _define_magnetic_field("foo") + + +def image_dataframe_builder(tmp_path: Path) -> pd.DataFrame: + tmp_image = tmp_path / "IXI001-Guys-1234-T1.nii.gz" + tmp_image.touch() + (tmp_path / "IXI001-Guys-1234-T1").touch() + (tmp_path / "IXI001-Guys-T1.nii.gz").touch() + (tmp_path / "IXI001-Guys-1234-T1-00.nii.gz").touch() + + return pd.DataFrame( + { + "img_path": [tmp_image], + "img_name": ["IXI001-Guys-1234-T1.nii.gz"], + "img_name_no_ext": ["IXI001-Guys-1234-T1"], + "subject": ["IXI001"], + "participant_id": ["sub-IXI001"], + "hospital": ["Guys"], + "modality": ["T1w"], + "field": ["1.5"], + "session": ["ses-M000"], + } + ) + + +def test_get_image_data(tmp_path): + input = image_dataframe_builder(tmp_path) + assert assert_frame_equal(_get_img_data(tmp_path), input) is None + + +def test_get_bids_filename_from_image_data(tmp_path): + input = image_dataframe_builder(tmp_path) + assert _get_bids_filename_from_image_data(input.loc[0]) == "sub-IXI001_ses-M000_T1w" + + +def test_get_marital_mapping_success(tmp_path): + marital = pd.DataFrame( + { + "ID": [1, 2, 4, 3, 5], + "MARITAL": [ + "Single", + "Married", + "Divorced or Separated", + "Cohabiting", + "Widowed", + ], + } + ) + marital.to_excel(excel_writer=tmp_path / "IXI.xls", sheet_name="Marital Status") + assert ( + assert_series_equal( + _get_mapping(tmp_path, 
ClinicalDataMapping.MARITAL), + marital.set_index("ID")["MARITAL"], + ) + is None + ) + + +def test_get_ethnic_mapping_success(tmp_path): + ethnic = pd.DataFrame( + { + "ID": [1, 4, 3, 5, 6], + "ETHNIC": [ + "White", + "Black or BlackBritish", + "Asian or Asian British", + "Chinese", + "Other", + ], + } + ) + ethnic.to_excel(excel_writer=tmp_path / "IXI.xls", sheet_name="Ethnicity") + assert ( + assert_series_equal( + _get_mapping(tmp_path, ClinicalDataMapping.ETHNIC), + ethnic.set_index("ID")["ETHNIC"], + ) + is None + ) + + +def test_get_occupation_mapping_success(tmp_path): + occup = pd.DataFrame( + { + "ID": [1, 2, 3, 4, 5, 6, 7, 8], + "OCCUPATION": [ + "Go out to full time employment", + "Go out to part time employment (<25hrs)", + "Study at college or university", + "Full-time housework", + "Retired", + "Unemployed", + "Work for pay at home", + "Other", + ], + } + ) + occup.to_excel(excel_writer=tmp_path / "IXI.xls", sheet_name="Occupation") + assert ( + assert_series_equal( + _get_mapping(tmp_path, ClinicalDataMapping.OCCUPATION), + occup.set_index("ID")["OCCUPATION"], + ) + is None + ) + + +def test_get_qualification_mapping_success(tmp_path): + qualif = pd.DataFrame( + { + "ID": [1, 2, 3, 4, 5], + "QUALIFICATION": [ + "No qualifications", + "O - levels, GCSEs, or CSEs", + "A - levels", + "Further education e.g.City & Guilds / NVQs", + "University or Polytechnic degree", + ], + } + ) + qualif.to_excel(excel_writer=tmp_path / "IXI.xls", sheet_name="Qualification") + assert ( + assert_series_equal( + _get_mapping(tmp_path, ClinicalDataMapping.QUALIFICATION), + qualif.set_index("ID")["QUALIFICATION"], + ) + is None + ) + + +def test_get_mapping_fileerror(tmp_path): + qualif = pd.DataFrame( + { + "ID": [1, 2, 3, 4, 5], + "QUALIFICATION": [ + "No qualifications", + "O - levels, GCSEs, or CSEs", + "A - levels", + "Further education e.g.City & Guilds / NVQs", + "University or Polytechnic degree", + ], + } + ) + qualif.to_excel(excel_writer=tmp_path / "IXI2.xls", 
sheet_name="Qualification") + + with pytest.raises( + FileNotFoundError, + match=f"Clinical data stored in the folder {tmp_path} is expected to be an excel file named 'IXI.xls'. " + f"In case the file downloaded from the IXI website changed format, please do not hesitate to report to us !", + ): + _get_mapping(tmp_path, ClinicalDataMapping.QUALIFICATION) + + +def test_get_mapping_keyerror(tmp_path): + qualif = pd.DataFrame( + { + "ID": [1, 2, 3, 4, 5], + "QUALIFICATION": [ + "No qualifications", + "O - levels, GCSEs, or CSEs", + "A - levels", + "Further education e.g.City & Guilds / NVQs", + "University or Polytechnic degree", + ], + } + ) + qualif.to_excel(excel_writer=tmp_path / "IXI.xls", sheet_name="Qualification_error") + + with pytest.raises( + ValueError, + match=f"Qualification mapping is expected to be contained in a sheet called Qualification coming from the clinical data excel. " + f"Possibilities are supposed to be described in a QUALIFICATION column associated to keys from the 'ID' column. " + f"In case the file downloaded from the IXI website changed format, please do not hesitate to report to us !", + ): + _get_mapping(tmp_path, ClinicalDataMapping.QUALIFICATION) + + +def test_get_mapping_valueerror(tmp_path): + qualif = pd.DataFrame( + { + "ID": [1, 2, 3, 4, 5], + "QUALIFICATION_error": [ + "No qualifications", + "O - levels, GCSEs, or CSEs", + "A - levels", + "Further education e.g.City & Guilds / NVQs", + "University or Polytechnic degree", + ], + } + ) + qualif.to_excel(excel_writer=tmp_path / "IXI.xls", sheet_name="Qualification") + + with pytest.raises( + ValueError, + match=f"Qualification mapping is expected to be contained in a sheet called Qualification coming from the clinical data excel. " + f"Possibilities are supposed to be described in a QUALIFICATION column associated to keys from the 'ID' column. 
" + f"In case the file downloaded from the IXI website changed format, please do not hesitate to report to us !", + ): + _get_mapping(tmp_path, ClinicalDataMapping.QUALIFICATION) + + +def test_write_json_image(tmp_path): + _write_json_image(tmp_path / "test.json", hospital="Guys", field="1.5") + with open(tmp_path / "test.json", "r") as f: + data = json.load(f) + assert data["InstitutionName"] == "Guys" + assert data["MagneticFieldStrength (T)"] == "1.5" + + +def test_write_subject_no_dti(tmp_path): + df = image_dataframe_builder(tmp_path) + bids_dir = tmp_path / "BIDS" + file_path = ( + bids_dir + / f"sub-{df['subject'][0]}" + / "ses-M000" + / "anat" + / f"sub-{df['subject'][0]}_ses-M000_T1w" + ) + _write_subject_no_dti(df, bids_dir) + json_files = list(bids_dir.rglob(f"sub-{df['subject'][0]}*.json")) + nii_files = list(bids_dir.rglob(f"sub-{df['subject'][0]}*.nii.gz")) + assert len(json_files) == 1 and json_files[0] == Path(f"{file_path}.json") + assert len(nii_files) == 1 and nii_files[0] == Path(f"{file_path}.nii.gz") + + +def test_write_subject_no_dti_empty(tmp_path): + bids_dir = tmp_path / "BIDS" + bids_dir.mkdir() + _write_subject_no_dti(pd.DataFrame(), bids_dir) + assert not list(bids_dir.iterdir()) + + +def test_find_subject_dti_data(tmp_path): + (tmp_path / "IXI001-Guys-1234-T1.nii.gz").touch() + (tmp_path / "IXI001-Guys-1234-DTI").touch() + (tmp_path / "IXI001-Guys-DTI.nii.gz").touch() + tmp_image = tmp_path / "IXI001-Guys-1234-DTI-00.nii.gz" + tmp_image.touch() + list_dti = _find_subject_dti_data(data_directory=tmp_path, subject="IXI001") + assert len(list_dti) == 1 and list_dti[0] == tmp_image + + +def test_identify_expected_modalities(tmp_path): + (tmp_path / "IXI-DTI").mkdir() + (tmp_path / "IXIdti").mkdir() + (tmp_path / "foo-bar").mkdir() + assert _identify_expected_modalities(tmp_path) == ["DTI"] + + +def test_write_scans_not_empty(tmp_path): + (tmp_path / "sub-IXI001" / "ses-M000" / "anat").mkdir(parents=True) + (tmp_path / "sub-IXI001" / 
"ses-M000" / "anat" / "sub-IXI001_T1w.nii.gz").touch() + write_scans(tmp_path, participant="IXI001") + tsv_files = list(tmp_path.rglob("*.tsv")) + file_path = tmp_path / "sub-IXI001" / "ses-M000" / "sub-IXI001_ses-M000_scans.tsv" + assert len(tsv_files) == 1 and tsv_files[0] == file_path + assert ( + assert_frame_equal( + pd.read_csv(file_path, sep="\t"), + pd.DataFrame({"filename": ["anat/sub-IXI001_T1w.nii.gz"]}), + ) + is None + ) + + +def formatted_clinical_data_builder() -> pd.DataFrame: + return pd.DataFrame( + { + "source_id": ["IXI001"], + "session_id": ["ses-M000"], + "acq_time": ["2024-08-23"], + "sex": ["female"], + "ethnicity": ["Other"], + "marital status": ["Single"], + "occupation": ["Other"], + "qualification": ["A - levels"], + "date of birth": ["2000-01-01"], + "weight": [80], + } + ) + + +def test_write_sessions(tmp_path): + clinical = formatted_clinical_data_builder() + (tmp_path / "sub-IXI001").mkdir() + write_sessions(tmp_path, clinical, "IXI001") + tsv_files = list(tmp_path.rglob("*.tsv")) + file_path = tmp_path / "sub-IXI001" / "sub-IXI001_sessions.tsv" + assert len(tsv_files) == 1 and tsv_files[0] == file_path + assert ( + assert_frame_equal( + pd.read_csv(file_path, sep="\t"), + clinical[["source_id", "session_id", "acq_time"]], + ) + is None + ) + + +def test_write_participants(tmp_path): + clinical = formatted_clinical_data_builder() + expected = clinical.copy() + write_participants(tmp_path, clinical, ["IXI001", "IXI002"]) + expected.drop(["acq_time", "session_id"], axis=1, inplace=True) + expected = pd.concat( + [expected, pd.DataFrame({col: ["n/a"] for col in expected.columns})] + ).reset_index(drop=True) + expected.loc[1, "source_id"] = "IXI002" + expected.loc[0, "weight"] = str(expected.loc[0, "weight"]) + tsv_files = list(tmp_path.rglob("*.tsv")) + assert len(tsv_files) == 1 and tsv_files[0] == tmp_path / "participants.tsv" + assert ( + assert_frame_equal( + pd.read_csv(tmp_path / "participants.tsv", sep="\t", na_filter=False), + 
expected, + ) + is None + ) + + +@patch("clinica.iotools.converters.ixi_to_bids.ixi_to_bids_utils.cprint") +def test_check_modalities(mock_cprint, tmp_path): + (tmp_path / "IXI-DTI").mkdir() + (tmp_path / "IXI-DTI" / "IXI001-DTI-00.nii.gz").touch() + (tmp_path / "IXI-T1").mkdir() + (tmp_path / "IXI-T1" / "IXI001-T1.nii.gz").touch() + (tmp_path / "IXI-T1" / "IXI002-T1.nii.gz").touch() + (tmp_path / "IXIT1").mkdir() + + message = ( + f"Modalities : dti , T1w were identified inside {tmp_path} for conversion.\n" + "Some subjects do not have data for the following modalities :\n" + "IXI002 : dti\n" + ) + + check_modalities(tmp_path, ["IXI001", "IXI002"]) + mock_cprint.assert_called_once_with(message) diff --git a/test/unittests/iotools/test_bids_utils.py b/test/unittests/iotools/test_bids_utils.py index 3135090fd..6fe0ca75d 100644 --- a/test/unittests/iotools/test_bids_utils.py +++ b/test/unittests/iotools/test_bids_utils.py @@ -52,6 +52,7 @@ (StudyName.OASIS3, "OAS30001", "sub-OAS30001"), (StudyName.HABS, "P_INIBUB", "sub-HABSINIBUB"), (StudyName.OASIS, "OAS1_0001_MR1", "sub-OASIS10001"), + (StudyName.IXI, "IXI001", "sub-IXI001"), ], ) def test_study_to_bids_id_passing(study, study_id, expected): @@ -60,6 +61,58 @@ def test_study_to_bids_id_passing(study, study_id, expected): assert bids_id_factory(study).from_original_study_id(study_id) == expected + +@pytest.mark.parametrize( + "study,study_id", + [ + (StudyName.ADNI, "001S0001"), + (StudyName.ADNI, "001_X_0001"), + (StudyName.ADNI, "foo_S_0001"), + (StudyName.NIFD, "1S0001"), + (StudyName.NIFD, "1_X_0001"), + (StudyName.NIFD, "foo_S_0001"), + (StudyName.AIBL, "10A"), + (StudyName.UKB, "0101001A"), + (StudyName.GENFI, "MAPT009?"), + (StudyName.OASIS3, "OAS3_0001"), + (StudyName.OASIS3, "OAS3001"), + (StudyName.HABS, "PINIBUB"), + (StudyName.HABS, "X_INIBUB"), + (StudyName.HABS, "P_INIBUB?"), + (StudyName.OASIS, "OAS10001MR1"), + (StudyName.OASIS, "OAS1_0001_MRI1"), + (StudyName.OASIS, "OAS1_001_MR1"), +
(StudyName.IXI, "IXI_001"), + (StudyName.IXI, "IXI0001"), + ], +) +def test_study_to_bids_id_value_error(study, study_id): + from clinica.iotools.bids_utils import bids_id_factory + + with pytest.raises(ValueError): + bids_id_factory(study).from_original_study_id(study_id) + + +@pytest.mark.parametrize( + "study,expected,bids_id", + [ + (StudyName.ADNI, "001_S_0001", "sub-ADNI001S0001"), + (StudyName.NIFD, "1_S_0001", "sub-NIFD1S0001"), + (StudyName.AIBL, "10", "sub-AIBL10"), + (StudyName.UKB, "0101001", "sub-UKB0101001"), + (StudyName.GENFI, "MAPT009", "sub-MAPT009"), + (StudyName.OASIS3, "OAS30001", "sub-OAS30001"), + (StudyName.HABS, "P_INIBUB", "sub-HABSINIBUB"), + # (StudyName.OASIS, "OAS1_0001_MR1", "sub-OASIS10001"), + # todo : check OASIS + (StudyName.IXI, "IXI001", "sub-IXI001"), + ], +) +def test_bids_to_study(study, bids_id, expected): + from clinica.iotools.bids_utils import bids_id_factory + + assert bids_id_factory(study).to_original_study_id(bids_id) == expected + + def create_clinical_data(tmp_path: Path, study_name: StudyName) -> Path: spec_df = pd.DataFrame( {