[TEST][REF] Move functions specific to OASIS1 converter into a dedicated utils file and add unit tests (#1313)

* Move functions from bids_utils to oasis_to_bids_utils + create test script * WIP 1 * Remove unused optional argument subj_to_remove * WIP2 * Finish tests * Fix * Add last test * Changes upon suggestions * Changes 2
aramis-lab · Oct 9, 2024 · a7fa12e · a7fa12e
1 parent 9ed92e5
commit a7fa12e
Show file tree

Hide file tree

Showing 4 changed files with 349 additions and 193 deletions.
diff --git a/clinica/iotools/bids_utils.py b/clinica/iotools/bids_utils.py
@@ -464,146 +464,6 @@ def create_participants_df(
     return participant_df
 
 
-def create_sessions_dict_oasis(
-    clinical_data_dir: Path,
-    bids_dir: Path,
-    study_name: StudyName,
-    clinical_specifications_folder: Path,
-    bids_ids: list[str],
-    name_column_ids: str,
-    subj_to_remove: Optional[list[str]] = None,
-    participants_df: Optional[pd.DataFrame] = None,
-) -> dict:
-    """Extract the information regarding the sessions and store them in a dictionary (session M00 only).
-
-    Parameters
-    ----------
-    clinical_data_dir : Path
-        The path to the input folder.
-
-    bids_dir : Path
-        The path to the BIDS directory.
-
-    study_name : StudyName
-        The name of the study (Ex: ADNI).
-
-    clinical_specifications_folder : Path
-        The path to the clinical file.
-
-    bids_ids : list of str
-        The list of bids ids.
-
-    name_column_ids : str
-        The name of the column where the subject ids are stored.
-
-    subj_to_remove : list of str, optional
-        The list of subject IDs to remove.
-
-    participants_df : pd.DataFrame, optional
-        A pandas dataframe that contains the participants data (required for OASIS3 only).
-
-    Returns
-    -------
-    dict :
-        Session dict.
-    """
-    import numpy as np
-
-    from clinica.utils.stream import cprint
-
-    subj_to_remove = subj_to_remove or []
-    location = f"{study_name.value} location"
-    sessions = pd.read_csv(clinical_specifications_folder / "sessions.tsv", sep="\t")
-    sessions_fields = sessions[study_name.value]
-    field_location = sessions[location]
-    sessions_fields_bids = sessions["BIDS CLINICA"]
-    fields_dataset = []
-    fields_bids = []
-    sessions_dict = {}
-
-    for i in range(0, len(sessions_fields)):
-        if not pd.isnull(sessions_fields[i]):
-            fields_bids.append(sessions_fields_bids[i])
-            fields_dataset.append(sessions_fields[i])
-
-    for i in range(0, len(sessions_fields)):
-        # If the i-th field is available
-        if not pd.isnull(sessions_fields[i]):
-            # Load the file
-            tmp = field_location[i].split("/")
-            location = tmp[0]
-            if len(tmp) > 1:
-                sheet = tmp[1]
-            else:
-                sheet = ""
-
-            file_to_read_path = clinical_data_dir / location
-            file_ext = os.path.splitext(location)[1]
-            if file_ext == ".xlsx":
-                file_to_read = pd.read_excel(file_to_read_path, sheet_name=sheet)
-            elif file_ext == ".csv":
-                file_to_read = pd.read_csv(file_to_read_path)
-            else:
-                raise ValueError(
-                    f"Unknown file extension {file_ext}. Expecting either .xlsx or .csv."
-                )
-
-            for r in range(0, len(file_to_read.values)):
-                # Extracts the subject ids columns from the dataframe
-                subj_id = file_to_read.iloc[r][name_column_ids]
-                if hasattr(subj_id, "dtype"):
-                    if subj_id.dtype == np.int64:
-                        subj_id = str(subj_id)
-                # Removes all the - from
-                subj_id_alpha = str(subj_id[0:3] + "IS" + subj_id[3] + subj_id[5:9])
-
-                # Extract the corresponding BIDS id and create the output file if doesn't exist
-                subj_bids = [s for s in bids_ids if subj_id_alpha in s]
-                if len(subj_bids) == 0:
-                    # If the subject is not an excluded one
-                    if subj_id not in subj_to_remove:
-                        cprint(
-                            f"{sessions_fields[i]} for {subj_id} not found in the BIDS converted.",
-                            "info",
-                        )
-                else:
-                    subj_bids = subj_bids[0]
-                    subj_dir = bids_dir / subj_bids
-                    session_names = get_bids_sess_list(subj_dir)
-                    for s in session_names:
-                        s_name = s.replace("ses-", "")
-                        if study_name == StudyName.OASIS3:
-                            row = file_to_read[
-                                file_to_read["MR ID"].str.startswith(subj_id)
-                                & file_to_read["MR ID"].str.endswith(s_name)
-                            ].iloc[0]
-                        else:
-                            row = file_to_read.iloc[r]
-                        if subj_bids not in sessions_dict:
-                            sessions_dict.update({subj_bids: {}})
-                        if s_name not in sessions_dict[subj_bids].keys():
-                            sessions_dict[subj_bids].update({s_name: {"session_id": s}})
-                        (sessions_dict[subj_bids][s_name]).update(
-                            {sessions_fields_bids[i]: row[sessions_fields[i]]}
-                        )
-                        # Calculate the difference in months for OASIS3 only
-                        if (
-                            study_name == StudyName.OASIS3
-                            and sessions_fields_bids[i] == "age"
-                        ):
-                            diff_years = (
-                                float(sessions_dict[subj_bids][s_name]["age"])
-                                - participants_df[
-                                    participants_df["participant_id"] == subj_bids
-                                ]["age_bl"]
-                            )
-                            (sessions_dict[subj_bids][s_name]).update(
-                                {"diff_months": round(float(diff_years) * 12)}
-                            )
-
-    return sessions_dict
-
-
 def create_scans_dict(
     clinical_data_dir: Path,
     study_name: StudyName,
@@ -836,51 +696,6 @@ def write_modality_agnostic_files(
     _write_bidsignore(bids_dir)
 
 
-# todo : move to oasis utils ?
-def write_sessions_tsv(bids_dir: Path, sessions_dict: dict) -> None:
-    """Create <participant_id>_sessions.tsv files.
-
-    Basically writes the content of the function
-    `clinica.iotools.bids_utils.create_sessions_dict` in several TSV files
-    following the BIDS specification.
-
-    Parameters
-    ----------
-    bids_dir : Path
-        The path to the BIDS directory.
-
-    sessions_dict : dict
-        Dictionary containing sessions metadata.
-
-        .. note::
-            This is the output of the function
-            `clinica.iotools.bids_utils.create_sessions_dict`.
-
-    See also
-    --------
-    create_sessions_dict
-    write_scans_tsv
-    """
-    for subject_path in bids_dir.glob("sub-*"):
-        if subject_path.name in sessions_dict:
-            session_df = pd.DataFrame.from_dict(
-                sessions_dict[subject_path.name], orient="index"
-            )
-            cols = session_df.columns.tolist()
-            cols = cols[-1:] + cols[:-1]
-            session_df = session_df[cols]
-        else:
-            print(f"No session data available for {subject_path}")
-            session_df = pd.DataFrame(columns=["session_id"])
-            session_df["session_id"] = pd.Series("M000")
-        session_df = session_df.set_index("session_id").fillna("n/a")
-        session_df.to_csv(
-            subject_path / f"{subject_path.name}_sessions.tsv",
-            sep="\t",
-            encoding="utf8",
-        )
-
-
 def _get_pet_tracer_from_filename(filename: str) -> Tracer:
     """Return the PET tracer from the provided filename.
 
@@ -1227,6 +1042,7 @@ def identify_modality(filename: str) -> Optional[str]:
         return np.nan
 
 
+# todo : use more ?
 def write_to_tsv(df: pd.DataFrame, buffer: Union[Path, BinaryIO]) -> None:
     """Save dataframe as a BIDS-compliant TSV file.
 

diff --git a/clinica/iotools/converters/oasis_to_bids/oasis_to_bids.py b/clinica/iotools/converters/oasis_to_bids/oasis_to_bids.py
@@ -108,20 +108,16 @@ def _create_sessions_tsv(
         bids_dir: Path,
         bids_ids: list[str],
     ) -> dict:
-        from clinica.iotools.bids_utils import (
-            StudyName,
-            create_sessions_dict_oasis,
-            write_sessions_tsv,
-        )
+        from .oasis_to_bids_utils import create_sessions_dict, write_sessions_tsv
 
-        sessions_dict = create_sessions_dict_oasis(
+        sessions_dict = create_sessions_dict(
             clinical_data_dir=clinical_data_dir,
             bids_dir=bids_dir,
-            study_name=StudyName.OASIS,
             clinical_specifications_folder=Path(__file__).parents[1] / "specifications",
             bids_ids=bids_ids,
-            name_column_ids="ID",
         )
+
+        # todo : once tested, move this into create_sessions_dict because it is specific to OASIS1
         for bids_id in bids_ids:
             sessions_dict[bids_id]["M000"]["diagnosis"] = (
                 "AD" if sessions_dict[bids_id]["M000"]["diagnosis"] > 0 else "CN"

diff --git a/clinica/iotools/converters/oasis_to_bids/oasis_to_bids_utils.py b/clinica/iotools/converters/oasis_to_bids/oasis_to_bids_utils.py
@@ -0,0 +1,148 @@
+import os
+from pathlib import Path
+from typing import Iterable
+
+import numpy as np
+import pandas as pd
+
+from clinica.iotools.bids_utils import StudyName, get_bids_sess_list
+from clinica.utils.stream import cprint
+
+__all__ = ["create_sessions_dict", "write_sessions_tsv"]
+
+
+def create_sessions_dict(
+    clinical_data_dir: Path,
+    bids_dir: Path,
+    clinical_specifications_folder: Path,
+    bids_ids: Iterable[str],
+) -> dict:
+    """Extract the information regarding the sessions and store them in a dictionary (session M000 only).
+
+    Parameters
+    ----------
+    clinical_data_dir : Path
+        The path to the input folder.
+
+    bids_dir : Path
+        The path to the BIDS directory.
+
+    clinical_specifications_folder : Path
+        The path to the folder holding the clinical specification files (`sessions.tsv` is read from it).
+
+    bids_ids : Iterable of str
+        The list of bids ids.
+
+    Returns
+    -------
+    dict :
+        Session dict.
+    """
+
+    location = f"{StudyName.OASIS.value} location"
+    sessions = pd.read_csv(clinical_specifications_folder / "sessions.tsv", sep="\t")
+    sessions_fields = sessions[StudyName.OASIS.value]
+    field_location = sessions[location]
+    sessions_fields_bids = sessions["BIDS CLINICA"]
+    fields_dataset = []
+    fields_bids = []
+    sessions_dict = {}
+
+    for i in range(0, len(sessions_fields)):
+        if not pd.isnull(sessions_fields[i]):
+            fields_bids.append(sessions_fields_bids[i])
+            fields_dataset.append(sessions_fields[i])
+
+    for i in range(0, len(sessions_fields)):
+        # If the i-th field is available
+        if not pd.isnull(sessions_fields[i]):
+            # Load the file
+            tmp = field_location[i].split("/")
+            location = tmp[0]
+            if len(tmp) > 1:
+                sheet = tmp[1]
+            else:
+                sheet = ""
+
+            file_to_read_path = clinical_data_dir / location
+            file_ext = os.path.splitext(location)[1]
+            if file_ext == ".xlsx":
+                file_to_read = pd.read_excel(file_to_read_path, sheet_name=sheet)
+            elif file_ext == ".csv":
+                file_to_read = pd.read_csv(file_to_read_path)
+            else:
+                raise ValueError(
+                    f"Unknown file extension {file_ext}. Expecting either .xlsx or .csv."
+                )
+
+            for r in range(0, len(file_to_read.values)):
+                # Extracts the subject ids columns from the dataframe
+                subj_id = file_to_read.iloc[r]["ID"]
+                if hasattr(subj_id, "dtype"):
+                    if subj_id.dtype == np.int64:
+                        subj_id = str(subj_id)
+                # Build the ID matched against BIDS ids: chars 0-2 + "IS" + char 3 + chars 5-8 (e.g. "OAS1_0001_MR1" -> "OASIS10001")
+                subj_id_alpha = str(subj_id[0:3] + "IS" + subj_id[3] + subj_id[5:9])
+
+                # Keep the first BIDS id containing this ID; rows without any match are skipped
+                subj_bids = [s for s in bids_ids if subj_id_alpha in s]
+                if subj_bids:
+                    subj_bids = subj_bids[0]
+                    subj_dir = bids_dir / subj_bids
+                    session_names = get_bids_sess_list(subj_dir)
+                    for s in session_names:
+                        s_name = s.replace("ses-", "")
+                        row = file_to_read.iloc[r]
+                        if subj_bids not in sessions_dict:
+                            sessions_dict.update({subj_bids: {}})
+                        if s_name not in sessions_dict[subj_bids].keys():
+                            sessions_dict[subj_bids].update({s_name: {"session_id": s}})
+                        (sessions_dict[subj_bids][s_name]).update(
+                            {sessions_fields_bids[i]: row[sessions_fields[i]]}
+                        )
+
+    return sessions_dict
+
+
+def write_sessions_tsv(bids_dir: Path, sessions_dict: dict) -> None:
+    """Create <participant_id>_sessions.tsv files.
+
+    Basically writes the content of the function
+    `create_sessions_dict` (defined in this module) in several TSV files
+    following the BIDS specification.
+
+    Parameters
+    ----------
+    bids_dir : Path
+        The path to the BIDS directory.
+
+    sessions_dict : dict
+        Dictionary containing sessions metadata.
+
+        .. note::
+            This is the output of the function
+            `create_sessions_dict` defined in this module.
+
+    See also
+    --------
+    create_sessions_dict
+    write_scans_tsv
+    """
+    for subject_path in bids_dir.glob("sub-*"):
+        if subject_path.name in sessions_dict:
+            session_df = pd.DataFrame.from_dict(
+                sessions_dict[subject_path.name], orient="index"
+            )
+            cols = session_df.columns.tolist()
+            cols = cols[-1:] + cols[:-1]
+            session_df = session_df[cols]
+        else:
+            print(f"No session data available for {subject_path}")
+            session_df = pd.DataFrame(columns=["session_id"])
+            session_df["session_id"] = pd.Series("M000")
+        session_df = session_df.set_index("session_id").fillna("n/a")
+        session_df.to_csv(
+            subject_path / f"{subject_path.name}_sessions.tsv",
+            sep="\t",
+            encoding="utf8",
+        )