Skip to content

Commit

Permalink
[TEST][REF] Move functions specific to OASIS1 converter in a dedicate…
Browse files Browse the repository at this point in the history
…d utils file and add unit tests (#1313)

* Move functions from bids_utils to oasis_to_bids_utils + create test script

* WIP 1

* Remove unused optional argument subj_to_remove

* WIP2

* Finish tests

* Fix

* Add last test

* Changes upon suggestions

* Changes 2
  • Loading branch information
AliceJoubert authored Oct 9, 2024
1 parent 9ed92e5 commit a7fa12e
Show file tree
Hide file tree
Showing 4 changed files with 349 additions and 193 deletions.
186 changes: 1 addition & 185 deletions clinica/iotools/bids_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -464,146 +464,6 @@ def create_participants_df(
return participant_df


def create_sessions_dict_oasis(
    clinical_data_dir: Path,
    bids_dir: Path,
    study_name: StudyName,
    clinical_specifications_folder: Path,
    bids_ids: list[str],
    name_column_ids: str,
    subj_to_remove: Optional[list[str]] = None,
    participants_df: Optional[pd.DataFrame] = None,
) -> dict:
    """Extract the information regarding the sessions and store it in a dictionary.

    Parameters
    ----------
    clinical_data_dir : Path
        The path to the input folder.

    bids_dir : Path
        The path to the BIDS directory.

    study_name : StudyName
        The name of the study (Ex: ADNI).

    clinical_specifications_folder : Path
        The path to the folder containing the ``sessions.tsv`` specification file.

    bids_ids : list of str
        The list of bids ids.

    name_column_ids : str
        The name of the column where the subject ids are stored.

    subj_to_remove : list of str, optional
        The list of subject IDs to remove. No message is emitted when one of
        these subjects has no BIDS counterpart.

    participants_df : pd.DataFrame, optional
        A pandas dataframe that contains the participants data (required for OASIS3 only).

    Returns
    -------
    dict :
        Nested dictionary of the form
        ``{bids_id: {session_label: {"session_id": ..., <BIDS field>: value, ...}}}``.

    Raises
    ------
    ValueError
        If a clinical file referenced by the specifications is neither
        a ``.xlsx`` nor a ``.csv`` file.
    """
    import numpy as np

    from clinica.utils.stream import cprint

    subj_to_remove = subj_to_remove or []
    sessions = pd.read_csv(clinical_specifications_folder / "sessions.tsv", sep="\t")
    sessions_fields = sessions[study_name.value]
    field_location = sessions[f"{study_name.value} location"]
    sessions_fields_bids = sessions["BIDS CLINICA"]
    sessions_dict = {}
    # Several fields usually live in the same clinical file: read each one only once.
    loaded_files = {}

    for field, location, bids_field in zip(
        sessions_fields, field_location, sessions_fields_bids
    ):
        # Fields that are not available for this study are left empty in the specifications.
        if pd.isnull(field):
            continue

        # The location is written as "<file name>" or "<file name>/<sheet name>".
        parts = location.split("/")
        file_name = parts[0]
        sheet = parts[1] if len(parts) > 1 else ""
        if (file_name, sheet) not in loaded_files:
            file_path = clinical_data_dir / file_name
            file_ext = os.path.splitext(file_name)[1]
            if file_ext == ".xlsx":
                # Without an explicit sheet name, fall back to the first sheet.
                loaded_files[(file_name, sheet)] = pd.read_excel(
                    file_path, sheet_name=sheet or 0
                )
            elif file_ext == ".csv":
                loaded_files[(file_name, sheet)] = pd.read_csv(file_path)
            else:
                raise ValueError(
                    f"Unknown file extension {file_ext}. Expecting either .xlsx or .csv."
                )
        file_to_read = loaded_files[(file_name, sheet)]

        for r in range(len(file_to_read)):
            subj_id = file_to_read.iloc[r][name_column_ids]
            # Integer identifiers have to be sliced as text below.
            if isinstance(subj_id, (int, np.integer)):
                subj_id = str(subj_id)
            # Rebuild the alphanumeric id embedded in the BIDS id from the
            # characters of the source id (e.g. "OAS1_0001..." -> "OASIS10001").
            subj_id_alpha = f"{subj_id[0:3]}IS{subj_id[3]}{subj_id[5:9]}"

            subj_bids = next((b for b in bids_ids if subj_id_alpha in b), None)
            if subj_bids is None:
                # Stay silent for the subjects that were excluded on purpose.
                if subj_id not in subj_to_remove:
                    cprint(
                        f"{field} for {subj_id} not found in the BIDS converted.",
                        "info",
                    )
                continue

            for s in get_bids_sess_list(bids_dir / subj_bids):
                s_name = s.replace("ses-", "")
                if study_name == StudyName.OASIS3:
                    row = file_to_read[
                        file_to_read["MR ID"].str.startswith(subj_id)
                        & file_to_read["MR ID"].str.endswith(s_name)
                    ].iloc[0]
                else:
                    row = file_to_read.iloc[r]
                session = sessions_dict.setdefault(subj_bids, {}).setdefault(
                    s_name, {"session_id": s}
                )
                session[bids_field] = row[field]
                # Calculate the difference in months for OASIS3 only
                if study_name == StudyName.OASIS3 and bids_field == "age":
                    baseline_age = participants_df.loc[
                        participants_df["participant_id"] == subj_bids, "age_bl"
                    ]
                    # float() on a one-element Series is deprecated in pandas:
                    # extract the scalar explicitly.
                    diff_years = float(session["age"]) - float(baseline_age.iloc[0])
                    session["diff_months"] = round(diff_years * 12)

    return sessions_dict


def create_scans_dict(
clinical_data_dir: Path,
study_name: StudyName,
Expand Down Expand Up @@ -836,51 +696,6 @@ def write_modality_agnostic_files(
_write_bidsignore(bids_dir)


# todo : move to oasis utils ?
def write_sessions_tsv(bids_dir: Path, sessions_dict: dict) -> None:
    """Create <participant_id>_sessions.tsv files.

    Basically writes the content of the dictionary built by
    `clinica.iotools.bids_utils.create_sessions_dict_oasis` in several TSV
    files (one per subject folder) following the BIDS specification.

    Parameters
    ----------
    bids_dir : Path
        The path to the BIDS directory. Every ``sub-*`` folder found
        directly inside it receives a ``<folder name>_sessions.tsv`` file.

    sessions_dict : dict
        Dictionary containing sessions metadata, keyed by subject folder name
        and then by session label.

        .. note::
            This is the output of the function
            `clinica.iotools.bids_utils.create_sessions_dict_oasis`.

    See also
    --------
    create_sessions_dict_oasis
    write_scans_tsv
    """
    for subject_path in bids_dir.glob("sub-*"):
        # A stray file matching the pattern cannot hold a sessions file.
        if not subject_path.is_dir():
            continue
        subject_sessions = sessions_dict.get(subject_path.name)
        if subject_sessions:
            session_df = pd.DataFrame.from_dict(subject_sessions, orient="index")
            # Move the last column to the front; "session_id" becomes the index below.
            cols = session_df.columns.tolist()
            session_df = session_df[cols[-1:] + cols[:-1]]
        else:
            # Missing or empty metadata: still write a file with a single
            # baseline row, using the same "ses-" prefixed identifier as the
            # rows produced from real session folders.
            print(f"No session data available for {subject_path}")
            session_df = pd.DataFrame({"session_id": ["ses-M000"]})
        session_df = session_df.set_index("session_id").fillna("n/a")
        session_df.to_csv(
            subject_path / f"{subject_path.name}_sessions.tsv",
            sep="\t",
            encoding="utf8",
        )


def _get_pet_tracer_from_filename(filename: str) -> Tracer:
"""Return the PET tracer from the provided filename.
Expand Down Expand Up @@ -1227,6 +1042,7 @@ def identify_modality(filename: str) -> Optional[str]:
return np.nan


# todo : use more ?
def write_to_tsv(df: pd.DataFrame, buffer: Union[Path, BinaryIO]) -> None:
"""Save dataframe as a BIDS-compliant TSV file.
Expand Down
12 changes: 4 additions & 8 deletions clinica/iotools/converters/oasis_to_bids/oasis_to_bids.py
Original file line number Diff line number Diff line change
Expand Up @@ -108,20 +108,16 @@ def _create_sessions_tsv(
bids_dir: Path,
bids_ids: list[str],
) -> dict:
from clinica.iotools.bids_utils import (
StudyName,
create_sessions_dict_oasis,
write_sessions_tsv,
)
from .oasis_to_bids_utils import create_sessions_dict, write_sessions_tsv

sessions_dict = create_sessions_dict_oasis(
sessions_dict = create_sessions_dict(
clinical_data_dir=clinical_data_dir,
bids_dir=bids_dir,
study_name=StudyName.OASIS,
clinical_specifications_folder=Path(__file__).parents[1] / "specifications",
bids_ids=bids_ids,
name_column_ids="ID",
)

# todo : when tested add to create_sessions_dict bc specific to oasis1
for bids_id in bids_ids:
sessions_dict[bids_id]["M000"]["diagnosis"] = (
"AD" if sessions_dict[bids_id]["M000"]["diagnosis"] > 0 else "CN"
Expand Down
148 changes: 148 additions & 0 deletions clinica/iotools/converters/oasis_to_bids/oasis_to_bids_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,148 @@
import os
from pathlib import Path
from typing import Iterable

import numpy as np
import pandas as pd

from clinica.iotools.bids_utils import StudyName, get_bids_sess_list
from clinica.utils.stream import cprint

__all__ = ["create_sessions_dict", "write_sessions_tsv"]


def create_sessions_dict(
    clinical_data_dir: Path,
    bids_dir: Path,
    clinical_specifications_folder: Path,
    bids_ids: Iterable[str],
) -> dict:
    """Extract the information regarding the sessions and store it in a dictionary (session M000 only).

    Parameters
    ----------
    clinical_data_dir : Path
        The path to the input folder.

    bids_dir : Path
        The path to the BIDS directory.

    clinical_specifications_folder : Path
        The path to the folder containing the ``sessions.tsv`` specification file.

    bids_ids : iterable of str
        The bids ids. Any iterable is accepted, including one-shot generators.

    Returns
    -------
    dict :
        Nested dictionary of the form
        ``{bids_id: {session_label: {"session_id": ..., <BIDS field>: value, ...}}}``.
        Subjects of the clinical data without a matching BIDS id are skipped.

    Raises
    ------
    ValueError
        If a clinical file referenced by the specifications is neither
        a ``.xlsx`` nor a ``.csv`` file.
    """
    study = StudyName.OASIS.value
    # The ids are scanned once per clinical row: materialize them so that
    # a generator is not exhausted after the first row.
    bids_ids = list(bids_ids)
    sessions = pd.read_csv(clinical_specifications_folder / "sessions.tsv", sep="\t")
    sessions_fields = sessions[study]
    field_location = sessions[f"{study} location"]
    sessions_fields_bids = sessions["BIDS CLINICA"]
    sessions_dict = {}
    # Several fields usually live in the same clinical file: read each one only once.
    loaded_files = {}

    for field, location, bids_field in zip(
        sessions_fields, field_location, sessions_fields_bids
    ):
        # Fields that are not available for this study are left empty in the specifications.
        if pd.isnull(field):
            continue

        # The location is written as "<file name>" or "<file name>/<sheet name>".
        parts = location.split("/")
        file_name = parts[0]
        sheet = parts[1] if len(parts) > 1 else ""
        if (file_name, sheet) not in loaded_files:
            file_path = clinical_data_dir / file_name
            file_ext = os.path.splitext(file_name)[1]
            if file_ext == ".xlsx":
                # Without an explicit sheet name, fall back to the first sheet.
                loaded_files[(file_name, sheet)] = pd.read_excel(
                    file_path, sheet_name=sheet or 0
                )
            elif file_ext == ".csv":
                loaded_files[(file_name, sheet)] = pd.read_csv(file_path)
            else:
                raise ValueError(
                    f"Unknown file extension {file_ext}. Expecting either .xlsx or .csv."
                )
        file_to_read = loaded_files[(file_name, sheet)]

        for r in range(len(file_to_read)):
            row = file_to_read.iloc[r]
            subj_id = row["ID"]
            # Integer identifiers have to be sliced as text below.
            if isinstance(subj_id, (int, np.integer)):
                subj_id = str(subj_id)
            # Rebuild the alphanumeric id embedded in the BIDS id from the
            # characters of the source id (e.g. "OAS1_0001..." -> "OASIS10001").
            subj_id_alpha = f"{subj_id[0:3]}IS{subj_id[3]}{subj_id[5:9]}"

            subj_bids = next((b for b in bids_ids if subj_id_alpha in b), None)
            if subj_bids is None:
                continue

            for s in get_bids_sess_list(bids_dir / subj_bids):
                s_name = s.replace("ses-", "")
                session = sessions_dict.setdefault(subj_bids, {}).setdefault(
                    s_name, {"session_id": s}
                )
                session[bids_field] = row[field]

    return sessions_dict


def write_sessions_tsv(bids_dir: Path, sessions_dict: dict) -> None:
    """Create <participant_id>_sessions.tsv files.

    Basically writes the content of the dictionary built by
    `create_sessions_dict` in several TSV files (one per subject folder)
    following the BIDS specification.

    Parameters
    ----------
    bids_dir : Path
        The path to the BIDS directory. Every ``sub-*`` folder found
        directly inside it receives a ``<folder name>_sessions.tsv`` file.

    sessions_dict : dict
        Dictionary containing sessions metadata, keyed by subject folder name
        and then by session label.

        .. note::
            This is the output of the function `create_sessions_dict`.

    See also
    --------
    create_sessions_dict
    """
    for subject_path in bids_dir.glob("sub-*"):
        # A stray file matching the pattern cannot hold a sessions file.
        if not subject_path.is_dir():
            continue
        subject_sessions = sessions_dict.get(subject_path.name)
        if subject_sessions:
            session_df = pd.DataFrame.from_dict(subject_sessions, orient="index")
            # Move the last column to the front; "session_id" becomes the index below.
            cols = session_df.columns.tolist()
            session_df = session_df[cols[-1:] + cols[:-1]]
        else:
            # Missing or empty metadata: still write a file with a single
            # baseline row, using the same "ses-" prefixed identifier as the
            # rows produced from real session folders.
            print(f"No session data available for {subject_path}")
            session_df = pd.DataFrame({"session_id": ["ses-M000"]})
        session_df = session_df.set_index("session_id").fillna("n/a")
        session_df.to_csv(
            subject_path / f"{subject_path.name}_sessions.tsv",
            sep="\t",
            encoding="utf8",
        )
Loading

0 comments on commit a7fa12e

Please sign in to comment.