From 09fc59c2840dec5856ca6414974d428bfdbf4e66 Mon Sep 17 00:00:00 2001 From: AliceJoubert <158147135+AliceJoubert@users.noreply.github.com> Date: Fri, 25 Oct 2024 08:52:11 +0200 Subject: [PATCH] [TEST] AIBL-to-BIDS : add unit tests to functions related to creating sessions.tsv files (#1347) * wip1 * wip2 * Add tests for load_spec * Ref/add test compute_exam_date_from_bl * End of first proposition * Small fix --- .../converters/aibl_to_bids/utils/clinical.py | 112 ++++++--- .../aibl_to_bids/test_aibl_utils.py | 226 ++++++++++++++++++ 2 files changed, 310 insertions(+), 28 deletions(-) diff --git a/clinica/iotools/converters/aibl_to_bids/utils/clinical.py b/clinica/iotools/converters/aibl_to_bids/utils/clinical.py index 9411b23b1..61540925e 100644 --- a/clinica/iotools/converters/aibl_to_bids/utils/clinical.py +++ b/clinica/iotools/converters/aibl_to_bids/utils/clinical.py @@ -1,5 +1,5 @@ from pathlib import Path -from typing import List, Optional +from typing import List, Optional, Union import pandas as pd @@ -153,12 +153,12 @@ def create_sessions_tsv_file( clinical_data_dir: Path, clinical_specifications_folder: Path, ) -> None: - """Extract the information regarding the sessions and save them in a tsv file. + """Extract the information regarding a subject sessions and save them in a tsv file. Parameters ---------- input_path : Path - The path to the input folder. + The path to the input folder (BIDS directory). clinical_data_dir : Path The path to the directory to the clinical data files. @@ -211,7 +211,9 @@ def create_sessions_tsv_file( elif field in list(df.columns.values) and field == "CDGLOBAL": cd_global = df.loc[(df["RID"] == rid), field] - cd_global[cd_global == -4] = "n/a" + cd_global[ + cd_global == -4 + ] = "n/a" # todo (LATER) : do that mapping later, same for other fields elif field in list(df.columns.values) and field == "DXCURREN": dx_curren = df.loc[(df["RID"] == rid), field] @@ -229,7 +231,13 @@ def create_sessions_tsv_file( exam_dates = _clean_exam_dates( rid, exam_date.to_list(), visit_code.to_list(), clinical_data_dir ) - age = _compute_ages_at_each_exam(patient_date_of_birth.values[0], exam_dates) + + if not patient_date_of_birth.empty: + age = _compute_ages_at_each_exam( + patient_date_of_birth.values[0], exam_dates + ) + else: + age = "n/a" visit_code[visit_code == "bl"] = "M000" visit_code = visit_code.str.upper() @@ -244,6 +252,7 @@ def create_sessions_tsv_file( "examination_date": exam_dates, } ) + # todo (LATER) : pretty sure there exists a function that converts viscode to session sessions = sessions.assign( session_id=lambda df: df.months.apply(lambda x: f"ses-M{int(x):03d}") ) @@ -263,9 +272,29 @@ def create_sessions_tsv_file( def _clean_exam_dates( - rid: str, exam_dates: List[str], visit_codes: List[str], clinical_data_dir: Path -) -> List[str]: - """Clean the exam dates when necessary by trying to compute them from other sources.""" + rid: int, + exam_dates: List[Union[str, int]], + visit_codes: List[str], + clinical_data_dir: Path, +) -> List[Union[str, int]]: + """Clean the exam dates when necessary by trying to compute them from other sources. + + Parameters + ---------- + rid : int + Patient study/source id + exam_dates : List + Ex : ['10/12/2007', '05/29/2009', '10/25/2010', -4] + visit_codes : List + Ex : ['bl', 'm18', 'm36', 'm54'] + clinical_data_dir : Path + + Returns + ------- + List of cleaned exam dates + + """ + from clinica.utils.stream import cprint exam_dates_cleaned: List[str] = [] @@ -275,7 +304,10 @@ def _clean_exam_dates( rid, visit_code, clinical_data_dir ) or _compute_exam_date_from_baseline(visit_code, exam_dates, visit_codes) if not exam_date: - cprint(f"No EXAMDATE for subject %{rid}, at session {visit_code}") + cprint( + f"No EXAMDATE for subject %{rid}, at session {visit_code}", + lvl="debug", + ) exam_date = "-4" exam_dates_cleaned.append(exam_date) @@ -283,10 +315,11 @@ def _clean_exam_dates( def _find_exam_date_in_other_csv_files( - rid: str, visit_code: str, clinical_data_dir: Path + rid: int, visit_code: str, clinical_data_dir: Path ) -> Optional[str]: """Try to find an alternative exam date by searching in other CSV files.""" - for csv_file in _get_cvs_files(clinical_data_dir): + # todo (LATER) : refactor and test depending on _get_csv_files + for csv_file in _get_csv_files(clinical_data_dir): if "aibl_flutemeta" in csv_file: csv_data = pd.read_csv( csv_file, low_memory=False, usecols=list(range(0, 36)) @@ -299,9 +332,11 @@ def _find_exam_date_in_other_csv_files( return None -def _get_cvs_files(clinical_data_dir: Path) -> List[str]: +def _get_csv_files(clinical_data_dir: Path) -> List[str]: """Return a list of paths to CSV files in which an alternative exam date could be found.""" import glob + # todo (LATER) : would be better to use a function similar to load_clinical_csv from ADNI + # bc there it does not check for existence and can return anything return [ glob.glob(str(clinical_data_dir / pattern))[0] @@ -317,26 +352,41 @@ def _get_cvs_files(clinical_data_dir: Path) -> List[str]: def _compute_exam_date_from_baseline( - visit_code: str, exam_dates: List[str], visit_codes: List[str] + visit_code: str, exam_dates: List[Union[str, int]], visit_codes: List[str] ) -> Optional[str]: - """Try to find an alternative exam date by computing the number of months from the visit code.""" + """Try to find an alternative exam date by computing the number of months from the visit code. + + Parameters + ---------- + visit_code : Visit code of the current analysed session + exam_dates : List of all the exam_dates for one subject (ex : ['01/01/2000', -4]) + visit_codes : List of all the visit codes for one subject (ex : ['bl', 'm12'] + + Returns + ------- + Either None or the calculated date (str) + """ + # todo (LATER) : this function could use a refactor though, + # for the same output you could just use the visitcode and the date at baseline (no need to use the lists) + # assuming you know it. There it returns none if its baseline ? why not the baseline date ? + + import re from datetime import datetime from dateutil.relativedelta import relativedelta - baseline_index = visit_codes.index("bl") - if baseline_index > -1: + if visit_code != "bl": + try: + months = int(re.match(r"m(\d*)", visit_code).group(1)) + except AttributeError: + raise ValueError( + f"Unexpected visit code {visit_code}. Should be in format mX :" + "Ex: m0, m6, m12, m048..." + ) + baseline_index = visit_codes.index("bl") baseline_date = datetime.strptime(exam_dates[baseline_index], "%m/%d/%Y") - if visit_code != "bl": - try: - months = int(visit_code[1:]) - except TypeError: - raise ValueError( - f"Unexpected visit code {visit_code}. Should be in format MXXX." - "Ex: M000, M006, M048..." - ) - exam_date = baseline_date + relativedelta(months=+months) - return exam_date.strftime("%m/%d/%Y") + exam_date = baseline_date + relativedelta(months=+months) + return exam_date.strftime("%m/%d/%Y") return None @@ -365,8 +415,14 @@ def _compute_ages_at_each_exam( for exam_date in exam_dates: exam_date = datetime.strptime(exam_date, "%m/%d/%Y") - delta = exam_date - date_of_birth - ages.append(round(delta.days / 365.25, 1)) + delta = exam_date.year - date_of_birth.year + ages.append(delta) + + # todo (NOW) : + # rq : what is the use of being so precise ? we are comparing a year with a full date.. that's false anyway + # we could give ages in years (int, >=0) and just subtract the years + + # todo (LATER) : what happens if wrong format ? or exam < birth for some reason ? return ages diff --git a/test/unittests/iotools/converters/aibl_to_bids/test_aibl_utils.py b/test/unittests/iotools/converters/aibl_to_bids/test_aibl_utils.py index 99fcf20bc..d95b278c3 100644 --- a/test/unittests/iotools/converters/aibl_to_bids/test_aibl_utils.py +++ b/test/unittests/iotools/converters/aibl_to_bids/test_aibl_utils.py @@ -1,6 +1,73 @@ +from pathlib import Path + +import numpy as np +import pandas as pd import pytest +@pytest.mark.parametrize( + "visit, visit_list, date_list, expected", + [ + ("bl", ["bl", "m10"], ["01/01/2000", -4], None), + ("m10", ["bl", "m10"], ["01/01/2000", -4], "11/01/2000"), + ("m0006", ["bl"], ["01/01/2000"], "07/01/2000"), + ], +) +def test_compute_exam_date_from_baseline_success( + visit, date_list, visit_list, expected +): + from clinica.iotools.converters.aibl_to_bids.utils.clinical import ( + _compute_exam_date_from_baseline, + ) + + assert _compute_exam_date_from_baseline(visit, date_list, visit_list) == expected + + +def test_compute_exam_date_from_baseline_raiseValue(): + from clinica.iotools.converters.aibl_to_bids.utils.clinical import ( + _compute_exam_date_from_baseline, + ) + + with pytest.raises( + ValueError, + match=f"Unexpected visit code foo. Should be in format mX :" + "Ex: m0, m6, m12, m048...", + ): + _compute_exam_date_from_baseline("foo", [], []) + + +def test_find_exam_date_in_other_csv_files(): + pass + + +def test_clean_exam_dates(): + pass + + +def test_load_specifications_success(tmp_path): + from clinica.iotools.converters.aibl_to_bids.utils.clinical import ( + _load_specifications, + ) + + filename = "foo.tsv" + file = pd.DataFrame(columns=["foo"]) + file.to_csv(tmp_path / filename, sep="\t", index=False) + assert _load_specifications(tmp_path, filename).equals(file) + + +def test_load_specifications_error_tmp_path(tmp_path): + from clinica.iotools.converters.aibl_to_bids.utils.clinical import ( + _load_specifications, + ) + + with pytest.raises( + FileNotFoundError, + match=f"The specifications for bar.tsv were not found. " + f"The should be located in {tmp_path/'bar.tsv'}.", + ): + _load_specifications(tmp_path, "bar.tsv") + + def test_listdir_nohidden(tmp_path): from clinica.iotools.converters.aibl_to_bids.utils.bids import _listdir_nohidden @@ -40,3 +107,162 @@ def test_get_first_file_matching_pattern_error(tmp_path, pattern, msg): with pytest.raises(ValueError, match=msg): _get_first_file_matching_pattern(tmp_path, pattern) + + +@pytest.mark.parametrize( + "birth_date, exam_date, age", + [ + ( + "/2000", + ["01/02/2000", "02/01/2000", "01/01/2001", "07/06/2003"], + [0, 0, 1, 3], + ), + ("/2001", ["12/30/2003"], [2]), + ], +) +def test_compute_age(birth_date, exam_date, age): + from clinica.iotools.converters.aibl_to_bids.utils.clinical import ( + _compute_ages_at_each_exam, + ) + + assert _compute_ages_at_each_exam(birth_date, exam_date) == age + + +def build_sessions_spec(tmp_path: Path) -> Path: + spec = pd.DataFrame( + { + "BIDS CLINICA": [ + "examination_date", + "age", + "cdr_global", + "MMS", + "diagnosis", + ], + "AIBL": ["EXAMDATE", "PTDOB", "CDGLOBAL", "MMSCORE", "DXCURREN"], + "AIBL location": [ + "aibl_neurobat_*.csv", + "aibl_ptdemog_*.csv", + "aibl_cdr_*.csv", + "aibl_mmse_*.csv", + "aibl_pdxconv_*.csv", + ], + } + ) + spec.to_csv(tmp_path / "sessions.tsv", index=False, sep="\t") + return tmp_path + + +def build_bids_dir(tmp_path: Path) -> Path: + bids_dir = tmp_path / "BIDS" + bids_dir.mkdir() + (bids_dir / "sub-AIBL1" / "ses-M000").mkdir(parents=True) + (bids_dir / "sub-AIBL100" / "ses-M000").mkdir(parents=True) + (bids_dir / "sub-AIBL100" / "ses-M012").mkdir(parents=True) + return bids_dir + + +def build_clinical_data(tmp_path: Path) -> Path: + data_path = tmp_path / "clinical_data" + data_path.mkdir() + + neuro = pd.DataFrame( + { + "RID": [1, 2, 12, 100, 100], # %m/%d/%Y + "VISCODE": ["bl", "bl", "bl", "bl", "m12"], + "EXAMDATE": [ + "01/01/2001", + "01/01/2002", + "01/01/2012", + "01/01/2100", + "12/01/2100", + ], + } + ) + neuro.to_csv(data_path / "aibl_neurobat_230ct2024.csv", index=False) + + ptdemog = pd.DataFrame( + { + "RID": [1, 2, 12, 101], + "VISCODE": ["bl", "bl", "bl", "bl"], + "PTDOB": ["/1901", "/1902", "/1912", "/2001"], + } + ) + ptdemog.to_csv(data_path / "aibl_ptdemog_230ct2024.csv", index=False) + + cdr = pd.DataFrame( + { + "RID": [1, 2, 12, 100, 100], + "VISCODE": ["bl", "bl", "bl", "bl", "m12"], + "CDGLOBAL": [-4, 1, 0.5, 0, 0], + } + ) # rq:float + cdr.to_csv(data_path / "aibl_cdr_230ct2024.csv", index=False) + + mmse = pd.DataFrame( + { + "RID": [1, 2, 12, 100, 100], + "VISCODE": ["bl", "bl", "bl", "bl", "m12"], + "MMSCORE": [-4, 10, 10, 30, 29], + } + ) # rq:int + mmse.to_csv(data_path / "aibl_mmse_230ct2024.csv", index=False) + + pdx = pd.DataFrame( + { + "RID": [1, 2, 12, 100, 100], + "VISCODE": ["bl", "bl", "bl", "bl", "m12"], + "DXCURREN": [-4, 0, 0, 1, 3], + } + ) # rq : int + pdx.to_csv(data_path / "aibl_pdxconv_230ct2024.csv", index=False) + + return data_path + + +def test_create_sessions_tsv(tmp_path): + from clinica.iotools.converters.aibl_to_bids.utils.clinical import ( + create_sessions_tsv_file, + ) + + bids_path = build_bids_dir(tmp_path) + + create_sessions_tsv_file( + input_path=bids_path, + clinical_data_dir=build_clinical_data(tmp_path), + clinical_specifications_folder=build_sessions_spec(tmp_path), + ) + result_sub100_list = list(bids_path.rglob("*sub-AIBL100_sessions.tsv")) + result_sub1_list = list(bids_path.rglob("*sub-AIBL1_sessions.tsv")) + + assert len(result_sub100_list) == 1 + assert len(result_sub1_list) == 1 + + result_sub100 = pd.read_csv(result_sub100_list[0], sep="\t") + result_sub1 = pd.read_csv(result_sub1_list[0], sep="\t") + + expected_sub100 = pd.DataFrame( + { + "session_id": ["ses-M000", "ses-M012"], + "months": [0, 12], + "age": [np.nan, np.nan], + "MMS": [30, 29], + "cdr_global": [0.0, 0.0], + "diagnosis": ["CN", "AD"], + "examination_date": ["01/01/2100", "12/01/2100"], + } + ) + + expected_sub1 = pd.DataFrame( + { + "session_id": ["ses-M000"], + "months": [0], + "age": [100], + "MMS": [np.nan], + "cdr_global": [np.nan], + "diagnosis": [np.nan], + "examination_date": ["01/01/2001"], + } + ) + + assert expected_sub1.equals(result_sub1) + assert expected_sub100.equals(result_sub100)