From 09fc59c2840dec5856ca6414974d428bfdbf4e66 Mon Sep 17 00:00:00 2001
From: AliceJoubert <158147135+AliceJoubert@users.noreply.github.com>
Date: Fri, 25 Oct 2024 08:52:11 +0200
Subject: [PATCH] [TEST] AIBL-to-BIDS : add unit tests to functions related to
 creating sessions.tsv files (#1347)

* wip1

* wip2

* Add tests for load_spec

* Ref/add test compute_exam_date_from_bl

* End of first proposition

* Small fix
---
 .../converters/aibl_to_bids/utils/clinical.py | 112 ++++++---
 .../aibl_to_bids/test_aibl_utils.py           | 226 ++++++++++++++++++
 2 files changed, 310 insertions(+), 28 deletions(-)

diff --git a/clinica/iotools/converters/aibl_to_bids/utils/clinical.py b/clinica/iotools/converters/aibl_to_bids/utils/clinical.py
index 9411b23b1..61540925e 100644
--- a/clinica/iotools/converters/aibl_to_bids/utils/clinical.py
+++ b/clinica/iotools/converters/aibl_to_bids/utils/clinical.py
@@ -1,5 +1,5 @@
 from pathlib import Path
-from typing import List, Optional
+from typing import List, Optional, Union
 
 import pandas as pd
 
@@ -153,12 +153,12 @@ def create_sessions_tsv_file(
     clinical_data_dir: Path,
     clinical_specifications_folder: Path,
 ) -> None:
-    """Extract the information regarding the sessions and save them in a tsv file.
+    """Extract the information regarding a subject's sessions and save them in a tsv file.
 
     Parameters
     ----------
     input_path : Path
-        The path to the input folder.
+        The path to the input folder (BIDS directory).
 
     clinical_data_dir : Path
         The path to the directory to the clinical data files.
@@ -211,7 +211,9 @@ def create_sessions_tsv_file(
 
                 elif field in list(df.columns.values) and field == "CDGLOBAL":
                     cd_global = df.loc[(df["RID"] == rid), field]
-                    cd_global[cd_global == -4] = "n/a"
+                    cd_global[
+                        cd_global == -4
+                    ] = "n/a"  # todo (LATER) : do that mapping later, same for other fields
 
                 elif field in list(df.columns.values) and field == "DXCURREN":
                     dx_curren = df.loc[(df["RID"] == rid), field]
@@ -229,7 +231,13 @@ def create_sessions_tsv_file(
         exam_dates = _clean_exam_dates(
             rid, exam_date.to_list(), visit_code.to_list(), clinical_data_dir
         )
-        age = _compute_ages_at_each_exam(patient_date_of_birth.values[0], exam_dates)
+
+        if not patient_date_of_birth.empty:
+            age = _compute_ages_at_each_exam(
+                patient_date_of_birth.values[0], exam_dates
+            )
+        else:
+            age = "n/a"
 
         visit_code[visit_code == "bl"] = "M000"
         visit_code = visit_code.str.upper()
@@ -244,6 +252,7 @@ def create_sessions_tsv_file(
                 "examination_date": exam_dates,
             }
         )
+        # todo (LATER) : pretty sure there exists a function that converts viscode to session
         sessions = sessions.assign(
             session_id=lambda df: df.months.apply(lambda x: f"ses-M{int(x):03d}")
         )
@@ -263,9 +272,29 @@ def create_sessions_tsv_file(
 
 
 def _clean_exam_dates(
-    rid: str, exam_dates: List[str], visit_codes: List[str], clinical_data_dir: Path
-) -> List[str]:
-    """Clean the exam dates when necessary by trying to compute them from other sources."""
+    rid: int,
+    exam_dates: List[Union[str, int]],
+    visit_codes: List[str],
+    clinical_data_dir: Path,
+) -> List[Union[str, int]]:
+    """Clean the exam dates when necessary by trying to compute them from other sources.
+
+    Parameters
+    ----------
+    rid : int
+        Patient study/source id
+    exam_dates : List
+        Ex : ['10/12/2007', '05/29/2009', '10/25/2010', -4]
+    visit_codes : List
+        Ex : ['bl', 'm18', 'm36', 'm54']
+    clinical_data_dir : Path
+
+    Returns
+    -------
+    List of cleaned exam dates
+
+    """
+
     from clinica.utils.stream import cprint
 
     exam_dates_cleaned: List[str] = []
@@ -275,7 +304,10 @@ def _clean_exam_dates(
                 rid, visit_code, clinical_data_dir
             ) or _compute_exam_date_from_baseline(visit_code, exam_dates, visit_codes)
             if not exam_date:
-                cprint(f"No EXAMDATE for subject %{rid}, at session {visit_code}")
+                cprint(
+                    f"No EXAMDATE for subject %{rid}, at session {visit_code}",
+                    lvl="debug",
+                )
                 exam_date = "-4"
         exam_dates_cleaned.append(exam_date)
 
@@ -283,10 +315,11 @@ def _clean_exam_dates(
 
 
 def _find_exam_date_in_other_csv_files(
-    rid: str, visit_code: str, clinical_data_dir: Path
+    rid: int, visit_code: str, clinical_data_dir: Path
 ) -> Optional[str]:
     """Try to find an alternative exam date by searching in other CSV files."""
-    for csv_file in _get_cvs_files(clinical_data_dir):
+    # todo (LATER) : refactor and test depending on _get_csv_files
+    for csv_file in _get_csv_files(clinical_data_dir):
         if "aibl_flutemeta" in csv_file:
             csv_data = pd.read_csv(
                 csv_file, low_memory=False, usecols=list(range(0, 36))
@@ -299,9 +332,11 @@ def _find_exam_date_in_other_csv_files(
     return None
 
 
-def _get_cvs_files(clinical_data_dir: Path) -> List[str]:
+def _get_csv_files(clinical_data_dir: Path) -> List[str]:
     """Return a list of paths to CSV files in which an alternative exam date could be found."""
     import glob
+    # todo (LATER) : would be better to use a function similar to load_clinical_csv from ADNI
+    # because here it does not check for existence and can return anything
 
     return [
         glob.glob(str(clinical_data_dir / pattern))[0]
@@ -317,26 +352,41 @@ def _get_cvs_files(clinical_data_dir: Path) -> List[str]:
 
 
 def _compute_exam_date_from_baseline(
-    visit_code: str, exam_dates: List[str], visit_codes: List[str]
+    visit_code: str, exam_dates: List[Union[str, int]], visit_codes: List[str]
 ) -> Optional[str]:
-    """Try to find an alternative exam date by computing the number of months from the visit code."""
+    """Try to find an alternative exam date by computing the number of months from the visit code.
+
+    Parameters
+    ----------
+    visit_code : Visit code of the current analysed session
+    exam_dates : List of all the exam_dates for one subject (ex : ['01/01/2000', -4])
+    visit_codes : List of all the visit codes for one subject (ex : ['bl', 'm12'])
+
+    Returns
+    -------
+    Either None or the calculated date (str)
+    """
+    # todo (LATER) : this function could use a refactor: for the same output, the visit code and
+    # the baseline date (assuming it is known) would be enough, with no need to pass the full lists.
+    # Also, it currently returns None when the visit code is 'bl'; why not return the baseline date?
+
+    import re
     from datetime import datetime
 
     from dateutil.relativedelta import relativedelta
 
-    baseline_index = visit_codes.index("bl")
-    if baseline_index > -1:
+    if visit_code != "bl":
+        try:
+            months = int(re.match(r"m(\d*)", visit_code).group(1))
+        except AttributeError:
+            raise ValueError(
+                f"Unexpected visit code {visit_code}. Should be in format mX :"
+                "Ex: m0, m6, m12, m048..."
+            )
+        baseline_index = visit_codes.index("bl")
         baseline_date = datetime.strptime(exam_dates[baseline_index], "%m/%d/%Y")
-        if visit_code != "bl":
-            try:
-                months = int(visit_code[1:])
-            except TypeError:
-                raise ValueError(
-                    f"Unexpected visit code {visit_code}. Should be in format MXXX."
-                    "Ex: M000, M006, M048..."
-                )
-            exam_date = baseline_date + relativedelta(months=+months)
-            return exam_date.strftime("%m/%d/%Y")
+        exam_date = baseline_date + relativedelta(months=+months)
+        return exam_date.strftime("%m/%d/%Y")
     return None
 
 
@@ -365,8 +415,14 @@ def _compute_ages_at_each_exam(
 
     for exam_date in exam_dates:
         exam_date = datetime.strptime(exam_date, "%m/%d/%Y")
-        delta = exam_date - date_of_birth
-        ages.append(round(delta.days / 365.25, 1))
+        delta = exam_date.year - date_of_birth.year
+        ages.append(delta)
+
+    # todo (NOW) :
+    #  note: what is the use of being so precise? We are comparing a year with a full date, so the result is inexact anyway;
+    #  we could give ages in years (int, >=0) and just subtract the years
+
+    # todo (LATER) : what happens if wrong format ? or exam < birth for some reason ?
 
     return ages
 
diff --git a/test/unittests/iotools/converters/aibl_to_bids/test_aibl_utils.py b/test/unittests/iotools/converters/aibl_to_bids/test_aibl_utils.py
index 99fcf20bc..d95b278c3 100644
--- a/test/unittests/iotools/converters/aibl_to_bids/test_aibl_utils.py
+++ b/test/unittests/iotools/converters/aibl_to_bids/test_aibl_utils.py
@@ -1,6 +1,73 @@
+from pathlib import Path
+
+import numpy as np
+import pandas as pd
 import pytest
 
 
+@pytest.mark.parametrize(
+    "visit, visit_list, date_list, expected",
+    [
+        ("bl", ["bl", "m10"], ["01/01/2000", -4], None),
+        ("m10", ["bl", "m10"], ["01/01/2000", -4], "11/01/2000"),
+        ("m0006", ["bl"], ["01/01/2000"], "07/01/2000"),
+    ],
+)
+def test_compute_exam_date_from_baseline_success(
+    visit, date_list, visit_list, expected
+):
+    from clinica.iotools.converters.aibl_to_bids.utils.clinical import (
+        _compute_exam_date_from_baseline,
+    )
+
+    assert _compute_exam_date_from_baseline(visit, date_list, visit_list) == expected
+
+
+def test_compute_exam_date_from_baseline_raiseValue():
+    from clinica.iotools.converters.aibl_to_bids.utils.clinical import (
+        _compute_exam_date_from_baseline,
+    )
+
+    with pytest.raises(
+        ValueError,
+        match=f"Unexpected visit code foo. Should be in format mX :"
+        "Ex: m0, m6, m12, m048...",
+    ):
+        _compute_exam_date_from_baseline("foo", [], [])
+
+
+def test_find_exam_date_in_other_csv_files():
+    pass
+
+
+def test_clean_exam_dates():
+    pass
+
+
+def test_load_specifications_success(tmp_path):
+    from clinica.iotools.converters.aibl_to_bids.utils.clinical import (
+        _load_specifications,
+    )
+
+    filename = "foo.tsv"
+    file = pd.DataFrame(columns=["foo"])
+    file.to_csv(tmp_path / filename, sep="\t", index=False)
+    assert _load_specifications(tmp_path, filename).equals(file)
+
+
+def test_load_specifications_error_tmp_path(tmp_path):
+    from clinica.iotools.converters.aibl_to_bids.utils.clinical import (
+        _load_specifications,
+    )
+
+    with pytest.raises(
+        FileNotFoundError,
+        match=f"The specifications for bar.tsv were not found. "
+        f"The should be located in {tmp_path/'bar.tsv'}.",
+    ):
+        _load_specifications(tmp_path, "bar.tsv")
+
+
 def test_listdir_nohidden(tmp_path):
     from clinica.iotools.converters.aibl_to_bids.utils.bids import _listdir_nohidden
 
@@ -40,3 +107,162 @@ def test_get_first_file_matching_pattern_error(tmp_path, pattern, msg):
 
     with pytest.raises(ValueError, match=msg):
         _get_first_file_matching_pattern(tmp_path, pattern)
+
+
+@pytest.mark.parametrize(
+    "birth_date, exam_date, age",
+    [
+        (
+            "/2000",
+            ["01/02/2000", "02/01/2000", "01/01/2001", "07/06/2003"],
+            [0, 0, 1, 3],
+        ),
+        ("/2001", ["12/30/2003"], [2]),
+    ],
+)
+def test_compute_age(birth_date, exam_date, age):
+    from clinica.iotools.converters.aibl_to_bids.utils.clinical import (
+        _compute_ages_at_each_exam,
+    )
+
+    assert _compute_ages_at_each_exam(birth_date, exam_date) == age
+
+
+def build_sessions_spec(tmp_path: Path) -> Path:
+    spec = pd.DataFrame(
+        {
+            "BIDS CLINICA": [
+                "examination_date",
+                "age",
+                "cdr_global",
+                "MMS",
+                "diagnosis",
+            ],
+            "AIBL": ["EXAMDATE", "PTDOB", "CDGLOBAL", "MMSCORE", "DXCURREN"],
+            "AIBL location": [
+                "aibl_neurobat_*.csv",
+                "aibl_ptdemog_*.csv",
+                "aibl_cdr_*.csv",
+                "aibl_mmse_*.csv",
+                "aibl_pdxconv_*.csv",
+            ],
+        }
+    )
+    spec.to_csv(tmp_path / "sessions.tsv", index=False, sep="\t")
+    return tmp_path
+
+
+def build_bids_dir(tmp_path: Path) -> Path:
+    bids_dir = tmp_path / "BIDS"
+    bids_dir.mkdir()
+    (bids_dir / "sub-AIBL1" / "ses-M000").mkdir(parents=True)
+    (bids_dir / "sub-AIBL100" / "ses-M000").mkdir(parents=True)
+    (bids_dir / "sub-AIBL100" / "ses-M012").mkdir(parents=True)
+    return bids_dir
+
+
+def build_clinical_data(tmp_path: Path) -> Path:
+    data_path = tmp_path / "clinical_data"
+    data_path.mkdir()
+
+    neuro = pd.DataFrame(
+        {
+            "RID": [1, 2, 12, 100, 100],  # %m/%d/%Y
+            "VISCODE": ["bl", "bl", "bl", "bl", "m12"],
+            "EXAMDATE": [
+                "01/01/2001",
+                "01/01/2002",
+                "01/01/2012",
+                "01/01/2100",
+                "12/01/2100",
+            ],
+        }
+    )
+    neuro.to_csv(data_path / "aibl_neurobat_230ct2024.csv", index=False)
+
+    ptdemog = pd.DataFrame(
+        {
+            "RID": [1, 2, 12, 101],
+            "VISCODE": ["bl", "bl", "bl", "bl"],
+            "PTDOB": ["/1901", "/1902", "/1912", "/2001"],
+        }
+    )
+    ptdemog.to_csv(data_path / "aibl_ptdemog_230ct2024.csv", index=False)
+
+    cdr = pd.DataFrame(
+        {
+            "RID": [1, 2, 12, 100, 100],
+            "VISCODE": ["bl", "bl", "bl", "bl", "m12"],
+            "CDGLOBAL": [-4, 1, 0.5, 0, 0],
+        }
+    )  # rq:float
+    cdr.to_csv(data_path / "aibl_cdr_230ct2024.csv", index=False)
+
+    mmse = pd.DataFrame(
+        {
+            "RID": [1, 2, 12, 100, 100],
+            "VISCODE": ["bl", "bl", "bl", "bl", "m12"],
+            "MMSCORE": [-4, 10, 10, 30, 29],
+        }
+    )  # rq:int
+    mmse.to_csv(data_path / "aibl_mmse_230ct2024.csv", index=False)
+
+    pdx = pd.DataFrame(
+        {
+            "RID": [1, 2, 12, 100, 100],
+            "VISCODE": ["bl", "bl", "bl", "bl", "m12"],
+            "DXCURREN": [-4, 0, 0, 1, 3],
+        }
+    )  # rq : int
+    pdx.to_csv(data_path / "aibl_pdxconv_230ct2024.csv", index=False)
+
+    return data_path
+
+
+def test_create_sessions_tsv(tmp_path):
+    from clinica.iotools.converters.aibl_to_bids.utils.clinical import (
+        create_sessions_tsv_file,
+    )
+
+    bids_path = build_bids_dir(tmp_path)
+
+    create_sessions_tsv_file(
+        input_path=bids_path,
+        clinical_data_dir=build_clinical_data(tmp_path),
+        clinical_specifications_folder=build_sessions_spec(tmp_path),
+    )
+    result_sub100_list = list(bids_path.rglob("*sub-AIBL100_sessions.tsv"))
+    result_sub1_list = list(bids_path.rglob("*sub-AIBL1_sessions.tsv"))
+
+    assert len(result_sub100_list) == 1
+    assert len(result_sub1_list) == 1
+
+    result_sub100 = pd.read_csv(result_sub100_list[0], sep="\t")
+    result_sub1 = pd.read_csv(result_sub1_list[0], sep="\t")
+
+    expected_sub100 = pd.DataFrame(
+        {
+            "session_id": ["ses-M000", "ses-M012"],
+            "months": [0, 12],
+            "age": [np.nan, np.nan],
+            "MMS": [30, 29],
+            "cdr_global": [0.0, 0.0],
+            "diagnosis": ["CN", "AD"],
+            "examination_date": ["01/01/2100", "12/01/2100"],
+        }
+    )
+
+    expected_sub1 = pd.DataFrame(
+        {
+            "session_id": ["ses-M000"],
+            "months": [0],
+            "age": [100],
+            "MMS": [np.nan],
+            "cdr_global": [np.nan],
+            "diagnosis": [np.nan],
+            "examination_date": ["01/01/2001"],
+        }
+    )
+
+    assert expected_sub1.equals(result_sub1)
+    assert expected_sub100.equals(result_sub100)