From 7f47ed47fa3f2d633e5b3086c5ba1ddc04d99d98 Mon Sep 17 00:00:00 2001 From: mart-r Date: Thu, 25 Jul 2024 14:28:54 +0100 Subject: [PATCH 01/12] CU-869574kvp: Add pattern based release version identifying for Snomed preprocessing --- medcat/utils/preprocess_snomed.py | 24 ++++++++++++++++++++++-- 1 file changed, 22 insertions(+), 2 deletions(-) diff --git a/medcat/utils/preprocess_snomed.py b/medcat/utils/preprocess_snomed.py index 20409a481..8a5449dfd 100644 --- a/medcat/utils/preprocess_snomed.py +++ b/medcat/utils/preprocess_snomed.py @@ -74,10 +74,12 @@ class Snomed: uk_drug_ext (bool, optional): Specifies whether the version is a SNOMED UK drug extension. Defaults to False. au_ext (bool, optional): Specifies wether the version is a AU release. Defaults to False. """ + SNOMED_RELEASE_PATTERN = re.compile("^SnomedCT_([A-Za-z0-9]+)_([A-Za-z0-9]+)_(\d{8}T\d{6}Z$)") + NO_VERSION_DETECTED = 'N/A' def __init__(self, data_path, uk_ext=False, uk_drug_ext=False, au_ext: bool = False): self.data_path = data_path - self.release = data_path[-16:-8] + self.release = self._determine_release(data_path, strict=False) self.uk_ext = uk_ext self.uk_drug_ext = uk_drug_ext self.opcs_refset_id = "1126441000000105" @@ -95,6 +97,17 @@ def __init__(self, data_path, uk_ext=False, uk_drug_ext=False, au_ext: bool = Fa raise ValueError("Cannot both be a UK and and a AU version. " f"Got UK={uk_ext}, UK_Drug={uk_drug_ext}, AU={au_ext}") + @classmethod + def _determine_release(cls, folder_path: str, strict: bool = True, + _group_nr: int = 3, _keep_chars: int = 8) -> str: + folder_basename = os.path.basename(folder_path) + match = cls.SNOMED_RELEASE_PATTERN.match(folder_basename) + if match is None and strict: + raise UnkownSnomedReleaseException(f"No version found in '{folder_path}'") + elif match is None: + return cls.NO_VERSION_DETECTED + return match.group(_group_nr)[:_keep_chars] + def to_concept_df(self): """ Create a SNOMED CT concept DataFrame. @@ -368,7 +381,8 @@ def _check_path_and_release(self): for folder in os.listdir(self.data_path): if "SnomedCT" in folder: paths.append(os.path.join(self.data_path, folder)) - snomed_releases.append(folder[-16:-8]) + rel = self._determine_release(folder, strict=True) + snomed_releases.append(rel) if len(paths) == 0: raise FileNotFoundError('Incorrect path to SNOMED CT directory') return paths, snomed_releases @@ -447,3 +461,9 @@ def _map_snomed2refset(self): return icd10_df, opcs_df else: return mapping_df + + +class UnkownSnomedReleaseException(ValueError): + + def __init__(self, *args) -> None: + super().__init__(*args) From 4e662d91b7d5ff68701e016804dcbefc4c7f2111 Mon Sep 17 00:00:00 2001 From: mart-r Date: Thu, 25 Jul 2024 14:29:21 +0100 Subject: [PATCH 02/12] CU-869574kvp: Add tests for pattern-based snomed release identification --- tests/utils/test_preprocess_snomed.py | 70 +++++++++++++++++++++++++++ 1 file changed, 70 insertions(+) diff --git a/tests/utils/test_preprocess_snomed.py b/tests/utils/test_preprocess_snomed.py index 59a00f6fc..535618f0f 100644 --- a/tests/utils/test_preprocess_snomed.py +++ b/tests/utils/test_preprocess_snomed.py @@ -1,3 +1,4 @@ +import os from typing import Dict from medcat.utils import preprocess_snomed @@ -62,3 +63,72 @@ def test_old_gets_old_OPCS4_mapping_uk_ext(self): def test_new_gets_new_OCPS4_mapping_uk_ext(self): snomed = preprocess_snomed.Snomed(EXAMPLE_SNOMED_PATH_NEW, uk_ext=True) self.assertEqual(snomed.opcs_refset_id, "1382401000000109") + + +class TestSnomedModelGetter(unittest.TestCase): + WORKING_BASE_NAMES = [ + "SnomedCT_InternationalRF2_PRODUCTION_20240201T120000Z", + "SnomedCT_InternationalRF2_PRODUCTION_20240601T120000Z", + "SnomedCT_UKClinicalRF2_PRODUCTION_20240410T000001Z", + "SnomedCT_UKClinicalRefsetsRF2_PRODUCTION_20240410T000001Z", + "SnomedCT_UKDrugRF2_PRODUCTION_20240508T000001Z", + "SnomedCT_UKEditionRF2_PRODUCTION_20240410T000001Z", + "SnomedCT_UKEditionRF2_PRODUCTION_20240508T000001Z", + "SnomedCT_Release_AU1000036_20240630T120000Z", + ] + FAILING_BASE_NAMES = [ + "uk_sct2cl_38.2.0_20240605000001Z", + "uk_sct2cl_32.6.0_20211027000001Z", + ] + PATH = os.path.join("path", "to", "release") + + def _pathify(self, in_list: list) -> list: + return [os.path.join(self.PATH, folder) for folder in in_list] + + def assert_got_version(self, snomed: preprocess_snomed.Snomed, raw_name: str): + rel = snomed.release + self.assertIsInstance(rel, str) + self.assertIn(rel, raw_name) + self.assertEqual(rel, raw_name[-16:-8]) + + def assert_all_work(self, all_paths: list): + for path in all_paths: + with self.subTest(f"Rrelease name: {path}"): + snomed = preprocess_snomed.Snomed(path) + self.assert_got_version(snomed, path) + + def test_gets_model_form_basename(self): + self.assert_all_work(self.WORKING_BASE_NAMES) + + def test_gets_model_from_path(self): + full_paths = self._pathify(self.WORKING_BASE_NAMES) + self.assert_all_work(full_paths) + + def assert_raises(self, folder_path: str): + with self.assertRaises(preprocess_snomed.UnkownSnomedReleaseException): + preprocess_snomed.Snomed._determine_release(folder_path, strict=True) + + def assert_all_raise(self, folder_paths: list): + for folder_path in folder_paths: + with self.subTest(f"Folder: {folder_path}"): + self.assert_raises(folder_path) + + def test_fails_on_incorrect_names_strict(self): + self.assert_all_raise(self.FAILING_BASE_NAMES) + + def test_fails_on_incorrect_paths_strict(self): + full_paths = self._pathify(self.FAILING_BASE_NAMES) + self.assert_all_raise(full_paths) + + def assert_all_get_no_version(self, folder_paths: list): + for folder_path in folder_paths: + with self.subTest(f"Folder: {folder_path}"): + snomed = preprocess_snomed.Snomed(folder_path) + self.assertEqual(snomed.release, preprocess_snomed.Snomed.NO_VERSION_DETECTED) + + def test_gets_no_version_incorrect_names_nonstrict(self): + self.assert_all_get_no_version(self.FAILING_BASE_NAMES) + + def test_gets_no_version_incorrect_paths_nonstrict(self): + full_paths = self._pathify(self.FAILING_BASE_NAMES) + self.assert_all_get_no_version(full_paths) From 1fc3a2c7c7b2b90a77de89d2a7254b7e6b7f98c5 Mon Sep 17 00:00:00 2001 From: mart-r Date: Tue, 10 Sep 2024 10:12:25 +0100 Subject: [PATCH 03/12] CU-869574kvp: Update Snomed preprocessing: Separate extensions into an Enum. Do the release/paths check at init to allow for early failures in case of issues --- medcat/utils/preprocess_snomed.py | 289 +++++++++++++++++------------- 1 file changed, 161 insertions(+), 128 deletions(-) diff --git a/medcat/utils/preprocess_snomed.py b/medcat/utils/preprocess_snomed.py index 48a3c8b05..0edb69402 100644 --- a/medcat/utils/preprocess_snomed.py +++ b/medcat/utils/preprocess_snomed.py @@ -3,6 +3,9 @@ import re import hashlib import pandas as pd +from typing import Optional, Dict +from collections import defaultdict +from enum import Enum def parse_file(filename, first_row_header=True, columns=None): @@ -61,6 +64,105 @@ def get_direct_refset_mapping(in_dict: dict) -> dict: return ret_dict +class SnapshotData: + def __init__(self, + concept_snapshots: Dict[str, Optional[str]], + description_snapshots: Dict[str, Optional[str]], + relationship_snapshots: Dict[str, Optional[str]], + refset_snapshots: Dict[str, Optional[str]]): + self.concept_snapshots = concept_snapshots + self.description_snapshots = description_snapshots + self.relationship_snapshots = relationship_snapshots + self.refset_snapshots = refset_snapshots + + +class SupportedExtensions(Enum): + INTERNATIONAL = SnapshotData( + defaultdict(lambda: "sct2_Concept_Snapshot"), + defaultdict(lambda: "sct2_Description_Snapshot-en"), + defaultdict(lambda: "sct2_Relationship_Snapshot"), + defaultdict(lambda: "der2_iisssccRefset_ExtendedMapSnapshot") + ) + UK = SnapshotData( + { + "SnomedCT_InternationalRF2_PRODUCTION": "sct2_Concept_Snapshot", + "SnomedCT_UKClinicalRF2_PRODUCTION": "sct2_Concept_UKCLSnapshot", + "SnomedCT_UKEditionRF2_PRODUCTION": "sct2_Concept_UKEDSnapshot", + "SnomedCT_UKClinicalRefsetsRF2_PRODUCTION": None, # avoid + }, + { + "SnomedCT_InternationalRF2_PRODUCTION": "sct2_Description_Snapshot-en", + "SnomedCT_UKClinicalRF2_PRODUCTION": "sct2_Description_UKCLSnapshot-en", + "SnomedCT_UKEditionRF2_PRODUCTION": "sct2_Description_UKEDSnapshot-en", + "SnomedCT_UKClinicalRefsetsRF2_PRODUCTION": None, # avoid + }, + { + "SnomedCT_InternationalRF2_PRODUCTION": "sct2_Relationship_Snapshot", + "SnomedCT_UKClinicalRF2_PRODUCTION": "sct2_Relationship_UKCLSnapshot", + "SnomedCT_UKEditionRF2_PRODUCTION": "sct2_Relationship_UKEDSnapshot", + "SnomedCT_UKClinicalRefsetsRF2_PRODUCTION": None, # avoid + }, + { + "SnomedCT_InternationalRF2_PRODUCTION": None, # avoid + "SnomedCT_UKClinicalRF2_PRODUCTION": "der2_iisssciRefset_ExtendedMapUKCLSnapshot", + "SnomedCT_UKEditionRF2_PRODUCTION": "der2_iisssciRefset_ExtendedMapUKEDSnapshot", + "SnomedCT_UKClinicalRefsetsRF2_PRODUCTION": None, # avoid + } + ) + UK_DRUG = SnapshotData( + { + "SnomedCT_UKDrugRF2_PRODUCTION": "sct2_Concept_UKDGSnapshot", + "SnomedCT_UKEditionRF2_PRODUCTION": "sct2_Concept_UKEDSnapshot", + "SnomedCT_UKClinicalRefsetsRF2_PRODUCTION": None, # avoid + }, + { + "SnomedCT_UKDrugRF2_PRODUCTION": "sct2_Description_UKDGSnapshot-en", + "SnomedCT_UKEditionRF2_PRODUCTION": "sct2_Description_UKEDSnapshot-en", + "SnomedCT_UKClinicalRefsetsRF2_PRODUCTION": None, # avoid + }, + { + "SnomedCT_InternationalRF2_PRODUCTION": "sct2_Relationship_Snapshot", + "SnomedCT_UKDrugRF2_PRODUCTION": "sct2_Relationship_UKDGSnapshot", + "SnomedCT_UKEditionRF2_PRODUCTION": "sct2_Description_UKEDSnapshot-en", + "SnomedCT_UKClinicalRefsetsRF2_PRODUCTION": None, # avoid + }, + { + "SnomedCT_UKDrugRF2_PRODUCTION": "der2_iisssciRefset_ExtendedMapUKDGSnapshot", + "SnomedCT_UKEditionRF2_PRODUCTION": "der2_iisssciRefset_ExtendedMapUKEDSnapshot", + "SnomedCT_UKClinicalRefsetsRF2_PRODUCTION": None, # avoid + } + ) + AU = SnapshotData( + defaultdict(lambda: "sct2_Concept_Snapshot"), + defaultdict(lambda: "sct2_Description_Snapshot-en-AU"), + defaultdict(lambda: "sct2_Relationship_Snapshot"), + defaultdict(lambda: "der2_iisssccRefset_ExtendedMapSnapshot") + ) + + def _get_appropriate_name(self, part: Dict[str, Optional[str]], cur_path: str + ) -> Optional[str]: + try: + return part[cur_path] + except KeyError: + pass + for k, v in part.items(): + if k in cur_path: + return v + return None + + def get_concept_snapshot(self, cur_path: str) -> Optional[str]: + return self._get_appropriate_name(self.value.concept_snapshots, cur_path) + + def get_description_snapshot(self, cur_path: str) -> Optional[str]: + return self._get_appropriate_name(self.value.description_snapshots, cur_path) + + def get_relationship_snapshot(self, cur_path: str) -> Optional[str]: + return self._get_appropriate_name(self.value.relationship_snapshots, cur_path) + + def get_refset_terminology(self, cur_path: str) -> Optional[str]: + return self._get_appropriate_name(self.value.refset_snapshots, cur_path) + + class Snomed: """ Pre-process SNOMED CT release files. @@ -77,25 +179,39 @@ class Snomed: SNOMED_RELEASE_PATTERN = re.compile("^SnomedCT_([A-Za-z0-9]+)_([A-Za-z0-9]+)_(\d{8}T\d{6}Z$)") NO_VERSION_DETECTED = 'N/A' - def __init__(self, data_path, uk_ext=False, uk_drug_ext=False, au_ext: bool = False): + def __init__(self, data_path): self.data_path = data_path - self.release = self._determine_release(data_path, strict=False) - self.uk_ext = uk_ext - self.uk_drug_ext = uk_drug_ext + self.paths, self.snomed_releases, self.exts = self._check_path_and_release() + + def _set_extension(self, release: str, extension: SupportedExtensions) -> None: self.opcs_refset_id = "1126441000000105" - if ((self.uk_ext or self.uk_drug_ext) and + if (extension in (SupportedExtensions.UK, SupportedExtensions.UK_DRUG) and # using lexicographical comparison below # e.g "20240101" > "20231122" results in True # yet "20231121" > "20231122" results in False - len(self.release) == len("20231122") and self.release >= "20231122"): + len(release) == len("20231122") and release >= "20231122"): # NOTE for UK extensions starting from 20231122 the # OPCS4 refset ID seems to be different self.opcs_refset_id = '1382401000000109' - self.au_ext = au_ext + self._extension = extension + + @classmethod + def _determine_extension(cls, folder_path: str) -> SupportedExtensions: + uk_ext = "SnomedCT_UK" in folder_path + uk_drug_ext = uk_ext and "Drug" in folder_path + au_ext = "_AU" in folder_path # validate - if (self.uk_ext or self.uk_drug_ext) and self.au_ext: - raise ValueError("Cannot both be a UK and and a AU version. " - f"Got UK={uk_ext}, UK_Drug={uk_drug_ext}, AU={au_ext}") + if (uk_ext or uk_drug_ext) and au_ext: + raise UnkownSnomedReleaseException( + "Cannot both be a UK and and a AU version. " + f"Got UK={uk_ext}, UK_Drug={uk_drug_ext}, AU={au_ext}") + if uk_drug_ext: + return SupportedExtensions.UK_DRUG + elif uk_ext: + return SupportedExtensions.UK + elif au_ext: + return SupportedExtensions.AU + return SupportedExtensions.INTERNATIONAL @classmethod def _determine_release(cls, folder_path: str, strict: bool = True, @@ -119,37 +235,15 @@ def to_concept_df(self): Returns: pandas.DataFrame: SNOMED CT concept DataFrame. """ - paths, snomed_releases = self._check_path_and_release() df2merge = [] - for i, snomed_release in enumerate(snomed_releases): - contents_path = os.path.join(paths[i], "Snapshot", "Terminology") - concept_snapshot = "sct2_Concept_Snapshot" - description_snapshot = "sct2_Description_Snapshot-en" - if self.au_ext: - description_snapshot += "-AU" - if self.uk_ext: - if "SnomedCT_UKClinicalRF2_PRODUCTION" in paths[i]: - concept_snapshot = "sct2_Concept_UKCLSnapshot" - description_snapshot = "sct2_Description_UKCLSnapshot-en" - elif "SnomedCT_UKEditionRF2_PRODUCTION" in paths[i]: - concept_snapshot = "sct2_Concept_UKEDSnapshot" - description_snapshot = "sct2_Description_UKEDSnapshot-en" - elif "SnomedCT_UKClinicalRefsetsRF2_PRODUCTION" in paths[i]: - continue - else: - pass - if self.uk_drug_ext: - if "SnomedCT_UKDrugRF2_PRODUCTION" in paths[i]: - concept_snapshot = "sct2_Concept_UKDGSnapshot" - description_snapshot = "sct2_Description_UKDGSnapshot-en" - elif "SnomedCT_UKEditionRF2_PRODUCTION" in paths[i]: - concept_snapshot = "sct2_Concept_UKEDSnapshot" - description_snapshot = "sct2_Description_UKEDSnapshot-en" - elif "SnomedCT_UKClinicalRefsetsRF2_PRODUCTION" in paths[i]: - continue - else: - pass + for i, snomed_release in enumerate(self.snomed_releases): + self._set_extension(snomed_release, self.exts[i]) + contents_path = os.path.join(self.paths[i], "Snapshot", "Terminology") + concept_snapshot = self._extension.get_concept_snapshot(self.paths[i]) + description_snapshot = self._extension.get_description_snapshot(self.paths[i]) + if concept_snapshot is None: + continue for f in os.listdir(contents_path): m = re.search(f'{concept_snapshot}'+r'_(.*)_\d*.txt', f) @@ -215,37 +309,14 @@ def list_all_relationships(self): Returns: list: List of all SNOMED CT relationships. """ - paths, snomed_releases = self._check_path_and_release() all_rela = [] - for i, snomed_release in enumerate(snomed_releases): - contents_path = os.path.join(paths[i], "Snapshot", "Terminology") - concept_snapshot = "sct2_Concept_Snapshot" - relationship_snapshot = "sct2_Relationship_Snapshot" - if self.uk_ext: - if "SnomedCT_InternationalRF2_PRODUCTION" in paths[i]: - concept_snapshot = "sct2_Concept_Snapshot" - relationship_snapshot = "sct2_Relationship_Snapshot" - elif "SnomedCT_UKClinicalRF2_PRODUCTION" in paths[i]: - concept_snapshot = "sct2_Concept_UKCLSnapshot" - relationship_snapshot = "sct2_Relationship_UKCLSnapshot" - elif "SnomedCT_UKEditionRF2_PRODUCTION" in paths[i]: - concept_snapshot = "sct2_Concept_UKEDSnapshot" - relationship_snapshot = "sct2_Relationship_UKEDSnapshot" - elif "SnomedCT_UKClinicalRefsetsRF2_PRODUCTION" in paths[i]: - continue - else: - pass - if self.uk_drug_ext: - if "SnomedCT_UKDrugRF2_PRODUCTION" in paths[i]: - concept_snapshot = "sct2_Concept_UKDGSnapshot" - relationship_snapshot = "sct2_Relationship_UKDGSnapshot" - elif "SnomedCT_UKEditionRF2_PRODUCTION" in paths[i]: - concept_snapshot = "sct2_Concept_UKEDSnapshot" - relationship_snapshot = "sct2_Relationship_UKEDSnapshot" - elif "SnomedCT_UKClinicalRefsetsRF2_PRODUCTION" in paths[i]: - continue - else: - pass + for i, snomed_release in enumerate(self.snomed_releases): + self._set_extension(snomed_release, self.exts[i]) + contents_path = os.path.join(self.paths[i], "Snapshot", "Terminology") + concept_snapshot = self._extension.get_concept_snapshot(self.paths[i]) + relationship_snapshot = self._extension.get_relationship_snapshot(self.paths[i]) + if concept_snapshot is None: + continue for f in os.listdir(contents_path): m = re.search(f'{concept_snapshot}'+r'_(.*)_\d*.txt', f) @@ -272,37 +343,14 @@ def relationship2json(self, relationshipcode, output_jsonfile): Returns: file: JSON file of relationship mapping. """ - paths, snomed_releases = self._check_path_and_release() output_dict = {} - for i, snomed_release in enumerate(snomed_releases): - contents_path = os.path.join(paths[i], "Snapshot", "Terminology") - concept_snapshot = "sct2_Concept_Snapshot" - relationship_snapshot = "sct2_Relationship_Snapshot" - if self.uk_ext: - if "SnomedCT_InternationalRF2_PRODUCTION" in paths[i]: - concept_snapshot = "sct2_Concept_Snapshot" - relationship_snapshot = "sct2_Relationship_Snapshot" - elif "SnomedCT_UKClinicalRF2_PRODUCTION" in paths[i]: - concept_snapshot = "sct2_Concept_UKCLSnapshot" - relationship_snapshot = "sct2_Relationship_UKCLSnapshot" - elif "SnomedCT_UKEditionRF2_PRODUCTION" in paths[i]: - concept_snapshot = "sct2_Concept_UKEDSnapshot" - relationship_snapshot = "sct2_Relationship_UKEDSnapshot" - elif "SnomedCT_UKClinicalRefsetsRF2_PRODUCTION" in paths[i]: - continue - else: - pass - if self.uk_drug_ext: - if "SnomedCT_UKDrugRF2_PRODUCTION" in paths[i]: - concept_snapshot = "sct2_Concept_UKDGSnapshot" - relationship_snapshot = "sct2_Relationship_UKDGSnapshot" - elif "SnomedCT_UKEditionRF2_PRODUCTION" in paths[i]: - concept_snapshot = "sct2_Concept_UKEDSnapshot" - relationship_snapshot = "sct2_Relationship_UKEDSnapshot" - elif "SnomedCT_UKClinicalRefsetsRF2_PRODUCTION" in paths[i]: - continue - else: - pass + for i, snomed_release in enumerate(self.snomed_releases): + self._set_extension(snomed_release, self.exts[i]) + contents_path = os.path.join(self.paths[i], "Snapshot", "Terminology") + concept_snapshot = self._extension.get_concept_snapshot(self.paths[i]) + relationship_snapshot = self._extension.get_relationship_snapshot(self.paths[i]) + if concept_snapshot is None: + continue for f in os.listdir(contents_path): m = re.search(f'{concept_snapshot}'+r'_(.*)_\d*.txt', f) @@ -335,7 +383,7 @@ def map_snomed2icd10(self): dict: A dictionary containing the SNOMED CT to ICD-10 mappings including metadata. """ snomed2icd10df = self._map_snomed2refset() - if self.uk_ext is True: + if self._extension in (SupportedExtensions.UK, SupportedExtensions.UK_DRUG): return self._refset_df2dict(snomed2icd10df[0]) else: return self._refset_df2dict(snomed2icd10df) @@ -353,7 +401,7 @@ def map_snomed2opcs4(self) -> dict: Returns: dict: A dictionary containing the SNOMED CT to OPCS-4 mappings including metadata. """ - if self.uk_ext is not True: + if self._extension not in (SupportedExtensions.UK, SupportedExtensions.UK_DRUG): raise AttributeError( "OPCS-4 mapping does not exist in this edition") snomed2opcs4df = self._map_snomed2refset()[1] @@ -374,18 +422,21 @@ def _check_path_and_release(self): """ snomed_releases = [] paths = [] + exts = [] if "Snapshot" in os.listdir(self.data_path): paths.append(self.data_path) - snomed_releases.append(self.release) + snomed_releases.append(self._determine_release(self.data_path, strict=True)) + exts.append(self._determine_extension(self.data_path)) else: for folder in os.listdir(self.data_path): if "SnomedCT" in folder: paths.append(os.path.join(self.data_path, folder)) rel = self._determine_release(folder, strict=True) snomed_releases.append(rel) + exts.append(self._determine_extension(paths[-1])) if len(paths) == 0: raise FileNotFoundError('Incorrect path to SNOMED CT directory') - return paths, snomed_releases + return paths, snomed_releases, exts def _refset_df2dict(self, refset_df: pd.DataFrame) -> dict: """ @@ -417,31 +468,13 @@ def _map_snomed2refset(self): OR tuple: Tuple of dataframes containing SNOMED CT to refset mappings and metadata (ICD-10, OPCS4), if uk_ext is True. """ - paths, snomed_releases = self._check_path_and_release() dfs2merge = [] - for i, snomed_release in enumerate(snomed_releases): - refset_terminology = f'{paths[i]}/Snapshot/Refset/Map' - icd10_ref_set = 'der2_iisssccRefset_ExtendedMapSnapshot' - if self.uk_ext: - if "SnomedCT_InternationalRF2_PRODUCTION" in paths[i]: - continue - elif "SnomedCT_UKClinicalRF2_PRODUCTION" in paths[i]: - icd10_ref_set = "der2_iisssciRefset_ExtendedMapUKCLSnapshot" - elif "SnomedCT_UKEditionRF2_PRODUCTION" in paths[i]: - icd10_ref_set = "der2_iisssciRefset_ExtendedMapUKEDSnapshot" - elif "SnomedCT_UKClinicalRefsetsRF2_PRODUCTION" in paths[i]: - continue - else: - pass - if self.uk_drug_ext: - if "SnomedCT_UKDrugRF2_PRODUCTION" in paths[i]: - icd10_ref_set = "der2_iisssciRefset_ExtendedMapUKDGSnapshot" - elif "SnomedCT_UKEditionRF2_PRODUCTION" in paths[i]: - icd10_ref_set = "der2_iisssciRefset_ExtendedMapUKEDSnapshot" - elif "SnomedCT_UKClinicalRefsetsRF2_PRODUCTION" in paths[i]: - continue - else: - pass + for i, snomed_release in enumerate(self.snomed_releases): + self._set_extension(snomed_release, self.exts[i]) + refset_terminology = f'{self.paths[i]}/Snapshot/Refset/Map' + icd10_ref_set = self._extension.get_refset_terminology(self.paths[i]) + if icd10_ref_set is None: + continue for f in os.listdir(refset_terminology): m = re.search(f'{icd10_ref_set}'+r'_(.*)_\d*.txt', f) if m: @@ -454,7 +487,7 @@ def _map_snomed2refset(self): dfs2merge.append(icd_mappings) mapping_df = pd.concat(dfs2merge) del dfs2merge - if self.uk_ext or self.uk_drug_ext: + if self._extension in (SupportedExtensions.UK, SupportedExtensions.UK_DRUG): opcs_df = mapping_df[mapping_df['refsetId'] == self.opcs_refset_id] icd10_df = mapping_df[mapping_df['refsetId'] == '999002271000000101'] From cca92e26d81723864bf5bfcc285e6531de611eb7 Mon Sep 17 00:00:00 2001 From: mart-r Date: Tue, 10 Sep 2024 12:20:08 +0100 Subject: [PATCH 04/12] CU-869574kvp: Simplify mappings somewhat. Move common avoids to a common location. Fix UK Drug relationship name --- medcat/utils/preprocess_snomed.py | 49 +++++++++++++++---------------- 1 file changed, 23 insertions(+), 26 deletions(-) diff --git a/medcat/utils/preprocess_snomed.py b/medcat/utils/preprocess_snomed.py index 0edb69402..d5b99f509 100644 --- a/medcat/utils/preprocess_snomed.py +++ b/medcat/utils/preprocess_snomed.py @@ -3,7 +3,7 @@ import re import hashlib import pandas as pd -from typing import Optional, Dict +from typing import Optional, Dict, List from collections import defaultdict from enum import Enum @@ -69,11 +69,27 @@ def __init__(self, concept_snapshots: Dict[str, Optional[str]], description_snapshots: Dict[str, Optional[str]], relationship_snapshots: Dict[str, Optional[str]], - refset_snapshots: Dict[str, Optional[str]]): + refset_snapshots: Dict[str, Optional[str]], + avoids: List[str] = ["SnomedCT_UKClinicalRefsetsRF2_PRODUCTION"]): self.concept_snapshots = concept_snapshots self.description_snapshots = description_snapshots self.relationship_snapshots = relationship_snapshots self.refset_snapshots = refset_snapshots + self.avoids = avoids + + def get_appropriate_name(self, part: Dict[str, Optional[str]], cur_path: str + ) -> Optional[str]: + try: + return part[cur_path] + except KeyError: + pass + for avoid in self.avoids: + if avoid in cur_path: + return None + for k, v in part.items(): + if k in cur_path: + return v + return None class SupportedExtensions(Enum): @@ -88,48 +104,40 @@ class SupportedExtensions(Enum): "SnomedCT_InternationalRF2_PRODUCTION": "sct2_Concept_Snapshot", "SnomedCT_UKClinicalRF2_PRODUCTION": "sct2_Concept_UKCLSnapshot", "SnomedCT_UKEditionRF2_PRODUCTION": "sct2_Concept_UKEDSnapshot", - "SnomedCT_UKClinicalRefsetsRF2_PRODUCTION": None, # avoid }, { "SnomedCT_InternationalRF2_PRODUCTION": "sct2_Description_Snapshot-en", "SnomedCT_UKClinicalRF2_PRODUCTION": "sct2_Description_UKCLSnapshot-en", "SnomedCT_UKEditionRF2_PRODUCTION": "sct2_Description_UKEDSnapshot-en", - "SnomedCT_UKClinicalRefsetsRF2_PRODUCTION": None, # avoid }, { "SnomedCT_InternationalRF2_PRODUCTION": "sct2_Relationship_Snapshot", "SnomedCT_UKClinicalRF2_PRODUCTION": "sct2_Relationship_UKCLSnapshot", "SnomedCT_UKEditionRF2_PRODUCTION": "sct2_Relationship_UKEDSnapshot", - "SnomedCT_UKClinicalRefsetsRF2_PRODUCTION": None, # avoid }, { "SnomedCT_InternationalRF2_PRODUCTION": None, # avoid "SnomedCT_UKClinicalRF2_PRODUCTION": "der2_iisssciRefset_ExtendedMapUKCLSnapshot", "SnomedCT_UKEditionRF2_PRODUCTION": "der2_iisssciRefset_ExtendedMapUKEDSnapshot", - "SnomedCT_UKClinicalRefsetsRF2_PRODUCTION": None, # avoid } ) UK_DRUG = SnapshotData( { "SnomedCT_UKDrugRF2_PRODUCTION": "sct2_Concept_UKDGSnapshot", "SnomedCT_UKEditionRF2_PRODUCTION": "sct2_Concept_UKEDSnapshot", - "SnomedCT_UKClinicalRefsetsRF2_PRODUCTION": None, # avoid }, { "SnomedCT_UKDrugRF2_PRODUCTION": "sct2_Description_UKDGSnapshot-en", "SnomedCT_UKEditionRF2_PRODUCTION": "sct2_Description_UKEDSnapshot-en", - "SnomedCT_UKClinicalRefsetsRF2_PRODUCTION": None, # avoid }, { "SnomedCT_InternationalRF2_PRODUCTION": "sct2_Relationship_Snapshot", "SnomedCT_UKDrugRF2_PRODUCTION": "sct2_Relationship_UKDGSnapshot", - "SnomedCT_UKEditionRF2_PRODUCTION": "sct2_Description_UKEDSnapshot-en", - "SnomedCT_UKClinicalRefsetsRF2_PRODUCTION": None, # avoid + "SnomedCT_UKEditionRF2_PRODUCTION": "sct2_Relationship_UKEDSnapshot", }, { "SnomedCT_UKDrugRF2_PRODUCTION": "der2_iisssciRefset_ExtendedMapUKDGSnapshot", "SnomedCT_UKEditionRF2_PRODUCTION": "der2_iisssciRefset_ExtendedMapUKEDSnapshot", - "SnomedCT_UKClinicalRefsetsRF2_PRODUCTION": None, # avoid } ) AU = SnapshotData( @@ -139,28 +147,17 @@ class SupportedExtensions(Enum): defaultdict(lambda: "der2_iisssccRefset_ExtendedMapSnapshot") ) - def _get_appropriate_name(self, part: Dict[str, Optional[str]], cur_path: str - ) -> Optional[str]: - try: - return part[cur_path] - except KeyError: - pass - for k, v in part.items(): - if k in cur_path: - return v - return None - def get_concept_snapshot(self, cur_path: str) -> Optional[str]: - return self._get_appropriate_name(self.value.concept_snapshots, cur_path) + return self.value.get_appropriate_name(self.value.concept_snapshots, cur_path) def get_description_snapshot(self, cur_path: str) -> Optional[str]: - return self._get_appropriate_name(self.value.description_snapshots, cur_path) + return self.value.get_appropriate_name(self.value.description_snapshots, cur_path) def get_relationship_snapshot(self, cur_path: str) -> Optional[str]: - return self._get_appropriate_name(self.value.relationship_snapshots, cur_path) + return self.value.get_appropriate_name(self.value.relationship_snapshots, cur_path) def get_refset_terminology(self, cur_path: str) -> Optional[str]: - return self._get_appropriate_name(self.value.refset_snapshots, cur_path) + return self.value.get_appropriate_name(self.value.refset_snapshots, cur_path) class Snomed: From a7978911367b479627e31d8f564a861ba0d030db Mon Sep 17 00:00:00 2001 From: mart-r Date: Tue, 10 Sep 2024 12:28:08 +0100 Subject: [PATCH 05/12] CU-869574kvp: Simplify mappings somewhat more. Remove some clutter by separating common prefixes for release types and file names. --- medcat/utils/preprocess_snomed.py | 79 +++++++++++++++++-------------- 1 file changed, 44 insertions(+), 35 deletions(-) diff --git a/medcat/utils/preprocess_snomed.py b/medcat/utils/preprocess_snomed.py index d5b99f509..033f8a2e8 100644 --- a/medcat/utils/preprocess_snomed.py +++ b/medcat/utils/preprocess_snomed.py @@ -70,80 +70,88 @@ def __init__(self, description_snapshots: Dict[str, Optional[str]], relationship_snapshots: Dict[str, Optional[str]], refset_snapshots: Dict[str, Optional[str]], - avoids: List[str] = ["SnomedCT_UKClinicalRefsetsRF2_PRODUCTION"]): + avoids: List[str] = ["UKClinicalRefsetsRF2_PRODUCTION"], + common_key_prefix: str = "SnomedCT_", + common_val_prefix: str = "sct2_", # NOT for refset snapshot + ): self.concept_snapshots = concept_snapshots self.description_snapshots = description_snapshots self.relationship_snapshots = relationship_snapshots self.refset_snapshots = refset_snapshots self.avoids = avoids + self.common_key_prefix = common_key_prefix + self.common_val_prefix = common_val_prefix - def get_appropriate_name(self, part: Dict[str, Optional[str]], cur_path: str - ) -> Optional[str]: + def get_appropriate_name(self, part: Dict[str, Optional[str]], cur_path: str, + use_val_prefix: bool = True) -> Optional[str]: + val_prefix = self.common_val_prefix if use_val_prefix else '' try: - return part[cur_path] + return val_prefix + part[cur_path] except KeyError: pass - for avoid in self.avoids: + for raw_avoid in self.avoids: + avoid = self.common_key_prefix + raw_avoid if avoid in cur_path: return None - for k, v in part.items(): + for raw_k, v in part.items(): + k = self.common_key_prefix + raw_k if k in cur_path: - return v + return val_prefix + v return None class SupportedExtensions(Enum): INTERNATIONAL = SnapshotData( - defaultdict(lambda: "sct2_Concept_Snapshot"), - defaultdict(lambda: "sct2_Description_Snapshot-en"), - defaultdict(lambda: "sct2_Relationship_Snapshot"), + defaultdict(lambda: "Concept_Snapshot"), + defaultdict(lambda: "Description_Snapshot-en"), + defaultdict(lambda: "Relationship_Snapshot"), defaultdict(lambda: "der2_iisssccRefset_ExtendedMapSnapshot") ) UK = SnapshotData( { - "SnomedCT_InternationalRF2_PRODUCTION": "sct2_Concept_Snapshot", - "SnomedCT_UKClinicalRF2_PRODUCTION": "sct2_Concept_UKCLSnapshot", - "SnomedCT_UKEditionRF2_PRODUCTION": "sct2_Concept_UKEDSnapshot", + "InternationalRF2_PRODUCTION": "Concept_Snapshot", + "UKClinicalRF2_PRODUCTION": "Concept_UKCLSnapshot", + "UKEditionRF2_PRODUCTION": "Concept_UKEDSnapshot", }, { - "SnomedCT_InternationalRF2_PRODUCTION": "sct2_Description_Snapshot-en", - "SnomedCT_UKClinicalRF2_PRODUCTION": "sct2_Description_UKCLSnapshot-en", - "SnomedCT_UKEditionRF2_PRODUCTION": "sct2_Description_UKEDSnapshot-en", + "InternationalRF2_PRODUCTION": "Description_Snapshot-en", + "UKClinicalRF2_PRODUCTION": "Description_UKCLSnapshot-en", + "UKEditionRF2_PRODUCTION": "Description_UKEDSnapshot-en", }, { - "SnomedCT_InternationalRF2_PRODUCTION": "sct2_Relationship_Snapshot", - "SnomedCT_UKClinicalRF2_PRODUCTION": "sct2_Relationship_UKCLSnapshot", - "SnomedCT_UKEditionRF2_PRODUCTION": "sct2_Relationship_UKEDSnapshot", + "InternationalRF2_PRODUCTION": "Relationship_Snapshot", + "UKClinicalRF2_PRODUCTION": "Relationship_UKCLSnapshot", + "UKEditionRF2_PRODUCTION": "Relationship_UKEDSnapshot", }, { - "SnomedCT_InternationalRF2_PRODUCTION": None, # avoid - "SnomedCT_UKClinicalRF2_PRODUCTION": "der2_iisssciRefset_ExtendedMapUKCLSnapshot", - "SnomedCT_UKEditionRF2_PRODUCTION": "der2_iisssciRefset_ExtendedMapUKEDSnapshot", + "InternationalRF2_PRODUCTION": None, # avoid + "UKClinicalRF2_PRODUCTION": "der2_iisssciRefset_ExtendedMapUKCLSnapshot", + "UKEditionRF2_PRODUCTION": "der2_iisssciRefset_ExtendedMapUKEDSnapshot", } ) UK_DRUG = SnapshotData( { - "SnomedCT_UKDrugRF2_PRODUCTION": "sct2_Concept_UKDGSnapshot", - "SnomedCT_UKEditionRF2_PRODUCTION": "sct2_Concept_UKEDSnapshot", + "UKDrugRF2_PRODUCTION": "Concept_UKDGSnapshot", + "UKEditionRF2_PRODUCTION": "Concept_UKEDSnapshot", }, { - "SnomedCT_UKDrugRF2_PRODUCTION": "sct2_Description_UKDGSnapshot-en", - "SnomedCT_UKEditionRF2_PRODUCTION": "sct2_Description_UKEDSnapshot-en", + "UKDrugRF2_PRODUCTION": "Description_UKDGSnapshot-en", + "UKEditionRF2_PRODUCTION": "Description_UKEDSnapshot-en", }, { - "SnomedCT_InternationalRF2_PRODUCTION": "sct2_Relationship_Snapshot", - "SnomedCT_UKDrugRF2_PRODUCTION": "sct2_Relationship_UKDGSnapshot", - "SnomedCT_UKEditionRF2_PRODUCTION": "sct2_Relationship_UKEDSnapshot", + "InternationalRF2_PRODUCTION": "Relationship_Snapshot", + "UKDrugRF2_PRODUCTION": "Relationship_UKDGSnapshot", + "UKEditionRF2_PRODUCTION": "Relationship_UKEDSnapshot", }, { - "SnomedCT_UKDrugRF2_PRODUCTION": "der2_iisssciRefset_ExtendedMapUKDGSnapshot", - "SnomedCT_UKEditionRF2_PRODUCTION": "der2_iisssciRefset_ExtendedMapUKEDSnapshot", + "UKDrugRF2_PRODUCTION": "der2_iisssciRefset_ExtendedMapUKDGSnapshot", + "UKEditionRF2_PRODUCTION": "der2_iisssciRefset_ExtendedMapUKEDSnapshot", } ) AU = SnapshotData( - defaultdict(lambda: "sct2_Concept_Snapshot"), - defaultdict(lambda: "sct2_Description_Snapshot-en-AU"), - defaultdict(lambda: "sct2_Relationship_Snapshot"), + defaultdict(lambda: "Concept_Snapshot"), + defaultdict(lambda: "Description_Snapshot-en-AU"), + defaultdict(lambda: "Relationship_Snapshot"), defaultdict(lambda: "der2_iisssccRefset_ExtendedMapSnapshot") ) @@ -157,7 +165,8 @@ def get_relationship_snapshot(self, cur_path: str) -> Optional[str]: return self.value.get_appropriate_name(self.value.relationship_snapshots, cur_path) def get_refset_terminology(self, cur_path: str) -> Optional[str]: - return self.value.get_appropriate_name(self.value.refset_snapshots, cur_path) + return self.value.get_appropriate_name(self.value.refset_snapshots, cur_path, + use_val_prefix=False) class Snomed: From 2c6c39603164cbbcd17361f66ee0c5fa269b0285 Mon Sep 17 00:00:00 2001 From: mart-r Date: Tue, 10 Sep 2024 12:30:46 +0100 Subject: [PATCH 06/12] CU-869574kvp: Simplify mappings somewhat more, agai. Remove some clutter by separating common suffixes for release types. --- medcat/utils/preprocess_snomed.py | 48 ++++++++++++++++--------------- 1 file changed, 25 insertions(+), 23 deletions(-) diff --git a/medcat/utils/preprocess_snomed.py b/medcat/utils/preprocess_snomed.py index 033f8a2e8..4d56891b5 100644 --- a/medcat/utils/preprocess_snomed.py +++ b/medcat/utils/preprocess_snomed.py @@ -72,6 +72,7 @@ def __init__(self, refset_snapshots: Dict[str, Optional[str]], avoids: List[str] = ["UKClinicalRefsetsRF2_PRODUCTION"], common_key_prefix: str = "SnomedCT_", + common_key_suffix: str = "_PRODUCTION", common_val_prefix: str = "sct2_", # NOT for refset snapshot ): self.concept_snapshots = concept_snapshots @@ -80,6 +81,7 @@ def __init__(self, self.refset_snapshots = refset_snapshots self.avoids = avoids self.common_key_prefix = common_key_prefix + self.common_key_suffix = common_key_suffix self.common_val_prefix = common_val_prefix def get_appropriate_name(self, part: Dict[str, Optional[str]], cur_path: str, @@ -90,11 +92,11 @@ def get_appropriate_name(self, part: Dict[str, Optional[str]], cur_path: str, except KeyError: pass for raw_avoid in self.avoids: - avoid = self.common_key_prefix + raw_avoid + avoid = self.common_key_prefix + raw_avoid + self.common_key_suffix if avoid in cur_path: return None for raw_k, v in part.items(): - k = self.common_key_prefix + raw_k + k = self.common_key_prefix + raw_k + self.common_key_suffix if k in cur_path: return val_prefix + v return None @@ -109,43 +111,43 @@ class SupportedExtensions(Enum): ) UK = SnapshotData( { - "InternationalRF2_PRODUCTION": "Concept_Snapshot", - "UKClinicalRF2_PRODUCTION": "Concept_UKCLSnapshot", - "UKEditionRF2_PRODUCTION": "Concept_UKEDSnapshot", + "InternationalRF2": "Concept_Snapshot", + "UKClinicalRF2": "Concept_UKCLSnapshot", + "UKEditionRF2": "Concept_UKEDSnapshot", }, { - "InternationalRF2_PRODUCTION": "Description_Snapshot-en", - "UKClinicalRF2_PRODUCTION": "Description_UKCLSnapshot-en", - "UKEditionRF2_PRODUCTION": "Description_UKEDSnapshot-en", + "InternationalRF2": "Description_Snapshot-en", + "UKClinicalRF2": "Description_UKCLSnapshot-en", + "UKEditionRF2": "Description_UKEDSnapshot-en", }, { - "InternationalRF2_PRODUCTION": "Relationship_Snapshot", - "UKClinicalRF2_PRODUCTION": "Relationship_UKCLSnapshot", - "UKEditionRF2_PRODUCTION": "Relationship_UKEDSnapshot", + "InternationalRF2": "Relationship_Snapshot", + "UKClinicalRF2": "Relationship_UKCLSnapshot", + "UKEditionRF2": "Relationship_UKEDSnapshot", }, { - "InternationalRF2_PRODUCTION": None, # avoid - "UKClinicalRF2_PRODUCTION": "der2_iisssciRefset_ExtendedMapUKCLSnapshot", - "UKEditionRF2_PRODUCTION": "der2_iisssciRefset_ExtendedMapUKEDSnapshot", + "InternationalRF2": None, # avoid + "UKClinicalRF2": "der2_iisssciRefset_ExtendedMapUKCLSnapshot", + "UKEditionRF2": "der2_iisssciRefset_ExtendedMapUKEDSnapshot", } ) UK_DRUG = SnapshotData( { - "UKDrugRF2_PRODUCTION": "Concept_UKDGSnapshot", - "UKEditionRF2_PRODUCTION": "Concept_UKEDSnapshot", + "UKDrugRF2": "Concept_UKDGSnapshot", + "UKEditionRF2": "Concept_UKEDSnapshot", }, { - "UKDrugRF2_PRODUCTION": "Description_UKDGSnapshot-en", - "UKEditionRF2_PRODUCTION": "Description_UKEDSnapshot-en", + "UKDrugRF2": "Description_UKDGSnapshot-en", + "UKEditionRF2": "Description_UKEDSnapshot-en", }, { - "InternationalRF2_PRODUCTION": "Relationship_Snapshot", - "UKDrugRF2_PRODUCTION": "Relationship_UKDGSnapshot", - "UKEditionRF2_PRODUCTION": "Relationship_UKEDSnapshot", + "InternationalRF2": "Relationship_Snapshot", + "UKDrugRF2": "Relationship_UKDGSnapshot", + "UKEditionRF2": "Relationship_UKEDSnapshot", }, { - "UKDrugRF2_PRODUCTION": "der2_iisssciRefset_ExtendedMapUKDGSnapshot", - "UKEditionRF2_PRODUCTION": "der2_iisssciRefset_ExtendedMapUKEDSnapshot", + "UKDrugRF2": "der2_iisssciRefset_ExtendedMapUKDGSnapshot", + "UKEditionRF2": "der2_iisssciRefset_ExtendedMapUKEDSnapshot", } ) AU = SnapshotData( From b6b61962b4f9e77c9ace4faec30bef35b226ea0d Mon Sep 17 00:00:00 2001 From: mart-r Date: Tue, 10 Sep 2024 17:12:11 +0100 Subject: [PATCH 07/12] CU-869574kvp: Update preprocessing. New abstraction. Use supprted extensions which describe their file formats along with bundles which give some further insight and control. --- medcat/utils/preprocess_snomed.py | 340 ++++++++++++++++++------------ 1 file changed, 204 insertions(+), 136 deletions(-) diff --git a/medcat/utils/preprocess_snomed.py b/medcat/utils/preprocess_snomed.py index 4d56891b5..5978ed2ee 100644 --- a/medcat/utils/preprocess_snomed.py +++ b/medcat/utils/preprocess_snomed.py @@ -3,9 +3,9 @@ import re import hashlib import pandas as pd -from typing import Optional, Dict, List -from collections import defaultdict -from enum import Enum +from typing import Dict, List, Optional, Tuple +from dataclasses import dataclass +from enum import Enum, auto def parse_file(filename, first_row_header=True, columns=None): @@ -64,111 +64,163 @@ def get_direct_refset_mapping(in_dict: dict) -> dict: return ret_dict -class SnapshotData: - def __init__(self, - concept_snapshots: Dict[str, Optional[str]], - description_snapshots: Dict[str, Optional[str]], - relationship_snapshots: Dict[str, Optional[str]], - refset_snapshots: Dict[str, Optional[str]], - avoids: List[str] = ["UKClinicalRefsetsRF2_PRODUCTION"], - common_key_prefix: str = "SnomedCT_", - common_key_suffix: str = "_PRODUCTION", - common_val_prefix: str = "sct2_", # NOT for refset snapshot - ): - self.concept_snapshots = concept_snapshots - self.description_snapshots = description_snapshots - self.relationship_snapshots = relationship_snapshots - self.refset_snapshots = refset_snapshots - self.avoids = avoids - self.common_key_prefix = common_key_prefix - self.common_key_suffix = common_key_suffix - self.common_val_prefix = common_val_prefix - - def get_appropriate_name(self, part: Dict[str, Optional[str]], cur_path: str, - use_val_prefix: bool = True) -> Optional[str]: - val_prefix = self.common_val_prefix if use_val_prefix else '' - try: - return val_prefix + part[cur_path] - except KeyError: - pass - for raw_avoid in self.avoids: - avoid = self.common_key_prefix + raw_avoid + self.common_key_suffix - if avoid in cur_path: - return None - for raw_k, v in part.items(): - k = self.common_key_prefix + raw_k + self.common_key_suffix - if k in cur_path: - return val_prefix + v - return None -class SupportedExtensions(Enum): - INTERNATIONAL = SnapshotData( - defaultdict(lambda: "Concept_Snapshot"), - defaultdict(lambda: "Description_Snapshot-en"), - defaultdict(lambda: "Relationship_Snapshot"), - defaultdict(lambda: "der2_iisssccRefset_ExtendedMapSnapshot") +_IGNORE_TAG = '##IGNORE-THIS##' + + +class RefSetFileType(Enum): + concept = auto() + description = auto() + relationship = auto() + refset = auto() + + +@dataclass +class FileFormatDescriptor: + concept: str + description: str + relationship: str + refset: str + common_prefix: str = "sct2_" # for concept, description, and relationship (but not refset) + + @classmethod + def ignore_all(cls) -> 'FileFormatDescriptor': + return cls(concept=_IGNORE_TAG, description=_IGNORE_TAG, + relationship=_IGNORE_TAG, refset=_IGNORE_TAG) + + def get_file_per_type(self, folder: str, file_type: RefSetFileType) -> str: + raw = self._get_raw(file_type) + name = raw if file_type == RefSetFileType.refset else self.common_prefix + raw + return os.path.join(folder, name) + + def _get_raw(self, file_type: RefSetFileType) -> str: + return getattr(self, file_type.name) + + def get_concept(self, folder: str) -> str: + return self.get_file_per_type(folder, RefSetFileType.concept) + + def get_description(self, folder: str) -> str: + return self.get_file_per_type(folder, RefSetFileType.description) + + def get_relationship(self, folder: str) -> str: + return self.get_file_per_type(folder, RefSetFileType.relationship) + + def get_refset(self, folder: str) -> str: + return self.get_file_per_type(folder, RefSetFileType.refset) + + +@dataclass +class ExtensionDescription: + exp_name_in_folder: str + exp_files: FileFormatDescriptor + + +# pattern has: EXTENSION PRODUCTION RELEASE +SNOMED_FOLDER_NAME_PATTERN = re.compile("^SnomedCT_([A-Za-z0-9]+)_([A-Za-z0-9]+)_(\d{8}T\d{6}Z$)") +PER_FILE_TYPE_PATHS = { + RefSetFileType.concept: os.path.join("Snapshot", "Terminology"), + RefSetFileType.description: os.path.join("Snapshot", "Terminology"), + RefSetFileType.relationship: os.path.join("Snapshot", "Terminology"), + RefSetFileType.refset: os.path.join("Snapshot", "Refset", "Map"), +} + + + +class SupportedExtension(Enum): + INTERNATIONAL = ExtensionDescription( + exp_name_in_folder="InternationalRF2", + exp_files=FileFormatDescriptor( + concept="Concept_Snapshot", + description="Description_Snapshot-en", + relationship="Relationship_Snapshot", + # NOTE: the below will be ignored for UK_CLIN bundle + refset="der2_iisssccRefset_ExtendedMapSnapshot" + ), ) - UK = SnapshotData( - { - "InternationalRF2": "Concept_Snapshot", - "UKClinicalRF2": "Concept_UKCLSnapshot", - "UKEditionRF2": "Concept_UKEDSnapshot", - }, - { - "InternationalRF2": "Description_Snapshot-en", - "UKClinicalRF2": "Description_UKCLSnapshot-en", - "UKEditionRF2": "Description_UKEDSnapshot-en", - }, - { - "InternationalRF2": "Relationship_Snapshot", - "UKClinicalRF2": "Relationship_UKCLSnapshot", - "UKEditionRF2": "Relationship_UKEDSnapshot", - }, - { - "InternationalRF2": None, # avoid - "UKClinicalRF2": "der2_iisssciRefset_ExtendedMapUKCLSnapshot", - "UKEditionRF2": "der2_iisssciRefset_ExtendedMapUKEDSnapshot", - } + UK_CLINICAL = ExtensionDescription( + exp_name_in_folder="UKClinicalRF2", + exp_files=FileFormatDescriptor( + concept="Concept_UKCLSnapshot", + description="Description_UKCRSnapshot-en", + relationship="Relationship_UKCLSnapshot", + refset="der2_iisssciRefset_ExtendedMapUKCLSnapshot" + ), ) - UK_DRUG = SnapshotData( - { - "UKDrugRF2": "Concept_UKDGSnapshot", - "UKEditionRF2": "Concept_UKEDSnapshot", - }, - { - "UKDrugRF2": "Description_UKDGSnapshot-en", - "UKEditionRF2": "Description_UKEDSnapshot-en", - }, - { - "InternationalRF2": "Relationship_Snapshot", - "UKDrugRF2": "Relationship_UKDGSnapshot", - "UKEditionRF2": "Relationship_UKEDSnapshot", - }, - { - "UKDrugRF2": "der2_iisssciRefset_ExtendedMapUKDGSnapshot", - "UKEditionRF2": "der2_iisssciRefset_ExtendedMapUKEDSnapshot", - } + UK_CLINICAL_REFSET = ExtensionDescription( + exp_name_in_folder="UKClinicalRefsetsRF2", + exp_files=FileFormatDescriptor.ignore_all() ) - AU = SnapshotData( - defaultdict(lambda: "Concept_Snapshot"), - defaultdict(lambda: "Description_Snapshot-en-AU"), - defaultdict(lambda: "Relationship_Snapshot"), - defaultdict(lambda: "der2_iisssccRefset_ExtendedMapSnapshot") + UK_EDITION = ExtensionDescription( + exp_name_in_folder="UKEditionRF2", + exp_files=FileFormatDescriptor( + concept="Concept_UKEDSnapshot", + description="Description_UKEDSnapshot-en", + relationship="Relationship_UKEDSnapshot", + refset="der2_iisssciRefset_ExtendedMapUKEDSnapshot" + ), + ) + UK_DRUG = ExtensionDescription( + exp_name_in_folder="UKDrugRF2", + exp_files=FileFormatDescriptor( + concept="Concept_UKDGSnapshot", + description="Description_UKDGSnapshot-en", + relationship="Relationship_UKDGSnapshot", + refset="der2_iisssciRefset_ExtendedMapUKDGSnapshot", + ), + ) + AU = ExtensionDescription( + exp_name_in_folder="InternationalRF2", + exp_files=FileFormatDescriptor( + concept="Concept_Snapshot", + description="Description_Snapshot-en-AU", + relationship="Relationship_Snapshot", + refset=_IGNORE_TAG, + ), ) - def get_concept_snapshot(self, cur_path: str) -> Optional[str]: - return self.value.get_appropriate_name(self.value.concept_snapshots, cur_path) - - def get_description_snapshot(self, cur_path: str) -> Optional[str]: - return self.value.get_appropriate_name(self.value.description_snapshots, cur_path) - def get_relationship_snapshot(self, cur_path: str) -> Optional[str]: - return self.value.get_appropriate_name(self.value.relationship_snapshots, cur_path) +@dataclass +class BundleDescriptor: + extensions: List[SupportedExtension] + ignores: Dict[RefSetFileType, List[SupportedExtension]] = {} - def get_refset_terminology(self, cur_path: str) -> Optional[str]: - return self.value.get_appropriate_name(self.value.refset_snapshots, cur_path, - use_val_prefix=False) + def has_invalid(self, ext: SupportedExtension, file_types: Tuple[RefSetFileType]) -> bool: + for ft in file_types: + if ft not in self.ignores: + continue + exts2ignore = self.ignores[ft] + if ext in exts2ignore: + return True + return False + + +class SupportedBundles(Enum): + UK_CLIN = BundleDescriptor( + extensions=[SupportedExtension.INTERNATIONAL, SupportedExtension.UK_CLINICAL, + SupportedExtension.UK_CLINICAL_REFSET, SupportedExtension.UK_EDITION], + ignores={RefSetFileType.refset: [SupportedExtension.INTERNATIONAL]} + ) + UK_DRUG_EXT = BundleDescriptor( + extensions=[SupportedExtension.UK_DRUG, SupportedExtension.UK_EDITION], + ) + + +def match_partials_with_folders(exp_names: List[str], folder_names: List[str]) -> bool: + if len(exp_names) > len(folder_names): + return False + available_folders = folder_names.copy() + for exp_name in exp_names: + found_cur_name = False + for fi, folder in enumerate(available_folders): + if exp_name in folder: + found_cur_name = True + break + if found_cur_name: + available_folders.pop(fi) + else: + return False + return True class Snomed: @@ -184,16 +236,24 @@ class Snomed: uk_drug_ext (bool, optional): Specifies whether the version is a SNOMED UK drug extension. Defaults to False. au_ext (bool, optional): Specifies whether the version is a AU release. Defaults to False. """ - SNOMED_RELEASE_PATTERN = re.compile("^SnomedCT_([A-Za-z0-9]+)_([A-Za-z0-9]+)_(\d{8}T\d{6}Z$)") NO_VERSION_DETECTED = 'N/A' def __init__(self, data_path): self.data_path = data_path + self.bundle = self._determine_bundle() self.paths, self.snomed_releases, self.exts = self._check_path_and_release() - def _set_extension(self, release: str, extension: SupportedExtensions) -> None: + def _determine_bundle(self) -> Optional[SupportedBundles]: + for bundle in SupportedBundles: + folder_names = list(os.listdir(self.data_path)) + exp_names = [ext.value.exp_name_in_folder for ext in bundle.value.extensions] + if match_partials_with_folders(exp_names, folder_names): + return bundle + return None + + def _set_extension(self, release: str, extension: SupportedExtension) -> None: self.opcs_refset_id = "1126441000000105" - if (extension in (SupportedExtensions.UK, SupportedExtensions.UK_DRUG) and + if (extension in (SupportedExtension.UK_CLINICAL, SupportedExtension.UK_DRUG) and # using lexicographical comparison below # e.g "20240101" > "20231122" results in True # yet "20231121" > "20231122" results in False @@ -204,28 +264,28 @@ def _set_extension(self, release: str, extension: SupportedExtensions) -> None: self._extension = extension @classmethod - def _determine_extension(cls, folder_path: str) -> SupportedExtensions: - uk_ext = "SnomedCT_UK" in folder_path - uk_drug_ext = uk_ext and "Drug" in folder_path - au_ext = "_AU" in folder_path - # validate - if (uk_ext or uk_drug_ext) and au_ext: + def _determine_extension(cls, folder_path: str, _group_nr: int = 1) -> SupportedExtension: + folder_basename = os.path.basename(folder_path) + m = SNOMED_FOLDER_NAME_PATTERN.match(folder_basename) + if not m: raise UnkownSnomedReleaseException( - "Cannot both be a UK and and a AU version. " - f"Got UK={uk_ext}, UK_Drug={uk_drug_ext}, AU={au_ext}") - if uk_drug_ext: - return SupportedExtensions.UK_DRUG - elif uk_ext: - return SupportedExtensions.UK - elif au_ext: - return SupportedExtensions.AU - return SupportedExtensions.INTERNATIONAL + f"Unable to determine extension for path {repr(folder_path)}. " + f"Checking against pattern {SNOMED_FOLDER_NAME_PATTERN}") + ext_str = m.group(_group_nr) + for extension in SupportedExtension: + if extension.value.exp_name_in_folder == ext_str: + return extension + ext_names_folders = ",".join([f"{ext.name} ({ext.value.exp_name_in_folder})" + for ext in SupportedExtension]) + raise UnkownSnomedReleaseException( + f"Cannot Find the extension for {folder_path}. " + f"Tried the following extensions: {ext_names_folders}") @classmethod def _determine_release(cls, folder_path: str, strict: bool = True, _group_nr: int = 3, _keep_chars: int = 8) -> str: folder_basename = os.path.basename(folder_path) - match = cls.SNOMED_RELEASE_PATTERN.match(folder_basename) + match = SNOMED_FOLDER_NAME_PATTERN.match(folder_basename) if match is None and strict: raise UnkownSnomedReleaseException(f"No version found in '{folder_path}'") elif match is None: @@ -247,10 +307,12 @@ def to_concept_df(self): df2merge = [] for i, snomed_release in enumerate(self.snomed_releases): self._set_extension(snomed_release, self.exts[i]) - contents_path = os.path.join(self.paths[i], "Snapshot", "Terminology") - concept_snapshot = self._extension.get_concept_snapshot(self.paths[i]) - description_snapshot = self._extension.get_description_snapshot(self.paths[i]) - if concept_snapshot is None: + contents_path = os.path.join(self.paths[i], PER_FILE_TYPE_PATHS[RefSetFileType.concept]) + concept_snapshot = self._extension.value.exp_files.get_concept(self.paths[i]) + description_snapshot = self._extension.value.exp_files.get_description(self.paths[i]) + if concept_snapshot in (None, _IGNORE_TAG) or ( + self.bundle and self.bundle.value.has_invalid( + self._extension, [RefSetFileType.concept, RefSetFileType.description])): continue for f in os.listdir(contents_path): @@ -320,10 +382,12 @@ def list_all_relationships(self): all_rela = [] for i, snomed_release in enumerate(self.snomed_releases): self._set_extension(snomed_release, self.exts[i]) - contents_path = os.path.join(self.paths[i], "Snapshot", "Terminology") - concept_snapshot = self._extension.get_concept_snapshot(self.paths[i]) - relationship_snapshot = self._extension.get_relationship_snapshot(self.paths[i]) - if concept_snapshot is None: + contents_path = os.path.join(self.paths[i], PER_FILE_TYPE_PATHS[RefSetFileType.concept]) + concept_snapshot = self._extension.value.exp_files.get_concept(self.paths[i]) + relationship_snapshot = self._extension.value.exp_files.get_relationship(self.paths[i]) + if concept_snapshot in (None, _IGNORE_TAG) or ( + self.bundle and self.bundle.value.has_invalid( + self._extension, [RefSetFileType.concept, RefSetFileType.description])): continue for f in os.listdir(contents_path): @@ -354,10 +418,12 @@ def relationship2json(self, relationshipcode, output_jsonfile): output_dict = {} for i, snomed_release in enumerate(self.snomed_releases): self._set_extension(snomed_release, self.exts[i]) - contents_path = os.path.join(self.paths[i], "Snapshot", "Terminology") - concept_snapshot = self._extension.get_concept_snapshot(self.paths[i]) - relationship_snapshot = self._extension.get_relationship_snapshot(self.paths[i]) - if concept_snapshot is None: + contents_path = os.path.join(self.paths[i], PER_FILE_TYPE_PATHS[RefSetFileType.concept]) + concept_snapshot = self._extension.value.exp_files.get_concept(self.paths[i]) + relationship_snapshot = self._extension.value.exp_files.get_relationship(self.paths[i]) + if concept_snapshot in (None, _IGNORE_TAG) or ( + self.bundle and self.bundle.value.has_invalid( + self._extension, [RefSetFileType.concept, RefSetFileType.description])): continue for f in os.listdir(contents_path): @@ -391,7 +457,7 @@ def map_snomed2icd10(self): dict: A dictionary containing the SNOMED CT to ICD-10 mappings including metadata. """ snomed2icd10df = self._map_snomed2refset() - if self._extension in (SupportedExtensions.UK, SupportedExtensions.UK_DRUG): + if self._extension in (SupportedExtension.UK_CLINICAL, SupportedExtension.UK_DRUG): return self._refset_df2dict(snomed2icd10df[0]) else: return self._refset_df2dict(snomed2icd10df) @@ -409,7 +475,7 @@ def map_snomed2opcs4(self) -> dict: Returns: dict: A dictionary containing the SNOMED CT to OPCS-4 mappings including metadata. """ - if self._extension not in (SupportedExtensions.UK, SupportedExtensions.UK_DRUG): + if self._extension not in (SupportedExtension.UK_CLINICAL, SupportedExtension.UK_DRUG): raise AttributeError( "OPCS-4 mapping does not exist in this edition") snomed2opcs4df = self._map_snomed2refset()[1] @@ -479,9 +545,11 @@ def _map_snomed2refset(self): dfs2merge = [] for i, snomed_release in enumerate(self.snomed_releases): self._set_extension(snomed_release, self.exts[i]) - refset_terminology = f'{self.paths[i]}/Snapshot/Refset/Map' - icd10_ref_set = self._extension.get_refset_terminology(self.paths[i]) - if icd10_ref_set is None: + refset_terminology = os.path.join(self.paths[i], PER_FILE_TYPE_PATHS[RefSetFileType.refset]) + icd10_ref_set = self._extension.value.exp_files.get_refset(self.paths[i]) + if icd10_ref_set in (None, _IGNORE_TAG) or ( + self.bundle and self.bundle.value.has_invalid( + self._extension, [RefSetFileType.concept, RefSetFileType.description])): continue for f in os.listdir(refset_terminology): m = re.search(f'{icd10_ref_set}'+r'_(.*)_\d*.txt', f) @@ -495,7 +563,7 @@ def _map_snomed2refset(self): dfs2merge.append(icd_mappings) mapping_df = pd.concat(dfs2merge) del dfs2merge - if self._extension in (SupportedExtensions.UK, SupportedExtensions.UK_DRUG): + if self._extension in (SupportedExtension.UK_CLINICAL, SupportedExtension.UK_DRUG): opcs_df = mapping_df[mapping_df['refsetId'] == self.opcs_refset_id] icd10_df = mapping_df[mapping_df['refsetId'] == '999002271000000101'] From 6b572647b02e3e304914c1dfd60a64e2a32788ba Mon Sep 17 00:00:00 2001 From: mart-r Date: Tue, 10 Sep 2024 17:28:25 +0100 Subject: [PATCH 08/12] CU-869574kvp: Fix data class init --- medcat/utils/preprocess_snomed.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/medcat/utils/preprocess_snomed.py b/medcat/utils/preprocess_snomed.py index 5978ed2ee..2f74ee6d3 100644 --- a/medcat/utils/preprocess_snomed.py +++ b/medcat/utils/preprocess_snomed.py @@ -4,7 +4,7 @@ import hashlib import pandas as pd from typing import Dict, List, Optional, Tuple -from dataclasses import dataclass +from dataclasses import dataclass, field from enum import Enum, auto @@ -183,7 +183,7 @@ class SupportedExtension(Enum): @dataclass class BundleDescriptor: extensions: List[SupportedExtension] - ignores: Dict[RefSetFileType, List[SupportedExtension]] = {} + ignores: Dict[RefSetFileType, List[SupportedExtension]] = field(default_factory=dict) def has_invalid(self, ext: SupportedExtension, file_types: Tuple[RefSetFileType]) -> bool: for ft in file_types: From e317d7c1a535577aecc5464db2efb34f73f3afd7 Mon Sep 17 00:00:00 2001 From: mart-r Date: Tue, 10 Sep 2024 17:30:39 +0100 Subject: [PATCH 09/12] CU-869574kvp: Fix issue with file paths --- medcat/utils/preprocess_snomed.py | 35 +++++++++++++++---------------- 1 file changed, 17 insertions(+), 18 deletions(-) diff --git a/medcat/utils/preprocess_snomed.py b/medcat/utils/preprocess_snomed.py index 2f74ee6d3..ac3b5770c 100644 --- a/medcat/utils/preprocess_snomed.py +++ b/medcat/utils/preprocess_snomed.py @@ -89,25 +89,24 @@ def ignore_all(cls) -> 'FileFormatDescriptor': return cls(concept=_IGNORE_TAG, description=_IGNORE_TAG, relationship=_IGNORE_TAG, refset=_IGNORE_TAG) - def get_file_per_type(self, folder: str, file_type: RefSetFileType) -> str: + def get_file_per_type(self, file_type: RefSetFileType) -> str: raw = self._get_raw(file_type) - name = raw if file_type == RefSetFileType.refset else self.common_prefix + raw - return os.path.join(folder, name) + return raw if file_type == RefSetFileType.refset else self.common_prefix + raw def _get_raw(self, file_type: RefSetFileType) -> str: return getattr(self, file_type.name) - def get_concept(self, folder: str) -> str: - return self.get_file_per_type(folder, RefSetFileType.concept) + def get_concept(self) -> str: + return self.get_file_per_type(RefSetFileType.concept) - def get_description(self, folder: str) -> str: - return self.get_file_per_type(folder, RefSetFileType.description) + def get_description(self) -> str: + return self.get_file_per_type(RefSetFileType.description) - def get_relationship(self, folder: str) -> str: - return self.get_file_per_type(folder, RefSetFileType.relationship) + def get_relationship(self) -> str: + return self.get_file_per_type(RefSetFileType.relationship) - def get_refset(self, folder: str) -> str: - return self.get_file_per_type(folder, RefSetFileType.refset) + def get_refset(self) -> str: + return self.get_file_per_type(RefSetFileType.refset) @dataclass @@ -308,8 +307,8 @@ def to_concept_df(self): for i, snomed_release in enumerate(self.snomed_releases): self._set_extension(snomed_release, self.exts[i]) contents_path = os.path.join(self.paths[i], PER_FILE_TYPE_PATHS[RefSetFileType.concept]) - concept_snapshot = self._extension.value.exp_files.get_concept(self.paths[i]) - description_snapshot = self._extension.value.exp_files.get_description(self.paths[i]) + concept_snapshot = self._extension.value.exp_files.get_concept() + description_snapshot = self._extension.value.exp_files.get_description() if concept_snapshot in (None, _IGNORE_TAG) or ( self.bundle and self.bundle.value.has_invalid( self._extension, [RefSetFileType.concept, RefSetFileType.description])): @@ -383,8 +382,8 @@ def list_all_relationships(self): for i, snomed_release in enumerate(self.snomed_releases): self._set_extension(snomed_release, self.exts[i]) contents_path = os.path.join(self.paths[i], PER_FILE_TYPE_PATHS[RefSetFileType.concept]) - concept_snapshot = self._extension.value.exp_files.get_concept(self.paths[i]) - relationship_snapshot = self._extension.value.exp_files.get_relationship(self.paths[i]) + concept_snapshot = self._extension.value.exp_files.get_concept() + relationship_snapshot = self._extension.value.exp_files.get_relationship() if concept_snapshot in (None, _IGNORE_TAG) or ( self.bundle and self.bundle.value.has_invalid( self._extension, [RefSetFileType.concept, RefSetFileType.description])): @@ -419,8 +418,8 @@ def relationship2json(self, relationshipcode, output_jsonfile): for i, snomed_release in enumerate(self.snomed_releases): self._set_extension(snomed_release, self.exts[i]) contents_path = os.path.join(self.paths[i], PER_FILE_TYPE_PATHS[RefSetFileType.concept]) - concept_snapshot = self._extension.value.exp_files.get_concept(self.paths[i]) - relationship_snapshot = self._extension.value.exp_files.get_relationship(self.paths[i]) + concept_snapshot = self._extension.value.exp_files.get_concept() + relationship_snapshot = self._extension.value.exp_files.get_relationship() if concept_snapshot in (None, _IGNORE_TAG) or ( self.bundle and self.bundle.value.has_invalid( self._extension, [RefSetFileType.concept, RefSetFileType.description])): @@ -546,7 +545,7 @@ def _map_snomed2refset(self): for i, snomed_release in enumerate(self.snomed_releases): self._set_extension(snomed_release, self.exts[i]) refset_terminology = os.path.join(self.paths[i], PER_FILE_TYPE_PATHS[RefSetFileType.refset]) - icd10_ref_set = self._extension.value.exp_files.get_refset(self.paths[i]) + icd10_ref_set = self._extension.value.exp_files.get_refset() if icd10_ref_set in (None, _IGNORE_TAG) or ( self.bundle and self.bundle.value.has_invalid( self._extension, [RefSetFileType.concept, RefSetFileType.description])): From 35c78cd32b325e4579f7c10fb0370a3f6b4c1a9f Mon Sep 17 00:00:00 2001 From: mart-r Date: Tue, 10 Sep 2024 17:33:47 +0100 Subject: [PATCH 10/12] CU-869574kvp: Fix a UK Clinical description file path --- medcat/utils/preprocess_snomed.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/medcat/utils/preprocess_snomed.py b/medcat/utils/preprocess_snomed.py index ac3b5770c..cd6626feb 100644 --- a/medcat/utils/preprocess_snomed.py +++ b/medcat/utils/preprocess_snomed.py @@ -141,7 +141,7 @@ class SupportedExtension(Enum): exp_name_in_folder="UKClinicalRF2", exp_files=FileFormatDescriptor( concept="Concept_UKCLSnapshot", - description="Description_UKCRSnapshot-en", + description="Description_UKCLSnapshot-en", relationship="Relationship_UKCLSnapshot", refset="der2_iisssciRefset_ExtendedMapUKCLSnapshot" ), From b7ef2b66bf8404610de25eb6d4db15c4a5bfaf09 Mon Sep 17 00:00:00 2001 From: mart-r Date: Wed, 11 Sep 2024 10:14:25 +0100 Subject: [PATCH 11/12] CU-869574kvp: Add (optional) 2nd part of folder name to extension. For AU models, the folder name seems to be 'SnomedCT_Release_AU1000036_20240630T120000Z', so the 1st part is just 'Release' and the 2nd part is indicative of AU. Add usage of this where relevant. --- medcat/utils/preprocess_snomed.py | 50 +++++++++++++++++++++---------- 1 file changed, 35 insertions(+), 15 deletions(-) diff --git a/medcat/utils/preprocess_snomed.py b/medcat/utils/preprocess_snomed.py index cd6626feb..60bbb6994 100644 --- a/medcat/utils/preprocess_snomed.py +++ b/medcat/utils/preprocess_snomed.py @@ -113,6 +113,7 @@ def get_refset(self) -> str: class ExtensionDescription: exp_name_in_folder: str exp_files: FileFormatDescriptor + exp_2nd_part_in_folder: Optional[str] = None # pattern has: EXTENSION PRODUCTION RELEASE @@ -169,7 +170,8 @@ class SupportedExtension(Enum): ), ) AU = ExtensionDescription( - exp_name_in_folder="InternationalRF2", + exp_name_in_folder="Release", + exp_2nd_part_in_folder="AU1000036", exp_files=FileFormatDescriptor( concept="Concept_Snapshot", description="Description_Snapshot-en-AU", @@ -205,16 +207,24 @@ class SupportedBundles(Enum): ) -def match_partials_with_folders(exp_names: List[str], folder_names: List[str]) -> bool: +def match_partials_with_folders(exp_names: List[Tuple[str, Optional[str]]], + folder_names: List[str], + _group_nr1: int = 1, _group_nr2: int = 2) -> bool: if len(exp_names) > len(folder_names): return False - available_folders = folder_names.copy() - for exp_name in exp_names: + available_folders = [os.path.basename(f) for f in folder_names] + for exp_name, exp_name_p2 in exp_names: found_cur_name = False for fi, folder in enumerate(available_folders): - if exp_name in folder: - found_cur_name = True - break + m = SNOMED_FOLDER_NAME_PATTERN.match(folder) + if not m: + continue + if m.group(_group_nr1) != exp_name: + continue + if exp_name_p2 and m.group(_group_nr2) != exp_name_p2: + continue + found_cur_name = True + break if found_cur_name: available_folders.pop(fi) else: @@ -239,13 +249,17 @@ class Snomed: def __init__(self, data_path): self.data_path = data_path - self.bundle = self._determine_bundle() + self.bundle = self._determine_bundle(self.data_path) self.paths, self.snomed_releases, self.exts = self._check_path_and_release() - def _determine_bundle(self) -> Optional[SupportedBundles]: + @classmethod + def _determine_bundle(cls, data_path) -> Optional[SupportedBundles]: + if not os.path.exists(data_path) or not os.path.isdir(data_path): + return None for bundle in SupportedBundles: - folder_names = list(os.listdir(self.data_path)) - exp_names = [ext.value.exp_name_in_folder for ext in bundle.value.extensions] + folder_names = list(os.listdir(data_path)) + exp_names = [(ext.value.exp_name_in_folder, ext.value.exp_2nd_part_in_folder) + for ext in bundle.value.extensions] if match_partials_with_folders(exp_names, folder_names): return bundle return None @@ -263,17 +277,23 @@ def _set_extension(self, release: str, extension: SupportedExtension) -> None: self._extension = extension @classmethod - def _determine_extension(cls, folder_path: str, _group_nr: int = 1) -> SupportedExtension: + def _determine_extension(cls, folder_path: str, + _group_nr1: int = 1, _group_nr2: int = 2) -> SupportedExtension: folder_basename = os.path.basename(folder_path) m = SNOMED_FOLDER_NAME_PATTERN.match(folder_basename) if not m: raise UnkownSnomedReleaseException( f"Unable to determine extension for path {repr(folder_path)}. " f"Checking against pattern {SNOMED_FOLDER_NAME_PATTERN}") - ext_str = m.group(_group_nr) + ext_str = m.group(_group_nr1) + ext_str2 = m.group(_group_nr2) for extension in SupportedExtension: - if extension.value.exp_name_in_folder == ext_str: - return extension + if extension.value.exp_name_in_folder != ext_str: + continue + if (extension.value.exp_2nd_part_in_folder and + extension.value.exp_2nd_part_in_folder != ext_str2): + continue + return extension ext_names_folders = ",".join([f"{ext.name} ({ext.value.exp_name_in_folder})" for ext in SupportedExtension]) raise UnkownSnomedReleaseException( From 37b5abeccefa486d2a527658c6bd2a2339cd0319 Mon Sep 17 00:00:00 2001 From: mart-r Date: Wed, 11 Sep 2024 10:15:49 +0100 Subject: [PATCH 12/12] CU-869574kvp: Fix preprocessing tests. Add patch for files/folders where applicable. Change the paths of attributes where applicable. --- tests/utils/test_preprocess_snomed.py | 60 +++++++++++++++++++++------ 1 file changed, 48 insertions(+), 12 deletions(-) diff --git a/tests/utils/test_preprocess_snomed.py b/tests/utils/test_preprocess_snomed.py index 535618f0f..a133acdf8 100644 --- a/tests/utils/test_preprocess_snomed.py +++ b/tests/utils/test_preprocess_snomed.py @@ -1,8 +1,11 @@ import os from typing import Dict +import contextlib + from medcat.utils import preprocess_snomed import unittest +from unittest.mock import patch EXAMPLE_REFSET_DICT: Dict = { @@ -46,22 +49,51 @@ def test_example_no_codfe_fails(self): with self.assertRaises(KeyError): preprocess_snomed.get_direct_refset_mapping(EXAMPLE_REFSET_DICT_NO_CODE) + EXAMPLE_SNOMED_PATH_OLD = "SnomedCT_InternationalRF2_PRODUCTION_20220831T120000Z" EXAMPLE_SNOMED_PATH_NEW = "SnomedCT_UKClinicalRF2_PRODUCTION_20231122T000001Z" -class TestSnomedVersionsOPCS4(unittest.TestCase): +@contextlib.contextmanager +def patch_fake_files(path: str, subfiles: list = [], + subdirs: list = ["Snapshot"]): + def cur_listdir(file_path: str, *args, **kwargs) -> list: + if file_path == path: + return subfiles + subdirs + for sd in subdirs: + subdir = os.path.join(path, sd) + if subdir == path: + return [] + raise FileNotFoundError(path) - def test_old_gets_old_OPCS4_mapping_nonuk_ext(self): - snomed = preprocess_snomed.Snomed(EXAMPLE_SNOMED_PATH_OLD, uk_ext=False) - self.assertEqual(snomed.opcs_refset_id, "1126441000000105") + def cur_isfile(file_path: str, *args, **kwargs) -> bool: + print("CUR isfile", file_path) + return file_path == path or file_path in [os.path.join(path, subfiles)] + + def cur_isdir(file_path: str, *args, **kwrags) -> bool: + print("CUR isdir", file_path) + return file_path == path or file_path in [os.path.join(path, subdirs)] + + with patch("os.listdir", new=cur_listdir): + with patch("os.path.isfile", new=cur_isfile): + with patch("os.path.isdir", new=cur_isdir): + yield + + +class TestSnomedVersionsOPCS4(unittest.TestCase): - def test_old_gets_old_OPCS4_mapping_uk_ext(self): - snomed = preprocess_snomed.Snomed(EXAMPLE_SNOMED_PATH_OLD, uk_ext=True) + def test_old_gets_old_OPCS4_mapping(self): + with patch_fake_files(EXAMPLE_SNOMED_PATH_OLD): + snomed = preprocess_snomed.Snomed(EXAMPLE_SNOMED_PATH_OLD) + snomed._set_extension(snomed._determine_release(EXAMPLE_SNOMED_PATH_OLD), + snomed._determine_extension(EXAMPLE_SNOMED_PATH_OLD)) self.assertEqual(snomed.opcs_refset_id, "1126441000000105") - def test_new_gets_new_OCPS4_mapping_uk_ext(self): - snomed = preprocess_snomed.Snomed(EXAMPLE_SNOMED_PATH_NEW, uk_ext=True) + def test_new_gets_new_OCPS4_mapping(self): + with patch_fake_files(EXAMPLE_SNOMED_PATH_NEW): + snomed = preprocess_snomed.Snomed(EXAMPLE_SNOMED_PATH_NEW) + snomed._set_extension(snomed._determine_release(EXAMPLE_SNOMED_PATH_NEW), + snomed._determine_extension(EXAMPLE_SNOMED_PATH_NEW)) self.assertEqual(snomed.opcs_refset_id, "1382401000000109") @@ -86,7 +118,10 @@ def _pathify(self, in_list: list) -> list: return [os.path.join(self.PATH, folder) for folder in in_list] def assert_got_version(self, snomed: preprocess_snomed.Snomed, raw_name: str): - rel = snomed.release + rel_list = snomed.snomed_releases + self.assertIsInstance(rel_list, list) + self.assertEqual(len(rel_list), 1) + rel = rel_list[0] self.assertIsInstance(rel, str) self.assertIn(rel, raw_name) self.assertEqual(rel, raw_name[-16:-8]) @@ -94,7 +129,8 @@ def assert_got_version(self, snomed: preprocess_snomed.Snomed, raw_name: str): def assert_all_work(self, all_paths: list): for path in all_paths: with self.subTest(f"Rrelease name: {path}"): - snomed = preprocess_snomed.Snomed(path) + with patch_fake_files(path): + snomed = preprocess_snomed.Snomed(path) self.assert_got_version(snomed, path) def test_gets_model_form_basename(self): @@ -123,8 +159,8 @@ def test_fails_on_incorrect_paths_strict(self): def assert_all_get_no_version(self, folder_paths: list): for folder_path in folder_paths: with self.subTest(f"Folder: {folder_path}"): - snomed = preprocess_snomed.Snomed(folder_path) - self.assertEqual(snomed.release, preprocess_snomed.Snomed.NO_VERSION_DETECTED) + det_rel = preprocess_snomed.Snomed._determine_release(folder_path, strict=False) + self.assertEqual(det_rel, preprocess_snomed.Snomed.NO_VERSION_DETECTED) def test_gets_no_version_incorrect_names_nonstrict(self): self.assert_all_get_no_version(self.FAILING_BASE_NAMES)