From 4d4db14810a34b41f768a24649817f8ddea821bf Mon Sep 17 00:00:00 2001 From: vvcb Date: Sun, 12 Feb 2023 20:35:23 +0000 Subject: [PATCH] Bump version 0.3.0 Major update to validaion and feature_engineering - Allow ignoring columns and updating column properties in validate_dataframe - Helps address significant DQ issues with SNOMED codes - Update feature maps to use category: [list of codes] - Use generic function in emergency_care_features.py to create features - Multiple strategies for dealing with missing and unmapped codes - Update docs to reflect changes --- README.md | 4 +- avoidable_admissions/__init__.py | 2 +- avoidable_admissions/data/validate.py | 155 ++++- .../features/emergency_care_features.py | 90 +-- avoidable_admissions/features/feature_maps.py | 624 +++++++++++------- docs/features.md | 17 + docs/index.md | 4 +- docs/validation.md | 4 +- 8 files changed, 583 insertions(+), 317 deletions(-) diff --git a/README.md b/README.md index daba89c..f1e104a 100644 --- a/README.md +++ b/README.md @@ -21,10 +21,10 @@ pip install "avoidable_admissions @ git+https://github.com/LTHTR-DST/hdruk_avoid Additional installation options are described in the [documentation](https://lthtr-dst.github.io/hdruk_avoidable_admissions/). -Replace `` with the latest release version e.g. `v0.2.1-alpha`. +Replace `` with the latest release version e.g. `v0.3.0`. List of releases can be found here - . -Omit `` to install the latest code in the repo. +Omit `@` to install the latest code in the repo. 
## Quickstart diff --git a/avoidable_admissions/__init__.py b/avoidable_admissions/__init__.py index 3a3c3f4..a554b00 100644 --- a/avoidable_admissions/__init__.py +++ b/avoidable_admissions/__init__.py @@ -2,4 +2,4 @@ __all__ = ["data", "features", "models", "utils", "visualization"] -__version__ = "0.2.2" +__version__ = "0.3.0" diff --git a/avoidable_admissions/data/validate.py b/avoidable_admissions/data/validate.py index 2ccc384..bfc029e 100644 --- a/avoidable_admissions/data/validate.py +++ b/avoidable_admissions/data/validate.py @@ -1,6 +1,5 @@ import warnings -from contextlib import nullcontext -from datetime import date, datetime +from datetime import datetime from typing import Tuple import numpy as np @@ -23,7 +22,7 @@ class AdmittedCareEpisodeSchema(pa.SchemaModel): visit_id: Series[str] = pa.Field(nullable=False, unique=True, coerce=True) # Ensure this has been pseudonymised appropriately. - patient_id: Series[str] = pa.Field(nullable=False) + patient_id: Series[str] = pa.Field(nullable=False, coerce=True) gender: Series[str] = pa.Field( description=nhsdd.gender["url"], @@ -221,7 +220,7 @@ class EmergencyCareEpisodeSchema(pa.SchemaModel): visit_id: Series[str] = pa.Field(nullable=False, unique=True, coerce=True) - patient_id: Series[str] = pa.Field(nullable=False) + patient_id: Series[str] = pa.Field(nullable=False, coerce=True) gender: Series[str] = pa.Field( description=nhsdd.gender["url"], @@ -394,7 +393,7 @@ class Config: ) ], ), - "edtreat_[0-9]{2}": pa.Column( + "edtreat_[0-9]{2}$": pa.Column( description="https://www.datadictionary.nhs.uk/data_elements/emergency_care_procedure__snomed_ct_.html", dtype=np.int64, nullable=True, @@ -430,6 +429,26 @@ class Config: nullable=True, checks=[pa.Check.isin(set(feature_maps.ethnos.values()))], ), + "edarrival_dayofweek": pa.Column( + str, + nullable=True, + checks=[ + pa.Check.isin( + [ + "Monday", + "Tuesday", + "Wednesday", + "Thursday", + "Friday", + "Saturday", + "Sunday", + ] + ) + ], + ), + 
"edarrival_hourofday": pa.Column( + int, nullable=True, checks=[pa.Check.ge(0), pa.Check.le(23)] + ), "accommodationstatus_cat": pa.Column( str, nullable=True, @@ -539,23 +558,14 @@ def validate_dataframe( schema (pa.DataFrameSchema): Pandera schema to validate against kwargs: The following keyword arguments are currently supported start_date (datetime): Study start date (inclusive) - end_date (datetime): Study end date (inclusive) - + end_date (datetime): Study end date (excluded) + ignore_cols (list): Columns to ignore during validation checks. + update_cols (dict[str:dict]): Dictionary of column:properties to update schema. Returns: _Good_ and _Bad_ dataframes. See example below. - ## Customising Study Dates - - As a default, the study dates specified in the initial protocol will be used (`admidate>=datetime(2021,11,1) and admidate=start_date` and `admidate=datetime(2021,11,1) and admidate=start_date` and `admidate pd.Series: + # if value is in replacements, keep the value, else use `other` for all others + # then use replacements to assign the other categories + + data_cat = ( + data.where(data.isin(replacements), other).replace(replacements).astype(str) + ) + + return data_cat + + def _age(df: pd.DataFrame) -> pd.DataFrame: age_labels = feature_maps.age_labels age_bins = feature_maps.age_bins df["activage_cat"] = pd.cut(df.activage, bins=age_bins, labels=age_labels) + df["activage_cat"] = df["activage_cat"].astype(str) return df def _gender(df: pd.DataFrame) -> pd.DataFrame: - df["gender_cat"] = df.gender.astype(str).replace(feature_maps.gender) + df["gender_cat"] = replace_values(df.gender.astype(str), feature_maps.gender) return df def _ethnos(df: pd.DataFrame) -> pd.DataFrame: - df["ethnos_cat"] = df.ethnos.replace(feature_maps.ethnos) + df["ethnos_cat"] = replace_values(df.ethnos, feature_maps.ethnos) return df def _accommodationstatus(df: pd.DataFrame) -> pd.DataFrame: - df["accommodationstatus_cat"] = df.accommodationstatus.replace( - 
def _edarrivaldatetime(df: pd.DataFrame) -> pd.DataFrame:
    """Add arrival day-of-week and hour-of-day features.

    The ``edarrivaldatetime`` column is converted with ``pd.to_datetime`` and
    written back to the frame, after which two columns are derived from it:

    * ``edarrival_dayofweek`` - the ``%A`` (full weekday name) rendering
    * ``edarrival_hourofday`` - the hour component

    Args:
        df: Frame holding an ``edarrivaldatetime`` column. Modified in place.

    Returns:
        The same frame object, with the converted and derived columns.
    """
    # Attribute access is kept for the read so a missing column fails the
    # same way it always has.
    arrival = pd.to_datetime(df.edarrivaldatetime)

    df["edarrivaldatetime"] = arrival
    df["edarrival_dayofweek"] = arrival.dt.strftime("%A")
    df["edarrival_hourofday"] = arrival.dt.hour

    return df
def _edrefservice(df: pd.DataFrame) -> pd.DataFrame:
    """Derive ``edrefservice_cat`` from the ``edrefservice`` column.

    The referral-service codes are passed to ``replace_values`` together with
    the ``feature_maps.edrefservice`` lookup and ``"Other"`` as its third
    argument (presumably the fallback for codes absent from the lookup -
    confirm against ``replace_values``).

    Args:
        df: Frame holding an ``edrefservice`` column. Modified in place.

    Returns:
        The same frame object with ``edrefservice_cat`` added.
    """
    referral_codes = df.edrefservice
    service_lookup = feature_maps.edrefservice

    df["edrefservice_cat"] = replace_values(referral_codes, service_lookup, "Other")

    return df
df["eddiag_01_acsc"] = replace_values(df.eddiag_01, acsc_mapping) return df def _disstatus(df: pd.DataFrame) -> pd.DataFrame: - df["disstatus_cat"] = df.disstatus.replace(feature_maps.disstatus) + df["disstatus_cat"] = replace_values(df.disstatus, feature_maps.disstatus) return df @@ -161,6 +170,7 @@ def build_all(df: pd.DataFrame) -> pd.DataFrame: .pipe(_disstatus) .pipe(_edacuity) .pipe(_edarivalemode) + .pipe(_edarrivaldatetime) .pipe(_edattenddispatch) .pipe(_edattendsource) .pipe(_eddiag_seasonal) diff --git a/avoidable_admissions/features/feature_maps.py b/avoidable_admissions/features/feature_maps.py index c28229b..33a2fa9 100644 --- a/avoidable_admissions/features/feature_maps.py +++ b/avoidable_admissions/features/feature_maps.py @@ -4,6 +4,11 @@ import numpy as np import pandas as pd +from avoidable_admissions.data import nhsdd_snomed + +# to prevent accidentally removing nhsdd_snomed import which is used in eval +_ = nhsdd_snomed.__doc__ + age_labels = [ "18-19", "20 - 24", @@ -24,7 +29,6 @@ age_bins = age_bins = [17, 20, 25, 30, 35, 40, 45, 50, 55, 60, 65, 70, 75, 80, 85, 130] - gender = { "1": "Male", "2": "Female", @@ -55,243 +59,6 @@ "99": "Not known", } -accomodationstatus = { - 0: np.nan, - 1064831000000106: "Unknown", - 1064841000000102: "Unknown", - 1066881000000100: "Unknown", - 160734000: "Yes", - 224221006: "No", - 224225002: "No", - 224231004: "No", - 32911000: "No", - 394923006: "No", - 414418009: "No", -} - -edarrivalmode = { - 0: np.nan, - 1047991000000102: "Sheffield-ToDo", - 1048001000000106: "Sheffield-ToDo", - 1048021000000102: "Sheffield-ToDo", - 1048031000000100: "Sheffield-ToDo", - 1048041000000109: "Sheffield-ToDo", - 1048051000000107: "Sheffield-ToDo", - 1048061000000105: "Walk-In", - 1048071000000103: "Walk-In", - 1048081000000101: "Sheffield-ToDo", -} - -edattendsource = { - 0: np.nan, - 1052681000000105: "Community", - 1065391000000104: "Personal", - 1065401000000101: "Community", - 1065991000000100: "Community", - 
1066001000000101: "Community", - 1066011000000104: "Community", - 1066021000000105: "Emergency Services", - 1066031000000107: "Emergency Services", - 1066041000000103: "Emergency Services", - 1066051000000100: "Emergency Services", - 1066061000000102: "Emergency Services", - 1066431000000102: "Hospital", - 1066441000000106: "Hospital", - 1077191000000103: "Community", - 1077201000000101: "Community", - 1077211000000104: "Community", - 1077761000000105: "Community", - 1079521000000104: "Hospital", - 1082331000000106: "Primary Care", # OOH - 166941000000106: "Primary Care", - 183877003: "Private Referral", # from refset - 185363009: "Community", - 185366001: "Community", - 185368000: "Community", - 185369008: "Community", - 198261000000104: "Emergency Services", - 276491000: "Primary Care", - 315261000000101: "Personal", - 507291000000100: "Personal", - 835091000000109: "Hospital", - 835101000000101: "Hospital", - 877171000000103: "Community", - 879591000000102: "Primary Care", - 889801000000100: "Emergency Services", -} - -edacuity = { - 0: np.nan, - 1064891000000107: "1 - Immediate care level emergency care", - 1064901000000108: "3 - Urgent level emergency care", - 1064911000000105: "2 - Very urgent level emergency care", - 1077241000000103: "4 - Standard level emergency care", - 1077251000000100: "5 - Low acuity level emergency care", -} - -edinvest = { - 0: np.nan, - 1088291000000101: np.nan, - 167252002: "Non-urgent", - 27171005: "Non-urgent", - 53115007: "Non-urgent", - 67900009: "Non-urgent", -} - -edtreat = { - 0: np.nan, - 183964008: np.nan, - 266712008: "Non-urgent", - 413334001: "Non-urgent", - 81733005: "Non-urgent", -} - -eddiag_seasonal = { - 0: np.nan, - 12295008: "Chronic disease exacerbation", - 1325161000000102: "Respiratory infection", - 1325171000000109: "Respiratory infection", - 1325181000000106: "Respiratory infection", - 13645005: "Chronic disease exacerbation", - 195951007: "Chronic disease exacerbation", - 195967001: "Chronic disease 
exacerbation", - 205237003: "Respiratory infection", - 233604007: "Respiratory infection", - 278516003: "Respiratory infection", - 36971009: "Respiratory infection", - 50417007: "Respiratory infection", - 54150009: "Respiratory infection", - 6142004: "Respiratory infection", - 62994001: "Respiratory infection", - 80384002: "Respiratory infection", - 90176007: "Respiratory infection", -} - -eddiagqual = { - 415684004: "Suspected", - 410605003: "Confirmed", -} - -edattenddispatch = { - 0: np.nan, - 1066331000000109: "Ambulatory / Short Stay", - 1066341000000100: "Ambulatory / Short Stay", - 1066351000000102: "Ambulatory / Short Stay", - 1066361000000104: "Admitted", - 1066371000000106: "Admitted", - 1066381000000108: "Admitted", - 1066391000000105: "Admitted", - 1066401000000108: "Admitted", - 183919006: "Transfer", - 19712007: "Transfer", - 305398007: "Died", - 306689006: "Discharged", - 306691003: "Discharged", - 306694006: "Discharged", - 306705005: "Discharged", - 306706006: "Admitted", - 50861005: "Discharged", -} - -edrefservice = { - 0: np.nan, - 1064851000000104: "Medical", - 183516009: "Medical", - 183518005: "Medical", - 183519002: "Medical", - 183521007: "Medical", - 183522000: "Medical", - 183523005: "Medical", - 183524004: "Psychiatric", - 183542009: "Surgical", - 183543004: "Surgical", - 183544005: "Surgical", - 183545006: "Surgical", - 183546007: "Surgical", - 183548008: "ObGyn", - 183549000: "ObGyn", - 183561008: "Local Medical", - 202291000000107: "Psychiatric", - 247541000000106: "Community / OPD", - 276490004: "Local Medical", - 306107006: "Critical Care", - 306111000: "Medical", - 306114008: "Medical", - 306118006: "Medical", - 306123006: "Medical", - 306124000: "Medical", - 306125004: "Medical", - 306127007: "Medical", - 306129005: "Community / OPD", - 306136006: "Psychiatric", - 306138007: "Psychiatric", - 306140002: "Medical", - 306148009: "Medical", - 306152009: "Local Medical", - 306182003: "Surgical", - 306184002: "Surgical", - 306198005: 
"Surgical", - 306200004: "Surgical", - 306201000: "Surgical", - 306237005: "Medical", - 306285006: "Medical", - 306802002: "Medical", - 306934005: "Surgical", - 307374004: "Medical", - 307375003: "Community / OPD", - 307376002: "Community / OPD", - 307380007: "Community / OPD", - 327121000000104: "Surgical", - 353961000000104: "Community / OPD", - 380241000000107: "Psychiatric", - 382271000000102: "Critical Care", - 384711009: "Surgical", - 384712002: "Surgical", - 38670004: "Community / OPD", - 413127007: "Psychiatric", - 415263003: "Community / OPD", - 4266003: "Community / OPD", - 516511000000107: "Community / OPD", - 61801003: "Community / OPD", - 770411000000102: "Local Medical", - 770677000: "Critical Care", - 78429003: "Community / OPD", - 785621000000108: "Community / OPD", - 785681000000109: "Community / OPD", - 785701000000106: "Community / OPD", - 785721000000102: "Community / OPD", - 785761000000105: "Community / OPD", - 785781000000101: "Community / OPD", - 811391000000104: "Community / OPD", - 818861000000107: "Community / OPD", - 823961000000102: "Community / OPD", - 894171000000100: "Community / OPD", - 898791000000105: "Medical", - 975951000000109: "Critical Care", -} - -disstatus = { - 0: np.nan, - 1066301000000103: "Non-urgent", - 1066311000000101: "Non-urgent", - 1066321000000107: "Non-urgent", - 1077021000000100: "Non-urgent", - 1077031000000103: "Urgent", - 1077041000000107: "Urgent", - 1077051000000105: "Urgent", - 1077061000000108: "Urgent", - 1077071000000101: "Urgent", - 1077081000000104: "Urgent", - 1077091000000102: "Urgent", - 1077101000000105: "Urgent", - 1077781000000101: "Urgent", - 1324201000000109: "Urgent", - 182992009: "Non-urgent", - 63238001: "Died", - 75004002: "Died", -} - - admisorc = { "19": "Residence", "29": "Residence", @@ -389,6 +156,356 @@ } +def generate_map(name: str, feature_r: dict) -> dict: + + # First generate a reverse map as snomed_code:category + + feature = { + snomed_code: category + for category, 
snomed_list in feature_r.items() + for snomed_code in snomed_list + } + + # Get the members of the refset from nhsdd_snomed + # This file has been automatically generated from the Ontology Server + refset_members = eval(f"nhsdd_snomed.{name}['members']") + + # Create a set of all snomed codes in feature + feature_members = feature.keys() + + # Unmapped codes are the codes in the refset that are not in feature + # For each code in refset that is not in feature, set to 'unmapped' + + for i in refset_members: + if i not in feature_members: + feature[i] = "ERROR:Unmapped - In Refset" + + # For codes that appear in the mapping but not in the refset + # append '|Not-In-Refset' tp existing value + + for k, v in feature.items(): + if k not in refset_members: + feature[k] = "ERROR:Not In Refset|" + v + + feature[0] = "ERROR:Missing Data" + + # Add in a placeholder for codes that are neither in the featuremap nor in refset + # These are for unforeseen values that may appear in the source data + + feature[1] = "ERROR:Unmapped - Not In Refset" + + return feature + + +############################################################################## +# accommodationstatus +############################################################################## +accommodationstatus_r = { + "Unknown": [1064831000000106, 1064841000000102, 1066881000000100], + "Yes": [160734000], + "No": [224221006, 224225002, 224231004, 32911000, 394923006, 414418009], +} + +accommodationstatus = generate_map("accommodationstatus", accommodationstatus_r) + +############################################################################## +# edarrivalmode +############################################################################## +edarrivalmode_r = { + "Walk-In": [1048061000000105, 1048071000000103], +} + +edarrivalmode = generate_map("edarrivalmode", edarrivalmode_r) + +############################################################################## +# edattendsource 
+############################################################################## +edattendsource_r = { + "Community": [ + 1052681000000105, + 1065401000000101, + 1065991000000100, + 1066001000000101, + 1066011000000104, + 1077191000000103, + 1077201000000101, + 1077211000000104, + 1077761000000105, + 185363009, + 185366001, + 185368000, + 185369008, + 877171000000103, + ], + "Personal": [1065391000000104, 315261000000101, 507291000000100], + "Emergency Services": [ + 1066021000000105, + 1066031000000107, + 1066041000000103, + 1066051000000100, + 1066061000000102, + 198261000000104, + 889801000000100, + ], + "Hospital": [ + 1066431000000102, + 1066441000000106, + 1079521000000104, + 835091000000109, + 835101000000101, + ], + "Primary Care": [1082331000000106, 166941000000106, 276491000, 879591000000102], + "Private Referral": [183877003], +} + +edattendsource = generate_map("edattendsource", edattendsource_r) + +############################################################################## +# edacuity +############################################################################## +edacuity = { + 0: "ERROR:Missing Data", + 1064891000000107: "1 - Immediate care level emergency care", + 1064901000000108: "3 - Urgent level emergency care", + 1064911000000105: "2 - Very urgent level emergency care", + 1077241000000103: "4 - Standard level emergency care", + 1077251000000100: "5 - Low acuity level emergency care", +} + +############################################################################## +# edinvest +############################################################################## + +edinvest_r = { + "Non-urgent": [167252002, 27171005, 53115007, 67900009], + "Urgent": [ + 104686004, + 105000003, + 113091000, + 16254007, + 16310003, + 164729009, + 165320004, + 167036008, + 16830007, + 168338000, + 168537006, + 179929004, + 252167001, + 252316009, + 252375001, + 26604007, + 26958001, + 269874008, + 270982000, + 271232007, + 282096008, + 29303009, + 29893006, + 
30088009, + 3116009, + 35650009, + 363255004, + 392010000, + 397798009, + 401294003, + 40701008, + 416838001, + 43396009, + 55235003, + 56027003, + 60170009, + 61911006, + 62847008, + 68793005, + 70648006, + 74500006, + 77477000, + 86944008, + 89659001, + ], + "No-investigation": [1088291000000101], +} + +edinvest = generate_map("edinvest", edinvest_r) + +############################################################################## +# edtreat +############################################################################## +edtreat_r = { + "Non-urgent": [ + 183964008, + 266712008, + 413334001, + 81733005, + ] +} + +edtreat = generate_map("edtreat", edtreat_r) + +############################################################################## +# eddiag_seasonal +############################################################################## +eddiag_seasonal_r = { + "Chronic disease exacerbation": [12295008, 13645005, 195951007, 195967001], + "Respiratory infection": [ + 1325161000000102, + 1325171000000109, + 1325181000000106, + 205237003, + 233604007, + 278516003, + 36971009, + 50417007, + 54150009, + 6142004, + 62994001, + 80384002, + 90176007, + ], +} + +eddiag_seasonal = generate_map("eddiag", eddiag_seasonal_r) + +############################################################################## +# eddiagqual +############################################################################## +eddiagqual = { + 415684004: "Suspected", + 410605003: "Confirmed", +} + +############################################################################## +# edattenddispatch +############################################################################## +edattenddispatch_r = { + "Ambulatory / Short Stay": [1066331000000109, 1066341000000100, 1066351000000102], + "Admitted": [ + 1066361000000104, + 1066371000000106, + 1066381000000108, + 1066391000000105, + 1066401000000108, + 306706006, + ], + "Transfer": [183919006, 19712007], + "Died": [305398007], + "Discharged": [306689006, 
306691003, 306694006, 306705005, 50861005], +} + +edattenddispatch = generate_map("edattenddispatch", edattenddispatch_r) + +############################################################################## +# edrefservice +############################################################################## +edrefservice_r = { + "Medical": [ + 1064851000000104, + 183516009, + 183518005, + 183519002, + 183521007, + 183522000, + 183523005, + 306111000, + 306114008, + 306118006, + 306123006, + 306124000, + 306125004, + 306127007, + 306140002, + 306148009, + 306237005, + 306285006, + 306802002, + 307374004, + 898791000000105, + ], + "Psychiatric": [ + 183524004, + 202291000000107, + 306136006, + 306138007, + 380241000000107, + 413127007, + ], + "Surgical": [ + 183542009, + 183543004, + 183544005, + 183545006, + 183546007, + 306182003, + 306184002, + 306198005, + 306200004, + 306201000, + 306934005, + 327121000000104, + 384711009, + 384712002, + ], + "ObGyn": [183548008, 183549000], + "Local Medical": [183561008, 276490004, 306152009, 770411000000102], + "Community / OPD": [ + 247541000000106, + 306129005, + 307375003, + 307376002, + 307380007, + 353961000000104, + 38670004, + 415263003, + 4266003, + 516511000000107, + 61801003, + 78429003, + 785621000000108, + 785681000000109, + 785701000000106, + 785721000000102, + 785761000000105, + 785781000000101, + 811391000000104, + 818861000000107, + 823961000000102, + 894171000000100, + ], + "Critical Care": [306107006, 382271000000102, 770677000, 975951000000109], +} + +edrefservice = generate_map("edrefservice", edrefservice_r) + +############################################################################## +# disstatus +############################################################################## +disstatus_r = { + "Non-urgent": [ + 1066301000000103, + 1066311000000101, + 1066321000000107, + 1077021000000100, + 182992009, + ], + "Urgent": [ + 1077031000000103, + 1077041000000107, + 1077051000000105, + 1077061000000108, + 
1077071000000101, + 1077081000000104, + 1077091000000102, + 1077101000000105, + 1077781000000101, + 1324201000000109, + ], + "Died": [63238001, 75004002], +} + +disstatus = generate_map("disstatus", disstatus_r) + + @lru_cache(maxsize=1) def load_apc_acsc_mapping() -> Dict[str, str]: """Download ICD10 to Ambulatory Care Sensitive Conditions mapping from Sheffield Google Docs @@ -425,4 +542,35 @@ def load_ed_acsc_mapping() -> Dict[str, str]: acsc.columns = acsc.columns.str.lower().str.replace("[^a-z0-9]+", "_", regex=True) acsc_mapping = acsc.set_index("snomed_code").aec_clinical_conditions.to_dict() + # Set ERROR codes to allow validation to pass after feature engineering + # TODO: Tidy this up + + # Get the members of the refset from nhsdd_snomed + # This file has been automatically generated from the Ontology Server + refset_members = nhsdd_snomed.eddiag["members"] + + # Create a set of all snomed codes in feature + feature_members = acsc_mapping.keys() + + # Unmapped codes are the codes in the refset that are not in feature + # For each code in refset that is not in feature, set to 'unmapped' + + for i in refset_members: + if i not in feature_members: + acsc_mapping[i] = "ERROR:Unmapped - In Refset" + + # For codes that appear in the mapping but not in the refset + # append '|Not-In-Refset' tp existing value + + for k, v in acsc_mapping.items(): + if k not in refset_members: + acsc_mapping[k] = "ERROR:Mapped - Not In Refset|" + v + + acsc_mapping[0] = "ERROR:Missing Data" + + # Add in a placeholder for codes that are neither in the featuremap nor in refset + # These are for unforeseen values that may appear in the source data + + acsc_mapping[1] = "ERROR:Unmapped - Not In Refset" + return acsc_mapping diff --git a/docs/features.md b/docs/features.md index a1b4e22..1dce737 100644 --- a/docs/features.md +++ b/docs/features.md @@ -10,6 +10,23 @@ The functions described below generate these features automatically in preparati Ensure that data has undergone 
preprocessing and has passed the first validation step as described in the [analysis pipeline][data-analysis-pipeline] before using these functions. +## Error codes + +A pragmatic approach has been used in dealing with missing data, unmapped codes and codes not in refsets. +Please read the section on [missing values][missing-values] in the [Data Validation][data-validation] chapter as well. + +During feature engineering, especially in the Emergency Care dataset that has several columns with SNOMED codes, the following rules are applied to assign the appropriate categories. + +| Source Data | Mapping | Refset | Category | Who fixes | +|:-------------:|:---------:|:---------:|---------------------------------------|-----------------------| +| Yes | Yes | Yes | Assign to `Category` | | +| Yes | No | Yes | `ERROR:Unmapped - In Refset` | Lead site to advise | +| Yes | Yes | No | `ERROR:Not In Refset|{Category}` | Lead site to fix | +| No | x | x | `ERROR:Missing Data` | Local site if feasible| +| Yes | No | No | `ERROR:Unmapped - Not In Refset` | Local site to fix | + +Please see the source code for [`feature_maps.py`](https://github.com/LTHTR-DST/hdruk_avoidable_admissions/blob/dev/avoidable_admissions/features/feature_maps.py) and raise a [GitHub issue](https://github.com/LTHTR-DST/hdruk_avoidable_admissions/issues) for any questions or bugs. + ::: avoidable_admissions.features.build_features handler: python options: diff --git a/docs/index.md b/docs/index.md index 7918499..e444ea4 100644 --- a/docs/index.md +++ b/docs/index.md @@ -33,8 +33,8 @@ To install with optional dependecies for _contributing to development and docume pip install "avoidable_admissions[dev] @ git+https://github.com/LTHTR-DST/hdruk_avoidable_admissions.git@" ``` -Replace `` with the latest release version e.g. `v.0.2.1-alpha`. List of releases can be found here - . +Replace `` with the latest release version e.g. `v0.3.0`. List of releases can be found here - . 
-Omit `` to install the latest code in the repo. +Omit `@` to install the latest code in the repo. See for a complete example. diff --git a/docs/validation.md b/docs/validation.md index 5d531e4..c402836 100644 --- a/docs/validation.md +++ b/docs/validation.md @@ -37,7 +37,7 @@ df['admidate'] = df['admidate'].dt.date df['accommodationstatus'] = df['accommodationstatus'].fillna(0) ``` -## Mising Values +## Missing Values _To be finalised after further discussion and testing._ @@ -48,4 +48,4 @@ For the purposes of this project, several pragmatic choices have been made regar 1. Where a definition exists for how missing values should be coded, for instance in the NHS data model, use this. 2. For SNOMED codes, which are always integers, use 0 (zero) to replace all missing values. This avoids validation errors caused by `NaN` values that are treated as `float` dtype by Pandas. 3. For strings, use `"-"` (without the quotes) for missing values. -4. During feature engineering, if a code has not been assigned a category in the specification, the value `"Other"` is assigned. +4. During [feature engineering][feature-engineering], custom error values are assigned to codes that are missing from either the refsets or mapping.