From 98911cd989ce092dab3e4b4df483f38966875647 Mon Sep 17 00:00:00 2001
From: ccarenzoIC <c.carenzo@imperial.ac.uk>
Date: Wed, 8 Mar 2023 09:10:35 +0000
Subject: [PATCH] issue #36

---
 avoidable_admissions/data/validate.py         |  9 +++
 .../features/emergency_care_features.py       | 11 ++++
 avoidable_admissions/features/feature_maps.py | 59 ++++++++++++++++++-
 3 files changed, 77 insertions(+), 2 deletions(-)

diff --git a/avoidable_admissions/data/validate.py b/avoidable_admissions/data/validate.py
index 01e9044..a5e3a79 100644
--- a/avoidable_admissions/data/validate.py
+++ b/avoidable_admissions/data/validate.py
@@ -520,6 +520,15 @@ class Config:
                 )
             ],
         ),
+        "edchiefcomplaint_cat": pa.Column(
+            # nullable=True,
+            checks=[
+                pa.Check.isin(
+                    set([*feature_maps.load_ed_cc_mapping().values(), "-"]),
+                    ignore_na=True,
+                )
+            ],
+        ),
     }
 )
 
diff --git a/avoidable_admissions/features/emergency_care_features.py b/avoidable_admissions/features/emergency_care_features.py
index 6323786..ecc3642 100644
--- a/avoidable_admissions/features/emergency_care_features.py
+++ b/avoidable_admissions/features/emergency_care_features.py
@@ -160,6 +160,16 @@ def _disstatus(df: pd.DataFrame) -> pd.DataFrame:
 
     return df
 
+def _cc_code(df: pd.DataFrame) -> pd.DataFrame:
+    """Add `edchiefcomplaint_cat`, mapped from `edchiefcomplaint` via the chief complaint map."""
+    # TODO: This section needs manual review of a good sample size to ensure it works
+
+    df["edchiefcomplaint_cat"] = replace_values(
+        df.edchiefcomplaint, load_ed_cc_mapping()
+    )
+    return df
+
+
 
 def build_all(df: pd.DataFrame) -> pd.DataFrame:
 
@@ -167,6 +177,7 @@ def build_all(df: pd.DataFrame) -> pd.DataFrame:
         df.pipe(_age)
         .pipe(_accommodationstatus)
         .pipe(_acsc_code)
+        .pipe(_cc_code)
         .pipe(_disstatus)
         .pipe(_edacuity)
         .pipe(_edarivalemode)
diff --git a/avoidable_admissions/features/feature_maps.py b/avoidable_admissions/features/feature_maps.py
index 33a2fa9..7c64399 100644
--- a/avoidable_admissions/features/feature_maps.py
+++ b/avoidable_admissions/features/feature_maps.py
@@ -3,6 +3,7 @@
 
 import numpy as np
 import pandas as pd
+import os.path
 
 from avoidable_admissions.data import nhsdd_snomed
 
@@ -514,7 +515,7 @@ def load_apc_acsc_mapping() -> Dict[str, str]:
 
     # TODO: Store this file locally and hit Google Docs only if there is no local file.
 
-    sheet_id = "1M3uS6qh3d9OY31gFxy8858ZxBiFjGE_Y"  # APC - ACSC V1 20230130
+    sheet_id = "1qTSYlxY12lOKQ3pV6Chd-tgY-msir8yB"  # APC - ACSC V2 20230224
     sheet_name = "Sheet1"
     url = f"https://docs.google.com/spreadsheets/d/{sheet_id}/gviz/tq?tqx=out:csv&sheet={sheet_name}"
     acsc = pd.read_csv(url, usecols=[0, 1])
@@ -533,7 +534,7 @@ def load_ed_acsc_mapping() -> Dict[str, str]:
 
     # TODO: Store this file locally and hit Google Docs only if there is no local file.
 
-    sheet_id = "1Jsx4Am9a3Hvv7VJwIFb4z4_oV7zXL39e"  # ECDS - ACSC V5 20230130
+    sheet_id = "1uk3T2XwjtaU3ZEvJCdfGRRvl-pkHtTUM"  # ECDS - ACSC V6 20230224
     sheet_name = "ACSC ECDS and ICD-10"
     url = f"https://docs.google.com/spreadsheets/d/{sheet_id}/gviz/tq?tqx=out:csv&sheet={sheet_name}"
     url = url.replace(" ", "%20")
@@ -574,3 +575,57 @@ def load_ed_acsc_mapping() -> Dict[str, str]:
     acsc_mapping[1] = "ERROR:Unmapped - Not In Refset"
 
     return acsc_mapping
+
+@lru_cache(maxsize=1)
+def load_ed_cc_mapping() -> Dict[str, str]:
+    """Load the chief complaint mapping as a dictionary of snomed_code:cc_category.
+
+    Reads the local CSV if present, else downloads it from Sheffield Google Docs.
+    """
+    local_path = "data/external/cc_mapping.csv"
+
+    # Pass the path itself to read_csv, not the boolean from os.path.exists
+    if os.path.exists(local_path):
+        url = local_path
+    else:
+        sheet_id = "18XbVmWJsccACoTDFd8EBeslKPtsPApqi"  # Chief_Complaint_Coding_V2
+        sheet_name = "Sheet1"
+        url = f"https://docs.google.com/spreadsheets/d/{sheet_id}/gviz/tq?tqx=out:csv&sheet={sheet_name}"
+        url = url.replace(" ", "%20")
+
+    cc = pd.read_csv(url, usecols=[4, 7])
+    cc.columns = cc.columns.str.strip()
+    cc.columns = cc.columns.str.lower().str.replace("[^a-z0-9]+", "_", regex=True)
+    cc_mapping = cc.set_index("snomed_code").chief_complain_category.to_dict()
+
+    # Set ERROR codes to allow validation to pass after feature engineering
+    # TODO: Tidy this up
+
+    # Get the members of the refset from nhsdd_snomed
+    # This file has been automatically generated from the Ontology Server
+    refset_members = nhsdd_snomed.edchiefcomplaint["members"]
+
+    # Unmapped codes are the codes in the refset that are not in the feature map
+    # For each code in refset that is not in the feature map, set to 'unmapped';
+    # setdefault leaves codes that already have a category untouched
+
+    for code in refset_members:
+        cc_mapping.setdefault(code, "ERROR:Unmapped - In Refset")
+
+    # For codes that appear in the mapping but not in the refset
+    # prepend 'ERROR:Mapped - Not In Refset|' to the existing value
+
+    for k, v in cc_mapping.items():
+        if k not in refset_members:
+            cc_mapping[k] = "ERROR:Mapped - Not In Refset|" + v
+
+    # Key 0 maps to the missing-data category (presumably missing codes are 0 upstream - confirm)
+    cc_mapping[0] = "ERROR:Missing Data"
+
+    # Add in a placeholder for codes that are neither in the featuremap nor in refset
+    # These are for unforeseen values that may appear in the source data
+
+    cc_mapping[1] = "ERROR:Unmapped - Not In Refset"
+
+    return cc_mapping
+