Skip to content

Commit

Permalink
add more extensive handling of ICD9 codes; the diabetes dataset somet…
Browse files Browse the repository at this point in the history
…imes uses level 4 codes (for diabetes-related tasks); also handle some special cases to ensure every ICD9 code in the dataset is mapped to an appropriate descriptor
  • Loading branch information
jpgard committed Dec 23, 2023
1 parent d5d9262 commit 6922f9e
Showing 1 changed file with 94 additions and 6 deletions.
100 changes: 94 additions & 6 deletions tableshift/datasets/diabetes_readmission.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@
"""
import json
import os
import re
from typing import Dict

import pandas as pd
from tableshift.core.features import Feature, FeatureList, cat_dtype
Expand All @@ -34,18 +36,104 @@
decreased, 'steady' if the dosage did not change, and 'no' if the drug
was not prescribed."""

# These codes are missing from the list we used.
SUPERFICIAL_INJURY_ICD9_CODES = {
"910": "Superficial injury of face, neck, and scalp except eye",
"911": "Superficial injury of trunk",
"912": "Superficial injury of shoulder and upper arm",
"913": "Superficial injury of elbow, forearm, and wrist",
"914": "Superficial injury of hand(s) except finger(s) alone",
"915": " Superficial injury of finger(s)",
"916": "Superficial injury of hip, thigh, leg, and ankle",
"917": "Superficial injury of foot and toe(s)",
"918": "Superficial injury of eye and adnexa",
"919": "Superficial injury of other, multiple, and unspecified sites",
"919.0": "Abrasion or friction burn of other multiple and unspecified sites without infection",
"919.1": "Abrasion or friction burn of other multiple and unspecified sites infected",
"919.2": "Blister of other multiple and unspecified sites without infection",
"919.3": "Blister of other multiple and unspecified sites infected",
"919.4": "Insect bite nonvenomous of other multiple and unspecified sites without infection",
"919.5": "Insect bite nonvenomous of other multiple and unspecified sites infected",
"919.6": "Superficial foreign body (splinter) of other multiple and unspecified sites without major open wound and without infection",
"919.7": "Superficial foreign body (splinter) of other multiple and unspecified sites without major open wound infected",
"919.8": "Other and unspecified superficial injury of other multiple and unspecified sites without infection",
"919.9": "Other and unspecified superficial injury of other multiple and unspecified sites infected",
}

def get_icd9(depth=3) -> dict:
"""Fetch a dictionary mapping ICD9 codes to string descriptors."""

def _diabetes_icd9_codes() -> Dict[str, str]:
# See https://en.wikipedia.org/wiki/List_of_ICD-9_codes_240–279:_endocrine,_nutritional_and_metabolic_diseases,_and_immunity_disorders
# First digit of decimal
first_decimal_codes = {0: 'without mention of complication',
1: 'with ketoacidosis',
2: 'with hyperosmolarity',
3: 'with other coma',
4: 'with renal manifestations',
5: 'with ophthalmic manifestations',
6: 'with neurological manifestations',
7: 'with peripheral circulatory disorders',
8: 'with other specified manifestations',
9: 'with unspecified complication',
}

# Second digit of decimal
second_decimal_codes = {
# (250.x0) Diabetes mellitus type 2
0: 'Diabetes mellitus type 2',
# (250.x1) Diabetes mellitus type 1
1: 'Diabetes mellitus type 1',
# (250.x2) Diabetes mellitus type 2, uncontrolled
2: 'Diabetes mellitus type 2, uncontrolled',
# (250.x3) Diabetes mellitus type 1, uncontrolled
3: 'Diabetes mellitus type 1, uncontrolled',
}
diabetes_codes_mapping = {}
for first_decimal, desc in first_decimal_codes.items():
for second_decimal, diabetes_type in second_decimal_codes.items():
code = f'250.{first_decimal}{second_decimal}'
desc = ' '.join((diabetes_type, desc))
diabetes_codes_mapping[code] = desc
return diabetes_codes_mapping


def get_icd9(depths=(3, 4)) -> dict:
"""Fetch a dictionary mapping ICD9 codes to string descriptors.
Also applies some preprocessing (dropping leading zeros) to match the format used in diabetes dataset.
Additionally, since the dataset uses depth-3 codes for non-diabetes diagnoses, but depth-4
codes for diabetes diagnosis (i.e. 250.x), we take the union of both depths such that
every code present in the dataset is mapped.
"""
# via https://raw.githubusercontent.com/sirrice/icd9/master/codes.json
fp = os.path.join(os.path.dirname(__file__), "./icd9-codes.json")
with open(fp, "r") as f:
raw = f.read()
icd9_codes = json.loads(raw)
depth_mapping = {c["code"]: c["descr"]
for group in icd9_codes for c in group
if c['depth'] == depth}
return depth_mapping

def _preprocess_code_text(x: str) -> str:
# Drop any leading zeros in codes
try:
return re.sub("^0+", "", x)
except:
import ipdb;
ipdb.set_trace()

mapping = {}
for depth in depths:
depth_mapping = {_preprocess_code_text(c["code"]): c["descr"]
for group in icd9_codes for c in group
if c['depth'] == depth}
mapping.update(depth_mapping)
# Add the detailed diabetes codes; these are also used in this dataset for diabetes only.
diabetes_icd9_codes = _diabetes_icd9_codes()
mapping.update(diabetes_icd9_codes)
mapping.update(SUPERFICIAL_INJURY_ICD9_CODES)
mapping.update({"235": "Neoplasm of uncertain behavior of digestive and respiratory systems",
"365.44": "Glaucoma associated with congenital anomalies, with dystrophies and with systemic syndromes",
"752": "Congenital anomalies of genital organs",
"E849": "Place of occurrence at Home",
"nan": "Not entered, unknown or missing"})
return mapping


# Note: the UCI version of this dataset does *not* exactly correspond to the
Expand Down

0 comments on commit 6922f9e

Please sign in to comment.