Skip to content

Commit

Permalink
EPHHeatRelatedIllness_auto_refresh
Browse files Browse the repository at this point in the history
  • Loading branch information
SudhishaK committed Jan 3, 2025
1 parent 6c95044 commit e5debfb
Show file tree
Hide file tree
Showing 17 changed files with 383 additions and 777 deletions.
22 changes: 8 additions & 14 deletions scripts/us_cdc/heat_related_illness/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,27 +3,21 @@
This directory imports [Heat and Heat-related Illness](https://ephtracking.cdc.gov/qrlist/35) from EPH Tracking into Data Commons. It includes data at a state level.

## Cleaning source data
The source data is downloaded manually from the EPH [website](https://ephtracking.cdc.gov/qrlist/35). Currently, this import brings in data related to [Heat-related Emergency Department Visits](https://ephtracking.cdc.gov/qrd/438), [Heat-Related Mortality](https://ephtracking.cdc.gov/qrd/370), and [Heat-related Hospitalizations](https://ephtracking.cdc.gov/qrd/431).
To clean the source data, run:
The source data is available from the EPH [website](https://ephtracking.cdc.gov/qrlist/35). Currently, this import brings in data related to [Heat-related Emergency Department Visits](https://ephtracking.cdc.gov/qrd/438), [Heat-Related Mortality](https://ephtracking.cdc.gov/qrd/370), and [Heat-related Hospitalizations](https://ephtracking.cdc.gov/qrd/431).

```bash
python clean_data.py --input_path=source_data/ --output_path=<output_path>
```

## Generating artifacts at a State level:
The artifacts can be generated from the cleaned data.
To generate `cleaned.csv`, `output.mcf` run:
To download and clean the source data, run:

```bash
python preprocess.py --input_path=<directory path to cleaned data> --config_path=<path to config> --output_path=<directory path to write csv and mcf>
python clean_data.py
```

## Aggregating at a Country level
At a country level, aggregation is performed by summing over the state level `cleaned.csv`.
To aggregate run:
## Generating artifacts at a State level & Aggregating at a Country level:
The artifacts are generated from the cleaned data. At the country level, aggregation is performed by summing over the state-level `cleaned.csv`, and the country-level data is written to `state/country_output.csv`.

To generate `cleaned.csv`, `output.mcf`, `output.tmcf` and `country_output.csv`, run:

```bash
python aggregate.py --input_path=<path to state level csv> --output_path=<output csv path>
python preprocess.py
```

## Data Caveats:
Expand Down
54 changes: 28 additions & 26 deletions scripts/us_cdc/heat_related_illness/preprocess.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,13 +32,13 @@

# Parsed contents of config.json; populated by process() before any
# per-file cleaning happens.
_CONFIG = None

# Columns in cleaned CSV.
# NOTE: this tuple was previously defined twice (a merge artifact); it is
# now defined exactly once.
_OUTPUT_COLUMNS = ('Year', 'StatVar', 'Quantity', 'Geo', 'measurementMethod')

# Pipeline I/O locations (formerly supplied via command-line flags).
input_path = "./source_data/combined_csv_files/"
config_path = "./config.json"
output_path = "./state"


def generate_tmcf():
_TMCF_TEMPLATE = ("Node: E:EPHHeatIllness->E0\n"
Expand Down Expand Up @@ -155,36 +155,38 @@ def aggregate():
country_df.to_csv(output_path + "/country_output.csv", index=False)


def process(cleaned_csv_path, output_mcf_path, input_path):
    """Cleans all source CSVs under input_path into one CSV plus an MCF file.

    Args:
        cleaned_csv_path: Path of the cleaned CSV file to write.
        output_mcf_path: Path of the MCF file to write.
        input_path: Directory whose '*.csv' files are processed.

    Side effects:
        Loads the module-level config file (config_path) into the global
        _CONFIG before any file is processed.
    """
    global _CONFIG
    with open(config_path, 'r', encoding='utf-8') as config_f:
        _CONFIG = json.load(config_f)

    with open(cleaned_csv_path, 'w', encoding='utf-8') as cleaned_f:
        f_writer = csv.DictWriter(cleaned_f, fieldnames=_OUTPUT_COLUMNS)
        f_writer.writeheader()
        statvar_list = []
        # Sort the directory listing: os.listdir() order is
        # filesystem-dependent, which would make the cleaned CSV's row
        # order (and the byte-for-byte golden-file tests) flaky.
        for file_name in sorted(os.listdir(input_path)):
            if file_name.endswith('.csv'):
                file_path = os.path.join(input_path, file_name)
                with open(file_path, 'r', encoding='utf-8') as csv_f:
                    f_reader = csv.DictReader(csv_f,
                                              delimiter=',',
                                              quotechar='"')
                    statvars = _process_file(file_name, f_reader, f_writer)
                    if statvars:
                        statvar_list.extend(statvars)
    write_to_mcf(statvar_list, output_mcf_path)


def main(argv):
    """Entry point: cleans source data, then generates TMCF and aggregates.

    Writes 'cleaned.csv' and 'output.mcf' under output_path via process(),
    then emits the TMCF and the country-level aggregate CSV. Any failure is
    logged as fatal.
    """
    try:
        cleaned_csv_path = os.path.join(output_path, 'cleaned.csv')
        output_mcf_path = os.path.join(output_path, 'output.mcf')

        # process() loads the config and runs the whole per-file cleaning
        # loop; a stale inline duplicate of that loop was removed here so
        # the work is done exactly once.
        process(cleaned_csv_path, output_mcf_path, input_path)
        generate_tmcf()
        aggregate()
        logging.info("Processing completed!")
    except Exception as e:
        # Broad catch is intentional at this top-level boundary; fatal log
        # records the failure for the pipeline operator.
        logging.fatal(f"Encountered some issue with process - {e}")


if __name__ == "__main__":
Expand Down
70 changes: 40 additions & 30 deletions scripts/us_cdc/heat_related_illness/preprocess_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,40 +16,50 @@
import os
import unittest
import tempfile
import subprocess
from preprocess import process

_SCRIPT_PATH = os.path.dirname(os.path.abspath(__file__))


class EPHHeatRelatedIllness(unittest.TestCase):
    """Golden-file tests: runs process() once, then compares CSV/MCF output.

    NOTE(review): doing the processing in __init__ re-runs it for every
    test method; consider moving it to setUpClass — confirm intent.
    """

    def __init__(self, methodName: str = 'runTest') -> None:
        # Default to 'runTest' (unittest's own default) instead of the
        # Ellipsis placeholder, which would crash super().__init__ if the
        # class were ever instantiated without an explicit method name.
        super().__init__(methodName)
        self.exp_path = os.path.join(_SCRIPT_PATH, 'testdata',
                                     'expected_files')
        self.act_path = os.path.join(_SCRIPT_PATH, 'testdata',
                                     'actual_files')
        csv_path = os.path.join(self.act_path, 'cleaned.csv')
        mcf_path = os.path.join(self.act_path, 'output.mcf')
        input_path = os.path.join(_SCRIPT_PATH, 'testdata', 'cleaned_data/')
        # Create the output directory if it does not already exist.
        os.makedirs(self.act_path, exist_ok=True)

        # Generate the actual output files to compare against the goldens.
        process(csv_path, mcf_path, input_path)

        with open(csv_path, 'r', encoding='utf-8') as f_result:
            self.actual_csv_result = f_result.read()

        with open(mcf_path, 'r', encoding='utf-8') as f_result:
            self.actual_mcf_result = f_result.read()

    def test_csv(self):
        """Cleaned CSV output must match the golden expected.csv."""
        expected_csv_path = os.path.join(self.exp_path, 'expected.csv')

        with open(expected_csv_path, 'r', encoding='utf-8') as f_expected:
            expected_result = f_expected.read()

        self.assertEqual(self.actual_csv_result, expected_result)

    def test_mcf(self):
        """Generated MCF output must match the golden expected_output.mcf."""
        expected_mcf_path = os.path.join(self.exp_path,
                                         'expected_output.mcf')

        with open(expected_mcf_path, 'r', encoding='utf-8') as f_expected:
            expected_result = f_expected.read()

        self.assertEqual(self.actual_mcf_result, expected_result)


# Allow running this test module directly (e.g. `python preprocess_test.py`).
if __name__ == '__main__':
    unittest.main()
Original file line number Diff line number Diff line change
@@ -1,21 +1,10 @@
Year,State,Data Value
2000,California,41
2001,California,25
2002,California,43
2003,California,38
2004,California,33
2005,California,62
2006,California,188
2007,California,67
2008,California,42
2009,California,54
2010,California,39
2011,California,26
2012,California,61
2013,California,66
2014,California,47
2015,California,49
2016,California,67
2017,California,98
2018,California,98
2019,California,57
Year,State,Data Value
2016,Alabama,12
2017,Arizona,177
2018,Arkansas,13
2019,California,128
2020,Colorado,11
2021,Connecticut,Suppressed
2000,Delaware,Suppressed
2001,District of Columbia,Suppressed
2002,Florida,41
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
Year,State,Data Value,Age Group
2010,Alaska,0,0 TO 4
2011,Alaska,0,5 TO 14
2012,Alaska,2,15 TO 34
2013,Alaska,0,35 TO 64
2014,Alaska,0,>= 65
2015,Arizona,27,0 TO 4
2016,Arizona,92,5 TO 14
2017,Arizona,620,15 TO 34
2018,Arizona,681,35 TO 64
2019,Arizona,153,>= 65
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
Year,State,Data Value
2015,Arizona,"1,978"
2016,California,"5,309"
2017,Colorado,355
2018,Connecticut,348
2019,Florida,"5,209"
2020,Iowa,774
2021,Kansas,813
2022,Kentucky,"1,479"
2000,Louisiana,"2,836"

This file was deleted.

Loading

0 comments on commit e5debfb

Please sign in to comment.