Skip to content

Commit

Permalink
EPHHeatRelatedIllness_auto_refresh
Browse files Browse the repository at this point in the history
  • Loading branch information
SudhishaK committed Jan 3, 2025
1 parent 6c95044 commit e5debfb
Show file tree
Hide file tree
Showing 17 changed files with 383 additions and 777 deletions.
22 changes: 8 additions & 14 deletions scripts/us_cdc/heat_related_illness/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,27 +3,21 @@
This directory imports [Heat and Heat-related Illness](https://ephtracking.cdc.gov/qrlist/35) from EPH Tracking into Data Commons. It includes data at a state level.

## Cleaning source data
The source data is downloaded manually from the EPH [website](https://ephtracking.cdc.gov/qrlist/35). Currently, this import brings in data related to [Heat-related Emergency Department Visits](https://ephtracking.cdc.gov/qrd/438), [Heat-Related Mortality](https://ephtracking.cdc.gov/qrd/370), and [Heat-related Hospitalizations](https://ephtracking.cdc.gov/qrd/431).
To clean the source data, run:
The source data is available from the EPH [website](https://ephtracking.cdc.gov/qrlist/35). Currently, this import brings in data related to [Heat-related Emergency Department Visits](https://ephtracking.cdc.gov/qrd/438), [Heat-Related Mortality](https://ephtracking.cdc.gov/qrd/370), and [Heat-related Hospitalizations](https://ephtracking.cdc.gov/qrd/431).

```bash
python clean_data.py --input_path=source_data/ --output_path=<output_path>
```

## Generating artifacts at a State level:
The artifacts can be generated from the cleaned data.
To generate `cleaned.csv`, `output.mcf` run:
To download and clean the source data, run:

```bash
python preprocess.py --input_path=<directory path to cleaned data> --config_path=<path to config> --output_path=<directory path to write csv and mcf>
python clean_data.py
```

## Aggregating at a Country level
At a country level, aggregation is performed by summing over the state level `cleaned.csv`.
To aggregate run:
## Generating artifacts at a State level & Aggregating at a Country level:
The artifacts are generated from the cleaned data. At the country level, aggregation is performed by summing over the state-level `cleaned.csv`, and the country-level data is written to `state/country_output.csv`.

To generate `cleaned.csv`, `output.mcf`, `output.tmcf` and `country_output.csv`, run:

```bash
python aggregate.py --input_path=<path to state level csv> --output_path=<output csv path>
python preprocess.py
```

## Data Caveats:
Expand Down
54 changes: 28 additions & 26 deletions scripts/us_cdc/heat_related_illness/preprocess.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,13 +32,13 @@

# Parsed contents of config.json; populated by process() before any
# per-file cleaning happens.
_CONFIG = None

# Columns in cleaned CSV.
# NOTE: this tuple was previously defined twice (a merge artifact); it is
# now defined exactly once.
_OUTPUT_COLUMNS = ('Year', 'StatVar', 'Quantity', 'Geo', 'measurementMethod')

# Pipeline I/O locations (formerly supplied via command-line flags).
input_path = "./source_data/combined_csv_files/"
config_path = "./config.json"
output_path = "./state"


def generate_tmcf():
_TMCF_TEMPLATE = ("Node: E:EPHHeatIllness->E0\n"
Expand Down Expand Up @@ -155,36 +155,38 @@ def aggregate():
country_df.to_csv(output_path + "/country_output.csv", index=False)


def process(cleaned_csv_path, output_mcf_path, input_path):
    """Cleans all source CSVs under input_path into one CSV plus an MCF file.

    Args:
        cleaned_csv_path: Path of the cleaned CSV file to write.
        output_mcf_path: Path of the MCF file to write.
        input_path: Directory whose '*.csv' files are processed.

    Side effects:
        Loads the module-level config file (config_path) into the global
        _CONFIG before any file is processed.
    """
    global _CONFIG
    with open(config_path, 'r', encoding='utf-8') as config_f:
        _CONFIG = json.load(config_f)

    with open(cleaned_csv_path, 'w', encoding='utf-8') as cleaned_f:
        f_writer = csv.DictWriter(cleaned_f, fieldnames=_OUTPUT_COLUMNS)
        f_writer.writeheader()
        statvar_list = []
        # Sort the directory listing: os.listdir() order is
        # filesystem-dependent, which would make the cleaned CSV's row
        # order (and the byte-for-byte golden-file tests) flaky.
        for file_name in sorted(os.listdir(input_path)):
            if file_name.endswith('.csv'):
                file_path = os.path.join(input_path, file_name)
                with open(file_path, 'r', encoding='utf-8') as csv_f:
                    f_reader = csv.DictReader(csv_f,
                                              delimiter=',',
                                              quotechar='"')
                    statvars = _process_file(file_name, f_reader, f_writer)
                    if statvars:
                        statvar_list.extend(statvars)
    write_to_mcf(statvar_list, output_mcf_path)


def main(argv):
    """Entry point: cleans source data, then generates TMCF and aggregates.

    Writes 'cleaned.csv' and 'output.mcf' under output_path via process(),
    then emits the TMCF and the country-level aggregate CSV. Any failure is
    logged as fatal.
    """
    try:
        cleaned_csv_path = os.path.join(output_path, 'cleaned.csv')
        output_mcf_path = os.path.join(output_path, 'output.mcf')

        # process() loads the config and runs the whole per-file cleaning
        # loop; a stale inline duplicate of that loop was removed here so
        # the work is done exactly once.
        process(cleaned_csv_path, output_mcf_path, input_path)
        generate_tmcf()
        aggregate()
        logging.info("Processing completed!")
    except Exception as e:
        # Broad catch is intentional at this top-level boundary; fatal log
        # records the failure for the pipeline operator.
        logging.fatal(f"Encountered some issue with process - {e}")


if __name__ == "__main__":
Expand Down
70 changes: 40 additions & 30 deletions scripts/us_cdc/heat_related_illness/preprocess_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,40 +16,50 @@
import os
import unittest
import tempfile
import subprocess
from preprocess import process

_SCRIPT_PATH = os.path.dirname(os.path.abspath(__file__))


class EPHHeatRelatedIllness(unittest.TestCase):
    """Golden-file tests: runs process() once, then compares CSV/MCF output.

    NOTE(review): doing the processing in __init__ re-runs it for every
    test method; consider moving it to setUpClass — confirm intent.
    """

    def __init__(self, methodName: str = 'runTest') -> None:
        # Default to 'runTest' (unittest's own default) instead of the
        # Ellipsis placeholder, which would crash super().__init__ if the
        # class were ever instantiated without an explicit method name.
        super().__init__(methodName)
        self.exp_path = os.path.join(_SCRIPT_PATH, 'testdata',
                                     'expected_files')
        self.act_path = os.path.join(_SCRIPT_PATH, 'testdata',
                                     'actual_files')
        csv_path = os.path.join(self.act_path, 'cleaned.csv')
        mcf_path = os.path.join(self.act_path, 'output.mcf')
        input_path = os.path.join(_SCRIPT_PATH, 'testdata', 'cleaned_data/')
        # Create the output directory if it does not already exist.
        os.makedirs(self.act_path, exist_ok=True)

        # Generate the actual output files to compare against the goldens.
        process(csv_path, mcf_path, input_path)

        with open(csv_path, 'r', encoding='utf-8') as f_result:
            self.actual_csv_result = f_result.read()

        with open(mcf_path, 'r', encoding='utf-8') as f_result:
            self.actual_mcf_result = f_result.read()

    def test_csv(self):
        """Cleaned CSV output must match the golden expected.csv."""
        expected_csv_path = os.path.join(self.exp_path, 'expected.csv')

        with open(expected_csv_path, 'r', encoding='utf-8') as f_expected:
            expected_result = f_expected.read()

        self.assertEqual(self.actual_csv_result, expected_result)

    def test_mcf(self):
        """Generated MCF output must match the golden expected_output.mcf."""
        expected_mcf_path = os.path.join(self.exp_path,
                                         'expected_output.mcf')

        with open(expected_mcf_path, 'r', encoding='utf-8') as f_expected:
            expected_result = f_expected.read()

        self.assertEqual(self.actual_mcf_result, expected_result)


# Allow running this test module directly (e.g. `python preprocess_test.py`).
if __name__ == '__main__':
    unittest.main()
Original file line number Diff line number Diff line change
@@ -1,21 +1,10 @@
Year,State,Data Value
2000,California,41
2001,California,25
2002,California,43
2003,California,38
2004,California,33
2005,California,62
2006,California,188
2007,California,67
2008,California,42
2009,California,54
2010,California,39
2011,California,26
2012,California,61
2013,California,66
2014,California,47
2015,California,49
2016,California,67
2017,California,98
2018,California,98
2019,California,57
Year,State,Data Value
2016,Alabama,12
2017,Arizona,177
2018,Arkansas,13
2019,California,128
2020,Colorado,11
2021,Connecticut,Suppressed
2000,Delaware,Suppressed
2001,District of Columbia,Suppressed
2002,Florida,41
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
Year,State,Data Value,Age Group
2010,Alaska,0,0 TO 4
2011,Alaska,0,5 TO 14
2012,Alaska,2,15 TO 34
2013,Alaska,0,35 TO 64
2014,Alaska,0,>= 65
2015,Arizona,27,0 TO 4
2016,Arizona,92,5 TO 14
2017,Arizona,620,15 TO 34
2018,Arizona,681,35 TO 64
2019,Arizona,153,>= 65
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
Year,State,Data Value
2015,Arizona,"1,978"
2016,California,"5,309"
2017,Colorado,355
2018,Connecticut,348
2019,Florida,"5,209"
2020,Iowa,774
2021,Kansas,813
2022,Kentucky,"1,479"
2000,Louisiana,"2,836"

This file was deleted.

Loading

0 comments on commit e5debfb

Please sign in to comment.