This repository has been archived by the owner on May 30, 2024. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 0
/
02_extract_and_subset_data.py
130 lines (108 loc) · 5.2 KB
/
02_extract_and_subset_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
"""
Extract only data of interest (relating to South Gloucestershire) from zip and Shapefiles.
"""
import io
import typing
from pathlib import Path
from zipfile import ZipFile

import geopandas as gpd
import pandas as pd

import config
def extract_from_zip(zip_path: Path, file_suffix: str) -> typing.BinaryIO:
    """
    Extract the first file whose filename ends in `file_suffix` from the zip file at `zip_path` and
    return it as a binary file object.

    Entries whose name starts with `__MACOSX` are never matched. The matching member is read
    fully into memory, so the returned object stays readable after the archive has been closed
    and does not keep the zip file open on disk.

    Raises `KeyError` if no file in the archive ends in `file_suffix`.
    """
    with ZipFile(zip_path) as zip_file:
        for compressed_file in zip_file.infolist():
            if compressed_file.filename.startswith('__MACOSX'):
                # Skip non-data folders
                continue
            if compressed_file.filename.endswith(file_suffix):
                # Copy the contents out before the `with` block closes the archive, rather than
                # handing back a stream that still points into the (closed) ZipFile
                return io.BytesIO(zip_file.read(compressed_file))
    # Only reached once every entry has been inspected without finding a match
    raise KeyError(f'File ending in "{file_suffix}" not found in {zip_path}')
def extract_census_lsoa_metric(input_file: Path, output_file: Path) -> None:
    """
    Extract LSOA-level 2021 Census data from the zip archive `input_file` that is relevant to our
    area of interest (South Gloucestershire) and save it to the csv `output_file`.

    The archive member whose name ends in `-lsoa.csv` is read; rows are kept when their
    `geography` value starts with 'South Glouc'. The output is written without the index column.
    """
    # Load the relevant size of geography from the input zip file; the `with` block closes the
    # extracted file object as soon as pandas has finished reading it
    with extract_from_zip(input_file, '-lsoa.csv') as csv_file:
        data = pd.read_csv(csv_file)
    # Extract the region of interest to csv
    subset = data['geography'].str.startswith('South Glouc')
    data[subset].to_csv(output_file, index=False)
def extract_census_msoa_metric(input_file: Path, output_file: Path) -> None:
    """
    Extract MSOA-level 2021 Census data from the zip archive `input_file` that is relevant to our
    area of interest (South Gloucestershire) and save it to the csv `output_file`.

    The archive member whose name ends in `-msoa.csv` is read; rows are kept when their
    `geography` value starts with 'South Glouc'. The output is written without the index column.
    """
    # Load the relevant size of geography from the input zip file; the `with` block closes the
    # extracted file object as soon as pandas has finished reading it
    with extract_from_zip(input_file, '-msoa.csv') as csv_file:
        data = pd.read_csv(csv_file)
    # Extract the region of interest to csv
    # (the MSOAs get resampled to LSOAs at a later stage)
    subset = data['geography'].str.startswith('South Glouc')
    data[subset].to_csv(output_file, index=False)
def extract_imd_lsoa_2011_metric(input_file: Path, output_file: Path):
    """
    Pull the South Gloucestershire rows out of the English indices of deprivation workbook.

    Every worksheet of the Excel `input_file` whose name starts with 'IoD2019 ' is lined up on
    the shared LSOA / Local Authority District identifier columns and joined side by side. Rows
    whose 'LSOA name (2011)' starts with 'South Glouc' are then written to the csv `output_file`.

    Note: LSOA boundaries from the 2011 Census are used.
    """
    # Identifier columns shared by the worksheets; used as the key for aligning them
    key_columns = [
        'LSOA code (2011)',
        'LSOA name (2011)',
        'Local Authority District code (2019)',
        'Local Authority District name (2019)',
    ]

    # Load all worksheets of the source Excel document
    workbook = pd.read_excel(input_file, sheet_name=None)

    # Keep only the indices-of-deprivation sheets, each indexed by the identifier columns
    keyed_sheets = []
    for title, frame in workbook.items():
        if not title.startswith('IoD2019 '):
            continue
        keyed_sheets.append(frame.set_index(key_columns))

    # Place the sheets' columns next to each other, then restore the identifiers as columns
    combined = pd.concat(keyed_sheets, axis=1)
    combined = combined.reset_index()

    # Write out only the region of interest
    # (the 2011 LSOAs get mapped to 2021 LSOAs at a later stage)
    in_region = combined['LSOA name (2011)'].str.startswith('South Glouc')
    combined[in_region].to_csv(output_file, index=False)
def extract_geodata_region(input_file: Path, output_file: Path) -> None:
    """
    Extract LSOAs/MSOAs boundaries from the Shapefile `input_file` that are relevant to our area of
    interest (South Gloucestershire) and save them to the Shapefile `output_file`.

    The area name is looked up in the `MSOA21NM` column when present, otherwise in `LSOA21NM`;
    rows are kept when that name starts with 'South Glouc'.

    Raises `KeyError` if the input contains neither of those columns.
    """
    data = gpd.read_file(input_file)
    # MSOA is listed first so it takes precedence when both columns are present
    name_column = next(
        (column for column in ('MSOA21NM', 'LSOA21NM') if column in data),
        None,
    )
    if name_column is None:
        # Fail with a clear message instead of an unbound-variable error further down
        raise KeyError(f'Neither "MSOA21NM" nor "LSOA21NM" column found in {input_file}')
    subset = data[name_column].str.startswith('South Glouc')
    data[subset].to_file(output_file)
if __name__ == '__main__':
    # Make sure the output directory is present before anything is written into it
    config.subset_data_dir.mkdir(parents=True, exist_ok=True)

    # For each data type defined in the `data_sources.yaml` config: the function that filters it
    # down to our area of interest (South Gloucestershire) and the extension of the file it writes
    extractors = {
        'census_lsoa_metric': (extract_census_lsoa_metric, '.csv'),
        'census_msoa_metric': (extract_census_msoa_metric, '.csv'),
        'imd_lsoa_2011_metric': (extract_imd_lsoa_2011_metric, '.csv'),
        'geodata': (extract_geodata_region, '.shp'),
    }

    for source in config.data_sources:
        print(source['name'])
        entry = extractors.get(source['type'])
        if entry is None:
            # No extraction method is registered for this data type
            print('(skipped)')
            continue
        extract, extension = entry
        extract(
            input_file=config.raw_data_dir / config.get_download_filename(source),
            output_file=config.subset_data_dir / (source['name'] + extension),
        )