# vulnerability/code/02_extract_and_subset_data.py

"""
Extract only data of interest (relating to South Gloucestershire) from zip and Shapefiles.
"""

import io
from pathlib import Path
import typing
from zipfile import ZipFile

import geopandas as gpd
import pandas as pd

import config


def extract_from_zip(zip_path: Path, file_suffix: str) -> typing.BinaryIO:
    """
    Extract the first file whose filename ends in `file_suffix` from the zip file at `zip_path` and
    return it as a binary file object.

    Entries whose path starts with `__MACOSX` are ignored. The matching member is decompressed
    fully into memory, so the returned object stays readable after the archive has been closed
    and no handle on the zip file is left open.

    Raises a `KeyError` if no file in the archive ends in `file_suffix`.
    """
    with ZipFile(zip_path) as zip_file:
        for compressed_file in zip_file.infolist():
            if compressed_file.filename.startswith('__MACOSX'):
                # Skip non-data folders
                continue
            if compressed_file.filename.endswith(file_suffix):
                # Copy the contents out of the archive rather than returning a handle that
                # points into a zip file we are about to close
                return io.BytesIO(zip_file.read(compressed_file))

    # Only reached when the loop finished without returning a match
    raise KeyError(f'File ending in "{file_suffix}" not found in {zip_path}')


def extract_census_lsoa_metric(input_file: Path, output_file: Path):
    """
    Extract LSOA-level 2021 Census data from the zip archive `input_file` that is relevant to our
    area of interest (South Gloucestershire) and save it to the csv `output_file`.

    The csv is taken from the archive member whose filename ends in `-lsoa.csv`. Only rows whose
    `geography` value starts with 'South Glouc' are kept; the row index is not written out.
    """
    # Load the relevant size of geography from the input zip file, closing the extracted file
    # object as soon as it has been parsed
    with extract_from_zip(input_file, '-lsoa.csv') as csv_file:
        data = pd.read_csv(csv_file)

    # Extract the region of interest to csv
    subset = data['geography'].str.startswith('South Glouc')
    data[subset].to_csv(output_file, index=False)

    
def extract_census_msoa_metric(input_file: Path, output_file: Path):
    """
    Extract MSOA-level 2021 Census data from the zip archive `input_file` that is relevant to our
    area of interest (South Gloucestershire) and save it to the csv `output_file`.

    The csv is taken from the archive member whose filename ends in `-msoa.csv`. Only rows whose
    `geography` value starts with 'South Glouc' are kept; the row index is not written out.
    """
    # Load the relevant size of geography from the input zip file, closing the extracted file
    # object as soon as it has been parsed
    with extract_from_zip(input_file, '-msoa.csv') as csv_file:
        data = pd.read_csv(csv_file)

    # Extract the region of interest to csv
    # (the MSOAs get resampled to LSOAs at a later stage)
    subset = data['geography'].str.startswith('South Glouc')
    data[subset].to_csv(output_file, index=False)
    

def extract_imd_lsoa_2011_metric(input_file: Path, output_file: Path):
    """
    Subset the English indices of deprivation workbook `input_file` (Excel) to the LSOAs in our
    area of interest (South Gloucestershire) and write the result to the csv `output_file`.

    Every worksheet whose name starts with 'IoD2019 ' is used; their columns are placed side by
    side, aligned on the LSOA / Local Authority District identifier columns.

    Note: LSOA boundaries from the 2011 Census are used.
    """
    # Columns identifying each row, used to line the worksheets up with one another
    id_columns = [
        'LSOA code (2011)',
        'LSOA name (2011)',
        'Local Authority District code (2019)',
        'Local Authority District name (2019)',
    ]

    # `sheet_name=None` loads every worksheet into a {name: DataFrame} mapping
    workbook = pd.read_excel(input_file, sheet_name=None)

    # Index each relevant worksheet by the identifier columns
    indexed_sheets = []
    for title, frame in workbook.items():
        if not title.startswith('IoD2019 '):
            continue
        indexed_sheets.append(frame.set_index(id_columns))

    # Join the worksheets column-wise, then turn the identifiers back into ordinary columns
    combined = pd.concat(indexed_sheets, axis=1).reset_index()

    # Keep only the region of interest and save it as csv
    # (the 2011 LSOAs get mapped to 2021 LSOAs at a later stage)
    in_region = combined['LSOA name (2011)'].str.startswith('South Glouc')
    combined[in_region].to_csv(output_file, index=False)


def extract_geodata_region(input_file: Path, output_file: Path):
    """
    Extract LSOAs/MSOAs boundaries from the Shapefile `input_file` that are relevant to our area of
    interest (South Gloucestershire) and save them to the Shapefile `output_file`.

    The area name is read from the `MSOA21NM` column if present, otherwise from `LSOA21NM`.
    Only features whose name starts with 'South Glouc' are kept.

    Raises a `KeyError` if the input has neither name column.
    """
    data = gpd.read_file(input_file)

    # Prefer the MSOA name column, falling back to the LSOA one
    name_column = next(
        (column for column in ('MSOA21NM', 'LSOA21NM') if column in data),
        None,
    )
    if name_column is None:
        # Fail with a clear message instead of an unbound-variable error further down
        raise KeyError(f'Neither "MSOA21NM" nor "LSOA21NM" column found in {input_file}')

    subset = data[name_column].str.startswith('South Glouc')
    data[subset].to_file(output_file)


if __name__ == '__main__':
    # Make sure the output directory is there before anything is written to it
    config.subset_data_dir.mkdir(parents=True, exist_ok=True)

    # Map each data type from the `data_sources.yaml` config to the function that subsets it to
    # our area of interest (South Gloucestershire) and the extension of the file it writes
    extractors = {
        'census_lsoa_metric': (extract_census_lsoa_metric, '.csv'),
        'census_msoa_metric': (extract_census_msoa_metric, '.csv'),
        'imd_lsoa_2011_metric': (extract_imd_lsoa_2011_metric, '.csv'),
        'geodata': (extract_geodata_region, '.shp'),
    }

    for data_source in config.data_sources:
        print(data_source['name'])

        handler = extractors.get(data_source['type'])
        if handler is None:
            # No extraction method is defined for this data type
            print('(skipped)')
            continue

        extract, extension = handler
        extract(
            input_file=config.raw_data_dir / config.get_download_filename(data_source),
            output_file=config.subset_data_dir / (data_source['name'] + extension),
        )