diff --git a/code/01_baseline/04_preprocess_english_house_price.ipynb b/code/01_baseline/04_preprocess_english_house_price.ipynb new file mode 100644 index 0000000..4b15333 --- /dev/null +++ b/code/01_baseline/04_preprocess_english_house_price.ipynb @@ -0,0 +1,131 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "f4eff720-400d-4388-9bb9-689c514fa853", + "metadata": {}, + "source": [ + "# Preprocess house price\n", + "\n", + "Get house price per OA across England as a Parquet file\n", + "\n", + "## A new attribute-linked residential property price dataset for England and Wales 2011-2019\n", + "\n", + "We can use https://reshare.ukdataservice.ac.uk/854942/ that derived a price per sqm from link between sales data and EPC." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "7afeffbf-f1ed-43da-bc1b-61a457429095", + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "aa061ba2-0eaf-4737-beac-bc4f54c2de90", + "metadata": {}, + "outputs": [], + "source": [ + "# It is 6.3GB...\n", + "linked_epc_path = \"https://reshare.ukdataservice.ac.uk/854942/1/tranall2011_19.csv\"" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "99c06f6a-18b4-444d-a21e-bb7bd414ec14", + "metadata": {}, + "source": [ + "Filter only Tyne and Wear for the last two years (2018-19)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4517557e-be6f-46fe-9bcc-1675e6098343", + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "epc = pd.read_csv(linked_epc_path)\n", + "epc[\"dateoftransfer\"] = pd.to_datetime(epc.dateoftransfer)\n", + "last2years = epc[epc.dateoftransfer > datetime.datetime(2018, 1, 1)]" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "283843a2-78ac-4549-9905-76d4f7f3f0fa", + "metadata": {}, + "source": [ + "Link to geometry" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bc1e3a99-a0b5-4f34-bca1-d63da9dbec1f", + "metadata": {}, + "outputs": [], + "source": [ + "oa = gpd.read_file( \"https://borders.ukdataservice.ac.uk/ukborders/easy_download/prebuilt/shape/infuse_oa_lyr_2011_clipped.zip\", engine=\"pyogrio\"\n", + ")\n", + "agg = (\n", + " last2years[[\"oa11\", \"priceper\", \"numberrooms\", \"price\", \"tfarea\"]]\n", + " .groupby(\"oa11\")\n", + " .mean()\n", + " .reset_index()\n", + ")\n", + "agg_oa = oa.merge(agg, left_on=\"geo_code\", right_on=\"oa11\", how=\"inner\")\n", + "agg_oa" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0897d793-c5a6-49ca-a321-be31dd595bb8", + "metadata": {}, + "outputs": [], + "source": [ + "agg_oa.to_file(\n", + " f\"{data_folder}/processed/house_prices/price_per_sqm.gpkg\", engine=\"pyogrio\"\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "582514fb-515d-49e1-b554-d6fa00687338", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.3" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}