diff --git a/code/04_generalisation/test_spatial_units.ipynb b/code/04_generalisation/test_spatial_units.ipynb new file mode 100644 index 0000000..3941970 --- /dev/null +++ b/code/04_generalisation/test_spatial_units.ipynb @@ -0,0 +1,2202 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "dbfa2951-ee8a-4671-bd58-41a4d3e78ad6", + "metadata": {}, + "source": [ + "# Test spatial units in the modelling\n", + "\n", + "This tests performance of different core spatial units (grids, OA, LSOA) in the modelling to figure out the best unit to be used internally. \n", + "\n", + "Units to compare:\n", + "\n", + "- 2011 output areas (the data we have are linked to 2011 census, not 2021)\n", + "- square grid 100m (mimicking OSGB but not being aligned for simplicity of this exercise)\n", + "- H3 grid at resolution 9\n", + "- Enclosed tessellation cells\n", + "\n", + "Targets to compare:\n", + "\n", + "- Air pollution\n", + "- House price\n", + "\n", + "Models to compare:\n", + "\n", + "- distance band weights using fuzzy\n", + "- contiguity of order 5\n", + "\n", + "Geographic locations to compare:\n", + "\n", + "- Leeds (chunk 40)\n", + "- Newcastle (chunk 26)\n", + "\n", + "The final models shall be trained on the England-wide data. This trains only on each AOI." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "28c8045d-3962-4413-826e-a3d6985979cc", + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import geopandas as gpd\n", + "import xarray as xr\n", + "import tobler\n", + "import libpysal\n", + "import numpy as np\n", + "from itertools import product\n", + "import esda\n", + "\n", + "from sklearn.ensemble import HistGradientBoostingRegressor\n", + "from sklearn import metrics\n", + "from tobler.util import h3fy" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "872d1ec6-0a52-4220-9406-01204baee367", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "data_folder = \"/Users/martin/Library/CloudStorage/OneDrive-SharedLibraries-TheAlanTuringInstitute/Daniel Arribas-Bel - demoland_data\"" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "8705ba0e-cdb2-44fa-9726-81b0d46ba1a5", + "metadata": {}, + "outputs": [], + "source": [ + "chunks = gpd.read_parquet(f\"{data_folder}/raw/urban_morpho/local_auth_chunks.pq\")" + ] + }, + { + "cell_type": "markdown", + "id": "250eac73-8875-4482-a49b-7b0dff629bb9", + "metadata": {}, + "source": [ + "- get air\n", + "- for chunk\n", + " - for geom\n", + " - generate/load geoms\n", + " - interpolate air\n", + " - interpolate price\n", + " - interpolate explanatory\n", + " - save table\n", + " - for weights\n", + " - create weights\n", + " - get lags\n", + " - fit the model\n", + " - save sklearn model\n", + " - save demoland model\n", + " - save some performance data" + ] + }, + { + "cell_type": "markdown", + "id": "56c1e5f0-cb13-46fa-9f82-0aae30054552", + "metadata": {}, + "source": [ + "## Get Air Pollution data" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "e9a1e901-f63f-4f89-a4f8-cb944658a586", + "metadata": { + "jupyter": { + "source_hidden": true + } + }, + "outputs": [], + "source": [ + "pm10_21 = (\n", + " pd.read_csv(\n", + " \"https://uk-air.defra.gov.uk/datastore/pcm/mappm102021g.csv\",\n", + " header=5,\n", + " na_values=[\"MISSING\"],\n", + " )\n", + " .set_index([\"x\", \"y\"])\n", + " .drop(columns=\"gridcode\")\n", + " .to_xarray()\n", + ")\n", + "pm25_21 = (\n", + " pd.read_csv(\n", + " \"https://uk-air.defra.gov.uk/datastore/pcm/mappm252021g.csv\",\n", + " header=5,\n", + " na_values=[\"MISSING\"],\n", + " )\n", + " .set_index([\"x\", \"y\"])\n", + " .drop(columns=\"gridcode\")\n", + " .to_xarray()\n", + ")\n", + "no2_21 = (\n", + " pd.read_csv(\n", + " \"https://uk-air.defra.gov.uk/datastore/pcm/mapno22021.csv\",\n", + " header=5,\n", + " na_values=[\"MISSING\"],\n", + " )\n", + " .set_index([\"x\", \"y\"])\n", + " .drop(columns=\"gridcode\")\n", + " .to_xarray()\n", + ")\n", + "so2_21 = (\n", + " pd.read_csv(\n", + " \"https://uk-air.defra.gov.uk/datastore/pcm/mapso22021.csv\",\n", + " header=5,\n", + " na_values=[\"MISSING\"],\n", + " )\n", + " .set_index([\"x\", \"y\"])\n", + " .drop(columns=\"gridcode\")\n", + " .to_xarray()\n", + ")\n", + "pollutants_2021 = xr.merge([pm10_21, pm25_21, no2_21, so2_21])\n", + "aqi = (\n", + " pollutants_2021.pm252021g\n", + " + pollutants_2021.pm102021g / 2\n", + " + pollutants_2021.no22021 / 4\n", + " + pollutants_2021.so22021 / 10\n", + ")\n", + "pollutants_2021 = pollutants_2021.assign(aqi=aqi)" + ] + }, + { + "cell_type": "markdown", + "id": "84a75ffa-4484-4d31-bbae-add25fe74976", + "metadata": {}, + "source": [ + "## Get House Price data" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "bc7b0176-035d-45bf-94e1-94e8ed9065ed", + "metadata": {}, + "outputs": [], + "source": [ + "house_prices = gpd.read_parquet(f\"{data_folder}/processed/house_prices/price_per_sqm_england.parquet\")" + ] + }, + { + "cell_type": "markdown", + "id": "a5d6398c-676f-4137-bc3e-e932253d90a5", + "metadata": {}, + "source": [ + "## Get population data" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "b7fdd69a-3bc8-476b-9a2d-636cd55b49ae", + "metadata": {}, + "outputs": [], + "source": [ + "pop = gpd.read_parquet(f\"{data_folder}/processed/gb_population_estimates.pq\")" + ] + }, + { + "cell_type": "markdown", + "id": "a9fa38a0-34e8-4a6f-bf4d-d4751b301bae", + "metadata": {}, + "source": [ + "## Get workplace data" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "7dbd4371-cd39-426b-b3a3-4214a66f7ad3", + "metadata": {}, + "outputs": [], + "source": [ + "wp = gpd.read_parquet(\n", + " f\"{data_folder}/raw/workplace_population/workplace_by_industry_gb.pq\"\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "7e307521-92fd-4d01-be12-a3d5c5cc4c99", + "metadata": {}, + "source": [ + "## Get CORINE data" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "0f4fd505-6641-4a5a-a885-5d4fe7955235", + "metadata": {}, + "outputs": [], + "source": [ + "corine = gpd.read_parquet(f\"{data_folder}/raw/land_cover/corine_gb.pq\")" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "317bafbe-7eeb-4396-b786-97b052a1c5b0", + "metadata": {}, + "outputs": [], + "source": [ + "corine_names = {\n", + " \"Code_18_124\": \"Land cover [Airports]\",\n", + " \"Code_18_211\": \"Land cover [Non-irrigated arable land]\",\n", + " \"Code_18_121\": \"Land cover [Industrial or commercial units]\",\n", + " \"Code_18_421\": \"Land cover [Salt marshes]\",\n", + " \"Code_18_522\": \"Land cover [Estuaries]\",\n", + " \"Code_18_142\": \"Land cover [Sport and leisure facilities]\",\n", + " \"Code_18_141\": \"Land cover [Green urban areas]\",\n", + " \"Code_18_112\": \"Land cover [Discontinuous urban fabric]\",\n", + " \"Code_18_231\": \"Land cover [Pastures]\",\n", + " \"Code_18_311\": \"Land cover [Broad-leaved forest]\",\n", + " \"Code_18_131\": \"Land cover [Mineral extraction sites]\",\n", + " \"Code_18_123\": \"Land cover [Port areas]\",\n", + " \"Code_18_122\": \"Land cover [Road and rail networks and associated land]\",\n", + " \"Code_18_512\": \"Land cover [Water bodies]\",\n", + " \"Code_18_243\": \"Land cover [Land principally occupied by agriculture, with significant areas of natural vegetation]\",\n", + " \"Code_18_313\": \"Land cover [Mixed forest]\",\n", + " \"Code_18_412\": \"Land cover [Peat bogs]\",\n", + " \"Code_18_321\": \"Land cover [Natural grasslands]\",\n", + " \"Code_18_322\": \"Land cover [Moors and heathland]\",\n", + " \"Code_18_324\": \"Land cover [Transitional woodland-shrub]\",\n", + " \"Code_18_111\": \"Land cover [Continuous urban fabric]\",\n", + " \"Code_18_423\": \"Land cover [Intertidal flats]\",\n", + " \"Code_18_523\": \"Land cover [Sea and ocean]\",\n", + " \"Code_18_312\": \"Land cover [Coniferous forest]\",\n", + " \"Code_18_133\": \"Land cover [Construction sites]\",\n", + " \"Code_18_333\": \"Land cover [Sparsely vegetated areas]\",\n", + " \"Code_18_332\": \"Land cover [Bare rocks]\",\n", + " \"Code_18_411\": \"Land cover [Inland marshes]\",\n", + " \"Code_18_132\": \"Land cover [Dump sites]\",\n", + " \"Code_18_222\": \"Land cover [Fruit trees and berry plantations]\",\n", + " \"Code_18_242\": \"Land cover [Complex cultivation patterns]\",\n", + " \"Code_18_331\": \"Land cover [Beaches, dunes, sands]\",\n", + " \"Code_18_511\": \"Land cover [Water courses]\",\n", + " \"Code_18_334\": \"Land cover [Burnt areas]\",\n", + " \"Code_18_244\": \"Land cover [Agro-forestry areas]\",\n", + " \"Code_18_521\": \"Land cover [Coastal lagoons]\",\n", + "}" + ] + }, + { + "cell_type": "markdown", + "id": "74ad1a79", + "metadata": {}, + "source": [ + "Evaluation script" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "bc39f040", + "metadata": {}, + "outputs": [], + "source": [ + "meta = {}" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "76cdbb94", + "metadata": {}, + "outputs": [], + "source": [ + "def fit_and_eval(geoms, place, geom_type):\n", + " \"\"\"Fit the model and evaluate each fold\n", + "\n", + " Parameters\n", + " ----------\n", + " geoms : GeoDataFrame\n", + " gdf with everything\n", + " place : str\n", + " name of a place\n", + " geom_type : str\n", + " name of a geom type\n", + " \"\"\"\n", + " meta[place][geom_type][\"air\"] = {}\n", + " meta[place][geom_type][\"hp\"] = {}\n", + " for loop in range(4):\n", + " meta[place][geom_type][\"air\"][f\"loop_{loop}\"] = {}\n", + " meta[place][geom_type][\"hp\"][f\"loop_{loop}\"] = {}\n", + "\n", + " # avoid special treatment for HP and AQ\n", + " geoms[\"house_price_index\"] = geoms[\"house_price_index\"].replace(0, np.nan)\n", + " geoms = geoms.dropna(subset=\"house_price_index\")\n", + "\n", + " mask = geoms[\"split\"]==loop\n", + " train = geoms[~mask]\n", + " test = geoms[mask]\n", + " if geom_type == \"oa\":\n", + " W_train = libpysal.weights.fuzzy_contiguity(train.reset_index(), buffering=True, buffer=2000)\n", + " W_test = libpysal.weights.fuzzy_contiguity(test.reset_index(), buffering=True, buffer=2000)\n", + " else:\n", + " W_train = libpysal.weights.DistanceBand.from_dataframe(train.centroid.reset_index(), 2000)\n", + " W_test = libpysal.weights.DistanceBand.from_dataframe(test.centroid.reset_index(), 2000)\n", + "\n", + " no_exvars =[\n", + " \"geometry\",\n", + " \"air_quality_index\",\n", + " \"house_price_index\",\n", + " ]\n", + " exvars_train = train.drop(columns=no_exvars)\n", + " exvars_test = test.drop(columns=no_exvars)\n", + "\n", + " W_train.transform = \"r\"\n", + " W_test.transform = \"r\"\n", + " for col in exvars_train.columns.copy():\n", + " exvars_train[f\"{col}_lag\"] = libpysal.weights.spatial_lag.lag_spatial(W_train, exvars_train[col])\n", + " exvars_test[f\"{col}_lag\"] = libpysal.weights.spatial_lag.lag_spatial(W_test, exvars_test[col])\n", + "\n", + " # Air pollution\n", + " regressor_air = HistGradientBoostingRegressor(\n", + " random_state=0, max_bins=64, max_iter=1000\n", + " )\n", + "\n", + " regressor_air.fit(exvars_train, train.air_quality_index)\n", + " pred = regressor_air.predict(exvars_test)\n", + " residuals = test.air_quality_index - pred\n", + "\n", + " meta[place][geom_type][\"air\"][f\"loop_{loop}\"][\"mse\"] = metrics.mean_squared_error(test.air_quality_index, pred)\n", + " meta[place][geom_type][\"air\"][f\"loop_{loop}\"][\"me\"] = residuals.abs().mean()\n", + " meta[place][geom_type][\"air\"][f\"loop_{loop}\"][\"r2\"] = metrics.r2_score(test.air_quality_index, pred)\n", + " moran_obs = esda.Moran(test.air_quality_index, W_test)\n", + " moran_pred = esda.Moran(pred, W_test)\n", + " meta[place][geom_type][\"air\"][f\"loop_{loop}\"][\"moran_obs\"] = moran_obs.I\n", + " meta[place][geom_type][\"air\"][f\"loop_{loop}\"][\"moran_pred\"] = moran_pred.I\n", + "\n", + "\n", + " # House price\n", + "\n", + " regressor_hp = HistGradientBoostingRegressor(\n", + " random_state=0, max_bins=64, max_iter=1000\n", + " )\n", + "\n", + " regressor_hp.fit(exvars_train, np.log(train.house_price_index))\n", + " pred = regressor_air.predict(exvars_test)\n", + " residuals = np.log(test.house_price_index) - pred\n", + "\n", + " meta[place][geom_type][\"hp\"][f\"loop_{loop}\"][\"mse\"] = metrics.mean_squared_error(np.log(test.house_price_index), pred)\n", + " meta[place][geom_type][\"hp\"][f\"loop_{loop}\"][\"me\"] = residuals.abs().mean()\n", + " meta[place][geom_type][\"hp\"][f\"loop_{loop}\"][\"r2\"] = metrics.r2_score(np.log(test.house_price_index), pred)\n", + " moran_obs = esda.Moran(np.log(test.house_price_index), W_test)\n", + " moran_pred = esda.Moran(pred, W_test)\n", + " meta[place][geom_type][\"hp\"][f\"loop_{loop}\"][\"moran_obs\"] = moran_obs.I\n", + " meta[place][geom_type][\"hp\"][f\"loop_{loop}\"][\"moran_pred\"] = moran_pred.I" + ] + }, + { + "cell_type": "markdown", + "id": "5a8ae977-bd80-4309-a9c4-225633125c2d", + "metadata": {}, + "source": [ + "## Leeds\n", + "\n", + "#### Prep Leeds data" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "3f6d1fe7", + "metadata": {}, + "outputs": [], + "source": [ + "opt_id = 40\n", + "opt = \"leeds\"\n", + "\n", + "chunk = chunks.loc[[opt_id]]\n", + "meta[opt] = {}" + ] + }, + { + "cell_type": "markdown", + "id": "616e5f6e", + "metadata": {}, + "source": [ + "Prepare Air pollution data" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "75e6aacc", + "metadata": {}, + "outputs": [], + "source": [ + "bds = chunk.buffer(1000).total_bounds\n", + "pollutants_aoi = pollutants_2021.sel(\n", + " x=slice(bds[0], bds[2]), y=slice(bds[1], bds[3])\n", + ")\n", + "pollutants_aoi_df = pollutants_aoi.to_dataframe().reset_index()\n", + "pollutants_aoi_df = gpd.GeoDataFrame(\n", + " pollutants_aoi_df,\n", + " geometry=gpd.points_from_xy(\n", + " pollutants_aoi_df.x, pollutants_aoi_df.y, crs=27700\n", + " ).buffer(500, cap_style=3),\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "04e41d0e", + "metadata": {}, + "source": [ + "Get CV splits" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "3e80928a", + "metadata": {}, + "outputs": [], + "source": [ + "cv_ids = np.tile(np.arange(5), pollutants_aoi_df.shape[0]//5 + 1)[:pollutants_aoi_df.shape[0]]\n", + "rng = np.random.default_rng()\n", + "rng.shuffle(cv_ids)\n", + "pollutants_aoi_df[\"split\"] = cv_ids" + ] + }, + { + "cell_type": "markdown", + "id": "a2af2271", + "metadata": {}, + "source": [ + "Prepare house price data" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "f1805d3a", + "metadata": {}, + "outputs": [], + "source": [ + "house_prices_aoi = house_prices.iloc[house_prices.sindex.query(chunk.geometry.item())]" + ] + }, + { + "cell_type": "markdown", + "id": "3a7f0e05", + "metadata": {}, + "source": [ + "Prepare population data" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "fc8ffc20", + "metadata": {}, + "outputs": [], + "source": [ + "pop_aoi = pop[pop.code.isin(house_prices_aoi.code)]" + ] + }, + { + "cell_type": "markdown", + "id": "69f36a61", + "metadata": {}, + "source": [ + "Link population and house price (both are on OA)." + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "f121ab98", + "metadata": {}, + "outputs": [], + "source": [ + "pop_hp = house_prices_aoi.merge(pop_aoi[[\"code\", \"population\"]], on=\"code\")" + ] + }, + { + "cell_type": "markdown", + "id": "05cd56ff", + "metadata": {}, + "source": [ + " Prepare workplace pop data" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "5ea3f290", + "metadata": {}, + "outputs": [], + "source": [ + "wp_aoi = wp.iloc[wp.sindex.query(chunk.geometry.item())]" + ] + }, + { + "cell_type": "markdown", + "id": "84ff417f", + "metadata": {}, + "source": [ + "Prepare CORINE data" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "9385fdf3", + "metadata": {}, + "outputs": [], + "source": [ + "corine_aoi = corine.iloc[corine.sindex.query(chunk.geometry.item())]" + ] + }, + { + "cell_type": "markdown", + "id": "dc2e8e7b", + "metadata": {}, + "source": [ + "Get morphometric data" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "72ea9f0c", + "metadata": {}, + "outputs": [], + "source": [ + "data = gpd.read_parquet(f\"{data_folder}/raw/urban_morpho/cells_{opt_id}.pq\")" + ] + }, + { + "cell_type": "markdown", + "id": "cae9f052", + "metadata": {}, + "source": [ + "### H3\n", + "Create geometries" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "id": "60c06543", + "metadata": {}, + "outputs": [], + "source": [ + "meta[opt][\"h3\"] = {}" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "id": "5b014423", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/martin/mambaforge/envs/demoland_v2/lib/python3.11/site-packages/pyproj/crs/crs.py:1293: UserWarning: You will likely lose important projection information when converting to a PROJ string from another format. See: https://proj.org/faq.html#what-is-the-best-format-for-describing-coordinate-reference-systems\n", + " proj = self._crs.to_proj4(version=version)\n" + ] + } + ], + "source": [ + "geoms = h3fy(chunk, resolution=9, buffer=False)" + ] + }, + { + "cell_type": "markdown", + "id": "9ebf3bbb", + "metadata": {}, + "source": [ + "Interpolate Air Quality" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "id": "acafe6bb", + "metadata": {}, + "outputs": [], + "source": [ + "interp = tobler.area_weighted.area_interpolate(pollutants_aoi_df, geoms, intensive_variables=[\"aqi\"])\n", + "geoms[\"air_quality_index\"] = interp.aqi.values" + ] + }, + { + "cell_type": "markdown", + "id": "c5522bef", + "metadata": {}, + "source": [ + " Interpolate OA data" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "id": "25a321c1", + "metadata": {}, + "outputs": [], + "source": [ + "interp_oa = tobler.area_weighted.area_interpolate(\n", + " pop_hp,\n", + " geoms,\n", + " intensive_variables=[\"priceper\"],\n", + " extensive_variables=[\"population\"],\n", + ")\n", + "geoms[\"house_price_index\"] = interp_oa.priceper.values\n", + "geoms[\"population\"] = interp_oa.population.values" + ] + }, + { + "cell_type": "markdown", + "id": "2fe710f2", + "metadata": {}, + "source": [ + "Interpolate workplace population" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "id": "21f930ea", + "metadata": {}, + "outputs": [], + "source": [ + "wp_interpolated = tobler.area_weighted.area_interpolate(\n", + " wp_aoi,\n", + " geoms,\n", + " extensive_variables=[\n", + " \"A, B, D, E. Agriculture, energy and water\",\n", + " \"C. Manufacturing\",\n", + " \"F. Construction\",\n", + " \"G, I. Distribution, hotels and restaurants\",\n", + " \"H, J. Transport and communication\",\n", + " \"K, L, M, N. Financial, real estate, professional and administrative activities\",\n", + " \"O,P,Q. Public administration, education and health\",\n", + " \"R, S, T, U. Other\",\n", + " ],\n", + ")\n", + "\n", + "geoms[\n", + " [\n", + " \"A, B, D, E. Agriculture, energy and water\",\n", + " \"C. Manufacturing\",\n", + " \"F. Construction\",\n", + " \"G, I. Distribution, hotels and restaurants\",\n", + " \"H, J. Transport and communication\",\n", + " \"K, L, M, N. Financial, real estate, professional and administrative activities\",\n", + " \"O,P,Q. Public administration, education and health\",\n", + " \"R, S, T, U. Other\",\n", + " ]\n", + "] = wp_interpolated.drop(columns=\"geometry\").values" + ] + }, + { + "cell_type": "markdown", + "id": "41f492ab", + "metadata": {}, + "source": [ + "Interpolate CORINE" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "id": "e8811af5", + "metadata": {}, + "outputs": [], + "source": [ + "corine_interpolated = tobler.area_weighted.area_interpolate(\n", + " corine_aoi, geoms, categorical_variables=[\"Code_18\"]\n", + ")\n", + "corine_interpolated.columns = corine_interpolated.columns.map(corine_names)\n", + "interesting = [\n", + " \"Land cover [Discontinuous urban fabric]\",\n", + " \"Land cover [Continuous urban fabric]\",\n", + " \"Land cover [Non-irrigated arable land]\",\n", + " \"Land cover [Industrial or commercial units]\",\n", + " \"Land cover [Green urban areas]\",\n", + " \"Land cover [Pastures]\",\n", + " \"Land cover [Sport and leisure facilities]\",\n", + "]\n", + "geoms[interesting] = corine_interpolated[interesting].values" + ] + }, + { + "cell_type": "markdown", + "id": "ff62d9dc", + "metadata": {}, + "source": [ + "Interpolate morphometrics" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "id": "554b3bd1", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/martin/mambaforge/envs/demoland_v2/lib/python3.11/site-packages/tobler/util/util.py:53: UserWarning: nan values in variable: sdbAre, replacing with 0\n", + " warn(f\"nan values in variable: {column}, replacing with 0\")\n", + "/Users/martin/mambaforge/envs/demoland_v2/lib/python3.11/site-packages/tobler/util/util.py:53: UserWarning: nan values in variable: sdbCoA, replacing with 0\n", + " warn(f\"nan values in variable: {column}, replacing with 0\")\n", + "/Users/martin/mambaforge/envs/demoland_v2/lib/python3.11/site-packages/tobler/util/util.py:53: UserWarning: nan values in variable: ssbCCo, replacing with 0\n", + " warn(f\"nan values in variable: {column}, replacing with 0\")\n", + "/Users/martin/mambaforge/envs/demoland_v2/lib/python3.11/site-packages/tobler/util/util.py:53: UserWarning: nan values in variable: ssbCor, replacing with 0\n", + " warn(f\"nan values in variable: {column}, replacing with 0\")\n", + "/Users/martin/mambaforge/envs/demoland_v2/lib/python3.11/site-packages/tobler/util/util.py:53: UserWarning: nan values in variable: ssbSqu, replacing with 0\n", + " warn(f\"nan values in variable: {column}, replacing with 0\")\n", + "/Users/martin/mambaforge/envs/demoland_v2/lib/python3.11/site-packages/tobler/util/util.py:53: UserWarning: nan values in variable: ssbERI, replacing with 0\n", + " warn(f\"nan values in variable: {column}, replacing with 0\")\n", + "/Users/martin/mambaforge/envs/demoland_v2/lib/python3.11/site-packages/tobler/util/util.py:53: UserWarning: nan values in variable: ssbCCM, replacing with 0\n", + " warn(f\"nan values in variable: {column}, replacing with 0\")\n", + "/Users/martin/mambaforge/envs/demoland_v2/lib/python3.11/site-packages/tobler/util/util.py:53: UserWarning: nan values in variable: ssbCCD, replacing with 0\n", + " warn(f\"nan values in variable: {column}, replacing with 0\")\n", + "/Users/martin/mambaforge/envs/demoland_v2/lib/python3.11/site-packages/tobler/util/util.py:53: UserWarning: nan values in variable: stbOri, replacing with 0\n", + " warn(f\"nan values in variable: {column}, replacing with 0\")\n", + "/Users/martin/mambaforge/envs/demoland_v2/lib/python3.11/site-packages/tobler/util/util.py:53: UserWarning: nan values in variable: sicCAR, replacing with 0\n", + " warn(f\"nan values in variable: {column}, replacing with 0\")\n", + "/Users/martin/mambaforge/envs/demoland_v2/lib/python3.11/site-packages/tobler/util/util.py:53: UserWarning: nan values in variable: stbCeA, replacing with 0\n", + " warn(f\"nan values in variable: {column}, replacing with 0\")\n", + "/Users/martin/mambaforge/envs/demoland_v2/lib/python3.11/site-packages/tobler/util/util.py:53: UserWarning: nan values in variable: mtbAli, replacing with 0\n", + " warn(f\"nan values in variable: {column}, replacing with 0\")\n", + "/Users/martin/mambaforge/envs/demoland_v2/lib/python3.11/site-packages/tobler/util/util.py:53: UserWarning: nan values in variable: mtbNDi, replacing with 0\n", + " warn(f\"nan values in variable: {column}, replacing with 0\")\n", + "/Users/martin/mambaforge/envs/demoland_v2/lib/python3.11/site-packages/tobler/util/util.py:53: UserWarning: nan values in variable: stbSAl, replacing with 0\n", + " warn(f\"nan values in variable: {column}, replacing with 0\")\n" + ] + } + ], + "source": [ + "chars = data.columns.drop(\n", + " [\n", + " \"hindex\",\n", + " \"tessellation\",\n", + " \"buildings\",\n", + " \"nodeID\",\n", + " \"edgeID_keys\",\n", + " \"edgeID_values\",\n", + " \"edgeID_primary\",\n", + " \"sdbPer\",\n", + " \"ssbElo\",\n", + " \"stcOri\",\n", + " \"sdcLAL\",\n", + " \"mdcAre\",\n", + " \"ltcAre\",\n", + " \"ltcWRE\",\n", + " \"mtdMDi\",\n", + " \"lcdMes\",\n", + " \"lddNDe\",\n", + " \"sddAre\",\n", + " \"mdsAre\",\n", + " \"ldsAre\",\n", + " \"lisCel\",\n", + " \"ldePer\",\n", + " \"lseCWA\",\n", + " ]\n", + ")\n", + "morhp_interpolated = tobler.area_weighted.area_interpolate(\n", + " data, geoms, intensive_variables=chars.tolist()\n", + ")\n", + "\n", + "geoms[morhp_interpolated.columns.drop(\"geometry\")] = morhp_interpolated.drop(\n", + " columns=\"geometry\"\n", + ").values" + ] + }, + { + "cell_type": "markdown", + "id": "5970c2bb", + "metadata": {}, + "source": [ + "Get split IDs" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "id": "74ce53f2", + "metadata": {}, + "outputs": [], + "source": [ + "geoms_ix, poll_ix = pollutants_aoi_df.sindex.query(geoms.centroid, predicate=\"within\", sort=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "id": "b4e8d7de", + "metadata": {}, + "outputs": [], + "source": [ + "geoms[\"split\"] = pollutants_aoi_df[\"split\"].values[poll_ix]" + ] + }, + { + "cell_type": "markdown", + "id": "f5ed03e4", + "metadata": {}, + "source": [ + "Save the table" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "id": "28ea8ea3", + "metadata": {}, + "outputs": [], + "source": [ + "geoms.to_parquet(f\"{data_folder}/unit_test/tables/h3_{opt}.pq\")" + ] + }, + { + "cell_type": "markdown", + "id": "5f8b9e50", + "metadata": {}, + "source": [ + "Loop over splits, get W, train, eval, save results." + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "id": "81625fd0", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/martin/mambaforge/envs/demoland_v2/lib/python3.11/site-packages/libpysal/weights/weights.py:172: UserWarning: The weights matrix is not fully connected: \n", + " There are 16 disconnected components.\n", + " warnings.warn(message)\n", + "/Users/martin/mambaforge/envs/demoland_v2/lib/python3.11/site-packages/geopandas/geodataframe.py:1543: SettingWithCopyWarning: \n", + "A value is trying to be set on a copy of a slice from a DataFrame.\n", + "Try using .loc[row_indexer,col_indexer] = value instead\n", + "\n", + "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", + " super().__setitem__(key, value)\n", + "/Users/martin/mambaforge/envs/demoland_v2/lib/python3.11/site-packages/libpysal/weights/weights.py:172: UserWarning: The weights matrix is not fully connected: \n", + " There are 15 disconnected components.\n", + " warnings.warn(message)\n", + "/Users/martin/mambaforge/envs/demoland_v2/lib/python3.11/site-packages/libpysal/weights/weights.py:172: UserWarning: The weights matrix is not fully connected: \n", + " There are 11 disconnected components.\n", + " warnings.warn(message)\n", + "/Users/martin/mambaforge/envs/demoland_v2/lib/python3.11/site-packages/libpysal/weights/weights.py:172: UserWarning: The weights matrix is not fully connected: \n", + " There are 10 disconnected components.\n", + " warnings.warn(message)\n" + ] + } + ], + "source": [ + "fit_and_eval(geoms, opt, \"h3\")" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "id": "7b0cb13c", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'leeds': {'h3': {'air': {'loop_0': {'mse': 1.0215962668214376,\n", + " 'me': 0.7268292438504929,\n", + " 'r2': 0.6471124647540352,\n", + " 'moran_obs': 0.9139658896608336,\n", + " 'moran_pred': 0.8911782193252566},\n", + " 'loop_1': {'mse': 0.9052719580939292,\n", + " 'me': 0.7173481454198483,\n", + " 'r2': 0.7214704719652031,\n", + " 'moran_obs': 0.9382819221632059,\n", + " 'moran_pred': 0.9108587337675578},\n", + " 'loop_2': {'mse': 1.189713977353201,\n", + " 'me': 0.7906349848032027,\n", + " 'r2': 0.6480546399551688,\n", + " 'moran_obs': 0.9270422555656327,\n", + " 'moran_pred': 0.9186198786505408},\n", + " 'loop_3': {'mse': 0.9411979683024067,\n", + " 'me': 0.7241649062582691,\n", + " 'r2': 0.7489344824312717,\n", + " 'moran_obs': 0.9386609332461856,\n", + " 'moran_pred': 0.9026754574770531}},\n", + " 'hp': {'loop_0': {'mse': 45.97914682149453,\n", + " 'me': 6.607405179022031,\n", + " 'r2': -818.245515704516,\n", + " 'moran_obs': 0.7485176481450501,\n", + " 'moran_pred': 0.8911782193252566},\n", + " 'loop_1': {'mse': 48.6391071221501,\n", + " 'me': 6.802131639333218,\n", + " 'r2': -784.5792801815664,\n", + " 'moran_obs': 0.7380520767953878,\n", + " 'moran_pred': 0.9108587337675578},\n", + " 'loop_2': {'mse': 48.506456511887656,\n", + " 'me': 6.763358828565924,\n", + " 'r2': -899.4290133356027,\n", + " 'moran_obs': 0.7517514698021434,\n", + " 'moran_pred': 0.9186198786505408},\n", + " 'loop_3': {'mse': 49.32745556540463,\n", + " 'me': 6.814134839974605,\n", + " 'r2': -890.4742353349244,\n", + " 'moran_obs': 0.7737351301281735,\n", + " 'moran_pred': 0.9026754574770531}}}}}" + ] + }, + "execution_count": 32, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "meta" + ] + }, + { + "cell_type": "markdown", + "id": "fefff76e", + "metadata": {}, + "source": [ + "### square grid\n", + "Create geometries" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "id": "204d02c7", + "metadata": {}, + "outputs": [], + "source": [ + "meta[\"leeds\"][\"square\"] = {}" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "id": "c93bda26", + "metadata": {}, + "outputs": [], + "source": [ + "coords = np.array(\n", + " list(product(\n", + " np.arange(bds[0], bds[2], 250),\n", + " np.arange(bds[1], bds[3], 250),\n", + " ))\n", + ")\n", + "points = gpd.GeoSeries.from_xy(coords[:, 0], coords[:, 1], crs=27700)\n", + "geoms = points.iloc[points.sindex.query(chunk.geometry.item(), predicate=\"contains\")].buffer(100, cap_style=3).to_frame(\"geometry\")" + ] + }, + { + "cell_type": "markdown", + "id": "8a45de94", + "metadata": {}, + "source": [ + "Interpolate Air Quality" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "id": "b66b8367", + "metadata": {}, + "outputs": [], + "source": [ + "interp = tobler.area_weighted.area_interpolate(pollutants_aoi_df, geoms, intensive_variables=[\"aqi\"])\n", + "geoms[\"air_quality_index\"] = interp.aqi.values" + ] + }, + { + "cell_type": "markdown", + "id": "ee2270ca", + "metadata": {}, + "source": [ + " Interpolate OA data" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "id": "7d9f9813", + "metadata": {}, + "outputs": [], + "source": [ + "interp_oa = tobler.area_weighted.area_interpolate(\n", + " pop_hp,\n", + " geoms,\n", + " intensive_variables=[\"priceper\"],\n", + " extensive_variables=[\"population\"],\n", + ")\n", + "geoms[\"house_price_index\"] = interp_oa.priceper.values\n", + "geoms[\"population\"] = interp_oa.population.values" + ] + }, + { + "cell_type": "markdown", + "id": "9b3aed63", + "metadata": {}, + "source": [ + "Interpolate workplace population" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "id": "096d4c43", + "metadata": {}, + "outputs": [], + "source": [ + "wp_interpolated = tobler.area_weighted.area_interpolate(\n", + " wp_aoi,\n", + " geoms,\n", + " extensive_variables=[\n", + " \"A, B, D, E. Agriculture, energy and water\",\n", + " \"C. Manufacturing\",\n", + " \"F. Construction\",\n", + " \"G, I. Distribution, hotels and restaurants\",\n", + " \"H, J. Transport and communication\",\n", + " \"K, L, M, N. Financial, real estate, professional and administrative activities\",\n", + " \"O,P,Q. Public administration, education and health\",\n", + " \"R, S, T, U. Other\",\n", + " ],\n", + ")\n", + "\n", + "geoms[\n", + " [\n", + " \"A, B, D, E. Agriculture, energy and water\",\n", + " \"C. Manufacturing\",\n", + " \"F. Construction\",\n", + " \"G, I. Distribution, hotels and restaurants\",\n", + " \"H, J. Transport and communication\",\n", + " \"K, L, M, N. Financial, real estate, professional and administrative activities\",\n", + " \"O,P,Q. Public administration, education and health\",\n", + " \"R, S, T, U. Other\",\n", + " ]\n", + "] = wp_interpolated.drop(columns=\"geometry\").values" + ] + }, + { + "cell_type": "markdown", + "id": "2d5a4c04", + "metadata": {}, + "source": [ + "Interpolate CORINE" + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "id": "03619be9", + "metadata": {}, + "outputs": [], + "source": [ + "corine_interpolated = tobler.area_weighted.area_interpolate(\n", + " corine_aoi, geoms, categorical_variables=[\"Code_18\"]\n", + ")\n", + "corine_interpolated.columns = corine_interpolated.columns.map(corine_names)\n", + "interesting = [\n", + " \"Land cover [Discontinuous urban fabric]\",\n", + " \"Land cover [Continuous urban fabric]\",\n", + " \"Land cover [Non-irrigated arable land]\",\n", + " \"Land cover [Industrial or commercial units]\",\n", + " \"Land cover [Green urban areas]\",\n", + " \"Land cover [Pastures]\",\n", + " \"Land cover [Sport and leisure facilities]\",\n", + "]\n", + "geoms[interesting] = corine_interpolated[interesting].values" + ] + }, + { + "cell_type": "markdown", + "id": "885747a8", + "metadata": {}, + "source": [ + "Interpolate morphometrics" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "id": "3f51e66d", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/martin/mambaforge/envs/demoland_v2/lib/python3.11/site-packages/tobler/util/util.py:53: UserWarning: nan values in variable: sdbAre, replacing with 0\n", + " warn(f\"nan values in variable: {column}, replacing with 0\")\n", + "/Users/martin/mambaforge/envs/demoland_v2/lib/python3.11/site-packages/tobler/util/util.py:53: UserWarning: nan values in variable: sdbCoA, replacing with 0\n", + " warn(f\"nan values in variable: {column}, replacing with 0\")\n", + "/Users/martin/mambaforge/envs/demoland_v2/lib/python3.11/site-packages/tobler/util/util.py:53: UserWarning: nan values in variable: ssbCCo, replacing with 0\n", + " warn(f\"nan values in variable: {column}, replacing with 0\")\n", + "/Users/martin/mambaforge/envs/demoland_v2/lib/python3.11/site-packages/tobler/util/util.py:53: UserWarning: nan values in variable: ssbCor, replacing with 0\n", + " warn(f\"nan values in variable: {column}, replacing with 0\")\n", + "/Users/martin/mambaforge/envs/demoland_v2/lib/python3.11/site-packages/tobler/util/util.py:53: UserWarning: nan values in variable: ssbSqu, replacing with 0\n", + " warn(f\"nan values in variable: {column}, replacing with 0\")\n", + "/Users/martin/mambaforge/envs/demoland_v2/lib/python3.11/site-packages/tobler/util/util.py:53: UserWarning: nan values in variable: ssbERI, replacing with 0\n", + " warn(f\"nan values in variable: {column}, replacing with 0\")\n", + "/Users/martin/mambaforge/envs/demoland_v2/lib/python3.11/site-packages/tobler/util/util.py:53: UserWarning: nan values in variable: ssbCCM, replacing with 0\n", + " warn(f\"nan values in variable: {column}, replacing with 0\")\n", + "/Users/martin/mambaforge/envs/demoland_v2/lib/python3.11/site-packages/tobler/util/util.py:53: UserWarning: nan values in variable: ssbCCD, replacing with 0\n", + " warn(f\"nan values in variable: {column}, replacing with 0\")\n", + "/Users/martin/mambaforge/envs/demoland_v2/lib/python3.11/site-packages/tobler/util/util.py:53: UserWarning: nan values in variable: stbOri, replacing with 0\n", + " warn(f\"nan values in variable: {column}, replacing with 0\")\n", + "/Users/martin/mambaforge/envs/demoland_v2/lib/python3.11/site-packages/tobler/util/util.py:53: UserWarning: nan values in variable: sicCAR, replacing with 0\n", + " warn(f\"nan values in variable: {column}, replacing with 0\")\n", + "/Users/martin/mambaforge/envs/demoland_v2/lib/python3.11/site-packages/tobler/util/util.py:53: UserWarning: nan values in variable: stbCeA, replacing with 0\n", + " warn(f\"nan values in variable: {column}, replacing with 0\")\n", + "/Users/martin/mambaforge/envs/demoland_v2/lib/python3.11/site-packages/tobler/util/util.py:53: UserWarning: nan values in variable: mtbAli, replacing with 0\n", + " warn(f\"nan values in variable: {column}, replacing with 0\")\n", + "/Users/martin/mambaforge/envs/demoland_v2/lib/python3.11/site-packages/tobler/util/util.py:53: UserWarning: nan values in variable: mtbNDi, replacing with 0\n", + " warn(f\"nan values in variable: {column}, replacing with 0\")\n", + "/Users/martin/mambaforge/envs/demoland_v2/lib/python3.11/site-packages/tobler/util/util.py:53: UserWarning: nan values in variable: stbSAl, replacing with 0\n", + " warn(f\"nan values in variable: {column}, replacing with 0\")\n" + ] + } + ], + "source": [ + "chars = data.columns.drop(\n", + " [\n", + " \"hindex\",\n", + " \"tessellation\",\n", + " \"buildings\",\n", + " \"nodeID\",\n", + " \"edgeID_keys\",\n", + " \"edgeID_values\",\n", + " \"edgeID_primary\",\n", + " \"sdbPer\",\n", + " \"ssbElo\",\n", + " \"stcOri\",\n", + " \"sdcLAL\",\n", + " \"mdcAre\",\n", + " \"ltcAre\",\n", + " \"ltcWRE\",\n", + " \"mtdMDi\",\n", + " \"lcdMes\",\n", + " \"lddNDe\",\n", + " \"sddAre\",\n", + " \"mdsAre\",\n", + " \"ldsAre\",\n", + " \"lisCel\",\n", + " \"ldePer\",\n", + " \"lseCWA\",\n", + " ]\n", + ")\n", + "morhp_interpolated = tobler.area_weighted.area_interpolate(\n", + " data, geoms, intensive_variables=chars.tolist()\n", + ")\n", + "\n", + "geoms[morhp_interpolated.columns.drop(\"geometry\")] = morhp_interpolated.drop(\n", + " columns=\"geometry\"\n", + ").values" + ] + }, + { + "cell_type": "markdown", + "id": "9de40656", + "metadata": {}, + "source": [ + "Get split IDs" + ] + }, + { + "cell_type": "code", + "execution_count": 40, + "id": "9b215bd2", + "metadata": {}, + "outputs": [], + "source": [ + "geoms_ix, poll_ix = pollutants_aoi_df.sindex.query(geoms.centroid, predicate=\"within\", sort=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 41, + "id": "d09d7a1d", + "metadata": {}, + "outputs": [], + "source": [ + "geoms[\"split\"] = pollutants_aoi_df[\"split\"].values[poll_ix]" + ] + }, + { + "cell_type": "markdown", + "id": "a1448870", + "metadata": {}, + "source": [ + "Save the table" + ] + }, + { + "cell_type": "code", + "execution_count": 42, + "id": "1ef1845e", + "metadata": {}, + "outputs": [], + "source": [ + "geoms.to_parquet(f\"{data_folder}/unit_test/tables/square_{opt}.pq\")" + ] + }, + { + "cell_type": "markdown", + "id": "8e0dd487", + "metadata": {}, + "source": [ + "Loop over splits, get W, train, eval, save results." + ] + }, + { + "cell_type": "code", + "execution_count": 43, + "id": "08db1967", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/martin/mambaforge/envs/demoland_v2/lib/python3.11/site-packages/libpysal/weights/weights.py:172: UserWarning: The weights matrix is not fully connected: \n", + " There are 14 disconnected components.\n", + " warnings.warn(message)\n", + "/Users/martin/mambaforge/envs/demoland_v2/lib/python3.11/site-packages/geopandas/geodataframe.py:1543: SettingWithCopyWarning: \n", + "A value is trying to be set on a copy of a slice from a DataFrame.\n", + "Try using .loc[row_indexer,col_indexer] = value instead\n", + "\n", + "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", + " super().__setitem__(key, value)\n", + "/Users/martin/mambaforge/envs/demoland_v2/lib/python3.11/site-packages/libpysal/weights/weights.py:172: UserWarning: The weights matrix is not fully connected: \n", + " There are 14 disconnected components.\n", + " warnings.warn(message)\n", + "/Users/martin/mambaforge/envs/demoland_v2/lib/python3.11/site-packages/libpysal/weights/weights.py:172: UserWarning: The weights matrix is not fully connected: \n", + " There are 12 disconnected components.\n", + " warnings.warn(message)\n", + "/Users/martin/mambaforge/envs/demoland_v2/lib/python3.11/site-packages/libpysal/weights/weights.py:172: UserWarning: The weights matrix is not fully connected: \n", + " There are 10 disconnected components.\n", + " warnings.warn(message)\n" + ] + } + ], + "source": [ + "fit_and_eval(geoms, opt, \"square\")" + ] + }, + { + "cell_type": "code", + "execution_count": 44, + "id": "d786e74d", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'leeds': {'h3': {'air': {'loop_0': {'mse': 1.0215962668214376,\n", + " 'me': 0.7268292438504929,\n", + " 'r2': 0.6471124647540352,\n", + " 'moran_obs': 0.9139658896608336,\n", + " 'moran_pred': 0.8911782193252566},\n", + " 'loop_1': {'mse': 0.9052719580939292,\n", + " 'me': 0.7173481454198483,\n", + " 'r2': 0.7214704719652031,\n", + " 'moran_obs': 0.9382819221632059,\n", + " 'moran_pred': 0.9108587337675578},\n", + " 'loop_2': {'mse': 1.189713977353201,\n", + " 'me': 0.7906349848032027,\n", + " 'r2': 0.6480546399551688,\n", + " 'moran_obs': 0.9270422555656327,\n", + " 'moran_pred': 0.9186198786505408},\n", + " 'loop_3': {'mse': 0.9411979683024067,\n", + " 'me': 0.7241649062582691,\n", + " 'r2': 0.7489344824312717,\n", + " 'moran_obs': 0.9386609332461856,\n", + " 'moran_pred': 0.9026754574770531}},\n", + " 'hp': {'loop_0': {'mse': 45.97914682149453,\n", + " 'me': 6.607405179022031,\n", + " 'r2': -818.245515704516,\n", + " 'moran_obs': 0.7485176481450501,\n", + " 'moran_pred': 0.8911782193252566},\n", + " 'loop_1': {'mse': 48.6391071221501,\n", + " 'me': 6.802131639333218,\n", + " 'r2': -784.5792801815664,\n", + " 'moran_obs': 0.7380520767953878,\n", + " 'moran_pred': 0.9108587337675578},\n", + " 'loop_2': {'mse': 48.506456511887656,\n", + " 'me': 6.763358828565924,\n", + " 'r2': -899.4290133356027,\n", + " 'moran_obs': 0.7517514698021434,\n", + " 'moran_pred': 0.9186198786505408},\n", + " 'loop_3': {'mse': 49.32745556540463,\n", + " 'me': 6.814134839974605,\n", + " 'r2': -890.4742353349244,\n", + " 'moran_obs': 0.7737351301281735,\n", + " 'moran_pred': 0.9026754574770531}}},\n", + " 'square': {'air': {'loop_0': {'mse': 1.0420599773442174,\n", + " 'me': 0.7380996956699067,\n", + " 'r2': 0.646438913789853,\n", + " 'moran_obs': 0.9111140022891392,\n", + " 'moran_pred': 0.9027042486979502},\n", + " 'loop_1': {'mse': 0.9235654806637308,\n", + " 'me': 0.7061784534530474,\n", + " 'r2': 0.7193292165390978,\n", + " 'moran_obs': 0.9368653661430093,\n", + " 'moran_pred': 0.922064675591301},\n", + " 'loop_2': {'mse': 1.13918100099561,\n", + " 'me': 0.7919036088455168,\n", + " 'r2': 0.666932509493775,\n", + " 'moran_obs': 0.9239352070372084,\n", + " 'moran_pred': 0.9195815458495651},\n", + " 'loop_3': {'mse': 1.0725950060473017,\n", + " 'me': 0.7528672498496167,\n", + " 'r2': 0.7103765505241364,\n", + " 'moran_obs': 0.9349875493670573,\n", + " 'moran_pred': 0.8896485193646833}},\n", + " 'hp': {'loop_0': {'mse': 46.3854910205935,\n", + " 'me': 6.638112942385972,\n", + " 'r2': -800.8419912402562,\n", + " 'moran_obs': 0.7339407376469802,\n", + " 'moran_pred': 0.9027042486979502},\n", + " 'loop_1': {'mse': 48.173346137871945,\n", + " 'me': 6.765422047470163,\n", + " 'r2': -740.0298818381401,\n", + " 'moran_obs': 0.7344662884119277,\n", + " 'moran_pred': 0.922064675591301},\n", + " 'loop_2': {'mse': 48.43479838576467,\n", + " 'me': 6.763522302146255,\n", + " 'r2': -882.9217979601364,\n", + " 'moran_obs': 0.7324712040592452,\n", + " 'moran_pred': 0.9195815458495651},\n", + " 'loop_3': {'mse': 48.29887582791923,\n", + " 'me': 6.7551295495177355,\n", + " 'r2': -862.8450456628527,\n", + " 'moran_obs': 0.7676786336335174,\n", + " 'moran_pred': 0.8896485193646833}}}}}" + ] + }, + "execution_count": 44, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "meta" + ] + }, + { + "cell_type": "markdown", + "id": "207a925b", + "metadata": {}, + "source": [ + "### output area\n", + "Create geometries" + ] + }, + { + "cell_type": "code", + "execution_count": 45, + "id": "00cf7e9c", + "metadata": {}, + "outputs": [], + "source": [ + "meta[\"leeds\"][\"oa\"] = {}" + ] + }, + { + "cell_type": "code", + "execution_count": 46, + "id": "3868348d", + "metadata": {}, + "outputs": [], + "source": [ + "geoms = pop_hp.set_index(\"code\")[[\"geometry\", \"priceper\", \"population\"]].set_crs(27700, allow_override=True)" + ] + }, + { + "cell_type": "markdown", + "id": "cca55a2e", + "metadata": {}, + "source": [ + "Interpolate Air Quality" + ] + }, + { + "cell_type": "code", + "execution_count": 47, + "id": "fe2979c0", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/martin/mambaforge/envs/demoland_v2/lib/python3.11/site-packages/shapely/set_operations.py:133: RuntimeWarning: invalid value encountered in intersection\n", + " return lib.intersection(a, b, **kwargs)\n" + ] + } + ], + "source": [ + "interp = tobler.area_weighted.area_interpolate(pollutants_aoi_df, geoms, intensive_variables=[\"aqi\"])\n", + "geoms[\"air_quality_index\"] = interp.aqi.values" + ] + }, + { + "cell_type": "markdown", + "id": "ad9abd9f", + "metadata": {}, + "source": [ + " Rename OA data" + ] + }, + { + "cell_type": "code", + "execution_count": 48, + "id": "31fde119", + "metadata": {}, + "outputs": [], + "source": [ + "geoms = geoms.rename(columns={\"priceper\": \"house_price_index\"})" + ] + }, + { + "cell_type": "markdown", + "id": "dd1c4576", + "metadata": {}, + "source": [ + "Interpolate workplace population" + ] + }, + { + "cell_type": "code", + "execution_count": 49, + "id": "ee0792f0", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/martin/mambaforge/envs/demoland_v2/lib/python3.11/site-packages/shapely/set_operations.py:133: RuntimeWarning: invalid value encountered in intersection\n", + " return lib.intersection(a, b, **kwargs)\n" + ] + } + ], + "source": [ + "wp_interpolated = tobler.area_weighted.area_interpolate(\n", + " wp_aoi,\n", + " geoms,\n", + " extensive_variables=[\n", + " \"A, B, D, E. Agriculture, energy and water\",\n", + " \"C. Manufacturing\",\n", + " \"F. Construction\",\n", + " \"G, I. Distribution, hotels and restaurants\",\n", + " \"H, J. Transport and communication\",\n", + " \"K, L, M, N. Financial, real estate, professional and administrative activities\",\n", + " \"O,P,Q. Public administration, education and health\",\n", + " \"R, S, T, U. Other\",\n", + " ],\n", + ")\n", + "\n", + "geoms[\n", + " [\n", + " \"A, B, D, E. Agriculture, energy and water\",\n", + " \"C. Manufacturing\",\n", + " \"F. Construction\",\n", + " \"G, I. Distribution, hotels and restaurants\",\n", + " \"H, J. Transport and communication\",\n", + " \"K, L, M, N. Financial, real estate, professional and administrative activities\",\n", + " \"O,P,Q. Public administration, education and health\",\n", + " \"R, S, T, U. Other\",\n", + " ]\n", + "] = wp_interpolated.drop(columns=\"geometry\").values" + ] + }, + { + "cell_type": "markdown", + "id": "74b0ea5c", + "metadata": {}, + "source": [ + "Interpolate CORINE" + ] + }, + { + "cell_type": "code", + "execution_count": 50, + "id": "b4bc8a13", + "metadata": {}, + "outputs": [], + "source": [ + "corine_interpolated = tobler.area_weighted.area_interpolate(\n", + " corine_aoi, geoms, categorical_variables=[\"Code_18\"]\n", + ")\n", + "corine_interpolated.columns = corine_interpolated.columns.map(corine_names)\n", + "interesting = [\n", + " \"Land cover [Discontinuous urban fabric]\",\n", + " \"Land cover [Continuous urban fabric]\",\n", + " \"Land cover [Non-irrigated arable land]\",\n", + " \"Land cover [Industrial or commercial units]\",\n", + " \"Land cover [Green urban areas]\",\n", + " \"Land cover [Pastures]\",\n", + " \"Land cover [Sport and leisure facilities]\",\n", + "]\n", + "geoms[interesting] = corine_interpolated[interesting].values" + ] + }, + { + "cell_type": "markdown", + "id": "17826424", + "metadata": {}, + "source": [ + "Interpolate morphometrics" + ] + }, + { + "cell_type": "code", + "execution_count": 51, + "id": "97fd4194", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/martin/mambaforge/envs/demoland_v2/lib/python3.11/site-packages/shapely/set_operations.py:133: RuntimeWarning: invalid value encountered in intersection\n", + " return lib.intersection(a, b, **kwargs)\n", + "/Users/martin/mambaforge/envs/demoland_v2/lib/python3.11/site-packages/tobler/util/util.py:53: UserWarning: nan values in variable: sdbAre, replacing with 0\n", + " warn(f\"nan values in variable: {column}, replacing with 0\")\n", + "/Users/martin/mambaforge/envs/demoland_v2/lib/python3.11/site-packages/tobler/util/util.py:53: UserWarning: nan values in variable: sdbCoA, replacing with 0\n", + " warn(f\"nan values in variable: {column}, replacing with 0\")\n", + "/Users/martin/mambaforge/envs/demoland_v2/lib/python3.11/site-packages/tobler/util/util.py:53: UserWarning: nan values in variable: ssbCCo, replacing with 0\n", + " warn(f\"nan values in variable: {column}, replacing with 0\")\n", + "/Users/martin/mambaforge/envs/demoland_v2/lib/python3.11/site-packages/tobler/util/util.py:53: UserWarning: nan values in variable: ssbCor, replacing with 0\n", + " warn(f\"nan values in variable: {column}, replacing with 0\")\n", + "/Users/martin/mambaforge/envs/demoland_v2/lib/python3.11/site-packages/tobler/util/util.py:53: UserWarning: nan values in variable: ssbSqu, replacing with 0\n", + " warn(f\"nan values in variable: {column}, replacing with 0\")\n", + "/Users/martin/mambaforge/envs/demoland_v2/lib/python3.11/site-packages/tobler/util/util.py:53: UserWarning: nan values in variable: ssbERI, replacing with 0\n", + " warn(f\"nan values in variable: {column}, replacing with 0\")\n", + "/Users/martin/mambaforge/envs/demoland_v2/lib/python3.11/site-packages/tobler/util/util.py:53: UserWarning: nan values in variable: ssbCCM, replacing with 0\n", + " warn(f\"nan values in variable: {column}, replacing with 0\")\n", + "/Users/martin/mambaforge/envs/demoland_v2/lib/python3.11/site-packages/tobler/util/util.py:53: UserWarning: nan values in variable: ssbCCD, replacing with 0\n", + " warn(f\"nan values in variable: {column}, replacing with 0\")\n", + "/Users/martin/mambaforge/envs/demoland_v2/lib/python3.11/site-packages/tobler/util/util.py:53: UserWarning: nan values in variable: stbOri, replacing with 0\n", + " warn(f\"nan values in variable: {column}, replacing with 0\")\n", + "/Users/martin/mambaforge/envs/demoland_v2/lib/python3.11/site-packages/tobler/util/util.py:53: UserWarning: nan values in variable: sicCAR, replacing with 0\n", + " warn(f\"nan values in variable: {column}, replacing with 0\")\n", + "/Users/martin/mambaforge/envs/demoland_v2/lib/python3.11/site-packages/tobler/util/util.py:53: UserWarning: nan values in variable: stbCeA, replacing with 0\n", + " warn(f\"nan values in variable: {column}, replacing with 0\")\n", + "/Users/martin/mambaforge/envs/demoland_v2/lib/python3.11/site-packages/tobler/util/util.py:53: UserWarning: nan values in variable: mtbAli, replacing with 0\n", + " warn(f\"nan values in variable: {column}, replacing with 0\")\n", + "/Users/martin/mambaforge/envs/demoland_v2/lib/python3.11/site-packages/tobler/util/util.py:53: UserWarning: nan values in variable: mtbNDi, replacing with 0\n", + " warn(f\"nan values in variable: {column}, replacing with 0\")\n", + "/Users/martin/mambaforge/envs/demoland_v2/lib/python3.11/site-packages/tobler/util/util.py:53: UserWarning: nan values in variable: stbSAl, replacing with 0\n", + " warn(f\"nan values in variable: {column}, replacing with 0\")\n" + ] + } + ], + "source": [ + "chars = data.columns.drop(\n", + " [\n", + " \"hindex\",\n", + " \"tessellation\",\n", + " \"buildings\",\n", + " \"nodeID\",\n", + " \"edgeID_keys\",\n", + " \"edgeID_values\",\n", + " \"edgeID_primary\",\n", + " \"sdbPer\",\n", + " \"ssbElo\",\n", + " \"stcOri\",\n", + " \"sdcLAL\",\n", + " \"mdcAre\",\n", + " \"ltcAre\",\n", + " \"ltcWRE\",\n", + " \"mtdMDi\",\n", + " \"lcdMes\",\n", + " \"lddNDe\",\n", + " \"sddAre\",\n", + " \"mdsAre\",\n", + " \"ldsAre\",\n", + " \"lisCel\",\n", + " \"ldePer\",\n", + " \"lseCWA\",\n", + " ]\n", + ")\n", + "morhp_interpolated = tobler.area_weighted.area_interpolate(\n", + " data, geoms, intensive_variables=chars.tolist()\n", + ")\n", + "\n", + "geoms[morhp_interpolated.columns.drop(\"geometry\")] = morhp_interpolated.drop(\n", + " columns=\"geometry\"\n", + ").values" + ] + }, + { + "cell_type": "markdown", + "id": "58ed4aac", + "metadata": {}, + "source": [ + "Get split IDs" + ] + }, + { + "cell_type": "code", + "execution_count": 52, + "id": "8689ee3c", + "metadata": {}, + "outputs": [], + "source": [ + "geoms_ix, poll_ix = pollutants_aoi_df.sindex.query(geoms.centroid, predicate=\"within\", sort=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 53, + "id": "3cc92ac8", + "metadata": {}, + "outputs": [], + "source": [ + "geoms.loc[geoms.index[geoms_ix], \"split\"] = pollutants_aoi_df[\"split\"].values[poll_ix]" + ] + }, + { + "cell_type": "markdown", + "id": "14971c0e", + "metadata": {}, + "source": [ + "Save the table" + ] + }, + { + "cell_type": "code", + "execution_count": 54, + "id": "71db495c", + "metadata": {}, + "outputs": [], + "source": [ + "geoms.to_parquet(f\"{data_folder}/unit_test/tables/oa_{opt}.pq\")" + ] + }, + { + "cell_type": "markdown", + "id": "ca8ab6bb", + "metadata": {}, + "source": [ + "Loop over splits, get W, train, eval, save results." + ] + }, + { + "cell_type": "code", + "execution_count": 55, + "id": "30cd955e", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/martin/mambaforge/envs/demoland_v2/lib/python3.11/site-packages/libpysal/weights/util.py:1665: FutureWarning: The `query_bulk()` method is deprecated and will be removed in GeoPandas 1.0. You can use the `query()` method instead.\n", + " inp, res = gdf.sindex.query_bulk(gdf.geometry, predicate=predicate)\n", + "/Users/martin/mambaforge/envs/demoland_v2/lib/python3.11/site-packages/libpysal/weights/util.py:1665: FutureWarning: The `query_bulk()` method is deprecated and will be removed in GeoPandas 1.0. You can use the `query()` method instead.\n", + " inp, res = gdf.sindex.query_bulk(gdf.geometry, predicate=predicate)\n", + "/Users/martin/mambaforge/envs/demoland_v2/lib/python3.11/site-packages/libpysal/weights/weights.py:172: UserWarning: The weights matrix is not fully connected: \n", + " There are 3 disconnected components.\n", + " There is 1 island with id: 723.\n", + " warnings.warn(message)\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "('WARNING: ', 723, ' is an island (no neighbors)')\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/martin/mambaforge/envs/demoland_v2/lib/python3.11/site-packages/libpysal/weights/util.py:1665: FutureWarning: The `query_bulk()` method is deprecated and will be removed in GeoPandas 1.0. You can use the `query()` method instead.\n", + " inp, res = gdf.sindex.query_bulk(gdf.geometry, predicate=predicate)\n", + "/Users/martin/mambaforge/envs/demoland_v2/lib/python3.11/site-packages/libpysal/weights/util.py:1665: FutureWarning: The `query_bulk()` method is deprecated and will be removed in GeoPandas 1.0. You can use the `query()` method instead.\n", + " inp, res = gdf.sindex.query_bulk(gdf.geometry, predicate=predicate)\n", + "/Users/martin/mambaforge/envs/demoland_v2/lib/python3.11/site-packages/libpysal/weights/weights.py:172: UserWarning: The weights matrix is not fully connected: \n", + " There are 2 disconnected components.\n", + " warnings.warn(message)\n", + "/Users/martin/mambaforge/envs/demoland_v2/lib/python3.11/site-packages/libpysal/weights/util.py:1665: FutureWarning: The `query_bulk()` method is deprecated and will be removed in GeoPandas 1.0. You can use the `query()` method instead.\n", + " inp, res = gdf.sindex.query_bulk(gdf.geometry, predicate=predicate)\n", + "/Users/martin/mambaforge/envs/demoland_v2/lib/python3.11/site-packages/libpysal/weights/util.py:1665: FutureWarning: The `query_bulk()` method is deprecated and will be removed in GeoPandas 1.0. You can use the `query()` method instead.\n", + " inp, res = gdf.sindex.query_bulk(gdf.geometry, predicate=predicate)\n", + "/Users/martin/mambaforge/envs/demoland_v2/lib/python3.11/site-packages/libpysal/weights/util.py:1665: FutureWarning: The `query_bulk()` method is deprecated and will be removed in GeoPandas 1.0. You can use the `query()` method instead.\n", + " inp, res = gdf.sindex.query_bulk(gdf.geometry, predicate=predicate)\n", + "/Users/martin/mambaforge/envs/demoland_v2/lib/python3.11/site-packages/libpysal/weights/util.py:1665: FutureWarning: The `query_bulk()` method is deprecated and will be removed in GeoPandas 1.0. You can use the `query()` method instead.\n", + " inp, res = gdf.sindex.query_bulk(gdf.geometry, predicate=predicate)\n", + "/Users/martin/mambaforge/envs/demoland_v2/lib/python3.11/site-packages/libpysal/weights/weights.py:172: UserWarning: The weights matrix is not fully connected: \n", + " There are 2 disconnected components.\n", + " There is 1 island with id: 642.\n", + " warnings.warn(message)\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "('WARNING: ', 642, ' is an island (no neighbors)')\n" + ] + } + ], + "source": [ + "fit_and_eval(geoms, opt, \"oa\")" + ] + }, + { + "cell_type": "markdown", + "id": "87f149a5", + "metadata": {}, + "source": [ + "### ET cells\n", + "Create geometries" + ] + }, + { + "cell_type": "code", + "execution_count": 56, + "id": "1305ea0c", + "metadata": {}, + "outputs": [], + "source": [ + "meta[\"leeds\"][\"et\"] = {}" + ] + }, + { + "cell_type": "code", + "execution_count": 57, + "id": "d9b2a257", + "metadata": {}, + "outputs": [], + "source": [ + "geoms = data.set_index(\"hindex\").drop(columns=\n", + " [\n", + " \"buildings\",\n", + " \"nodeID\",\n", + " \"edgeID_keys\",\n", + " \"edgeID_values\",\n", + " \"edgeID_primary\",\n", + " \"sdbPer\",\n", + " \"ssbElo\",\n", + " \"stcOri\",\n", + " \"sdcLAL\",\n", + " \"mdcAre\",\n", + " \"ltcAre\",\n", + " \"ltcWRE\",\n", + " \"mtdMDi\",\n", + " \"lcdMes\",\n", + " \"lddNDe\",\n", + " \"sddAre\",\n", + " \"mdsAre\",\n", + " \"ldsAre\",\n", + " \"lisCel\",\n", + " \"ldePer\",\n", + " \"lseCWA\",\n", + " ]\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "ffb855b7", + "metadata": {}, + "source": [ + "Interpolate Air Quality" + ] + }, + { + "cell_type": "code", + "execution_count": 58, + "id": "b2dad042", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/martin/mambaforge/envs/demoland_v2/lib/python3.11/site-packages/shapely/set_operations.py:133: RuntimeWarning: invalid value encountered in intersection\n", + " return lib.intersection(a, b, **kwargs)\n" + ] + } + ], + "source": [ + "interp = tobler.area_weighted.area_interpolate(pollutants_aoi_df, geoms, intensive_variables=[\"aqi\"])\n", + "geoms[\"air_quality_index\"] = interp.aqi.values" + ] + }, + { + "cell_type": "markdown", + "id": "3d44ac2c", + "metadata": {}, + "source": [ + " Interpolate OA data" + ] + }, + { + "cell_type": "code", + "execution_count": 59, + "id": "1a06fef3", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/martin/mambaforge/envs/demoland_v2/lib/python3.11/site-packages/shapely/set_operations.py:133: RuntimeWarning: invalid value encountered in intersection\n", + " return lib.intersection(a, b, **kwargs)\n" + ] + } + ], + "source": [ + "interp_oa = tobler.area_weighted.area_interpolate(\n", + " pop_hp,\n", + " geoms,\n", + " intensive_variables=[\"priceper\"],\n", + " extensive_variables=[\"population\"],\n", + ")\n", + "geoms[\"house_price_index\"] = interp_oa.priceper.values\n", + "geoms[\"population\"] = interp_oa.population.values" + ] + }, + { + "cell_type": "markdown", + "id": "0d9a18a8", + "metadata": {}, + "source": [ + "Interpolate workplace population" + ] + }, + { + "cell_type": "code", + "execution_count": 60, + "id": "6efb0fc9", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/martin/mambaforge/envs/demoland_v2/lib/python3.11/site-packages/shapely/set_operations.py:133: RuntimeWarning: invalid value encountered in intersection\n", + " return lib.intersection(a, b, **kwargs)\n" + ] + } + ], + "source": [ + "wp_interpolated = tobler.area_weighted.area_interpolate(\n", + " wp_aoi,\n", + " geoms,\n", + " extensive_variables=[\n", + " \"A, B, D, E. Agriculture, energy and water\",\n", + " \"C. Manufacturing\",\n", + " \"F. Construction\",\n", + " \"G, I. Distribution, hotels and restaurants\",\n", + " \"H, J. Transport and communication\",\n", + " \"K, L, M, N. Financial, real estate, professional and administrative activities\",\n", + " \"O,P,Q. Public administration, education and health\",\n", + " \"R, S, T, U. Other\",\n", + " ],\n", + ")\n", + "\n", + "geoms[\n", + " [\n", + " \"A, B, D, E. Agriculture, energy and water\",\n", + " \"C. Manufacturing\",\n", + " \"F. Construction\",\n", + " \"G, I. Distribution, hotels and restaurants\",\n", + " \"H, J. Transport and communication\",\n", + " \"K, L, M, N. Financial, real estate, professional and administrative activities\",\n", + " \"O,P,Q. Public administration, education and health\",\n", + " \"R, S, T, U. Other\",\n", + " ]\n", + "] = wp_interpolated.drop(columns=\"geometry\").values" + ] + }, + { + "cell_type": "markdown", + "id": "294dbf9d", + "metadata": {}, + "source": [ + "Interpolate CORINE" + ] + }, + { + "cell_type": "code", + "execution_count": 61, + "id": "9e2d3836", + "metadata": {}, + "outputs": [], + "source": [ + "corine_interpolated = tobler.area_weighted.area_interpolate(\n", + " corine_aoi, geoms, categorical_variables=[\"Code_18\"]\n", + ")\n", + "corine_interpolated.columns = corine_interpolated.columns.map(corine_names)\n", + "interesting = [\n", + " \"Land cover [Discontinuous urban fabric]\",\n", + " \"Land cover [Continuous urban fabric]\",\n", + " \"Land cover [Non-irrigated arable land]\",\n", + " \"Land cover [Industrial or commercial units]\",\n", + " \"Land cover [Green urban areas]\",\n", + " \"Land cover [Pastures]\",\n", + " \"Land cover [Sport and leisure facilities]\",\n", + "]\n", + "geoms[interesting] = corine_interpolated[interesting].values" + ] + }, + { + "cell_type": "markdown", + "id": "01520610", + "metadata": {}, + "source": [ + "Get split IDs" + ] + }, + { + "cell_type": "code", + "execution_count": 62, + "id": "91127c4c", + "metadata": {}, + "outputs": [], + "source": [ + "geoms_ix, poll_ix = pollutants_aoi_df.sindex.query(geoms.centroid, predicate=\"within\", sort=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 63, + "id": "132ca054", + "metadata": {}, + "outputs": [], + "source": [ + "geoms.loc[geoms.index[geoms_ix], \"split\"] = pollutants_aoi_df[\"split\"].values[poll_ix]" + ] + }, + { + "cell_type": "markdown", + "id": "b20e26cd", + "metadata": {}, + "source": [ + "Save the table" + ] + }, + { + "cell_type": "code", + "execution_count": 64, + "id": "116ef151", + "metadata": {}, + "outputs": [], + "source": [ + "geoms.to_parquet(f\"{data_folder}/unit_test/tables/et_{opt}.pq\")" + ] + }, + { + "cell_type": "markdown", + "id": "b116988c", + "metadata": {}, + "source": [ + "Loop over splits, get W, train, eval, save results." + ] + }, + { + "cell_type": "code", + "execution_count": 66, + "id": "b5910cc6", + "metadata": {}, + "outputs": [], + "source": [ + "fit_and_eval(geoms, opt, \"et\")" + ] + }, + { + "cell_type": "code", + "execution_count": 65, + "id": "2c815b79", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'leeds': {'h3': {'air': {'loop_0': {'mse': 1.0215962668214376,\n", + " 'me': 0.7268292438504929,\n", + " 'r2': 0.6471124647540352,\n", + " 'moran_obs': 0.9139658896608336,\n", + " 'moran_pred': 0.8911782193252566},\n", + " 'loop_1': {'mse': 0.9052719580939292,\n", + " 'me': 0.7173481454198483,\n", + " 'r2': 0.7214704719652031,\n", + " 'moran_obs': 0.9382819221632059,\n", + " 'moran_pred': 0.9108587337675578},\n", + " 'loop_2': {'mse': 1.189713977353201,\n", + " 'me': 0.7906349848032027,\n", + " 'r2': 0.6480546399551688,\n", + " 'moran_obs': 0.9270422555656327,\n", + " 'moran_pred': 0.9186198786505408},\n", + " 'loop_3': {'mse': 0.9411979683024067,\n", + " 'me': 0.7241649062582691,\n", + " 'r2': 0.7489344824312717,\n", + " 'moran_obs': 0.9386609332461856,\n", + " 'moran_pred': 0.9026754574770531}},\n", + " 'hp': {'loop_0': {'mse': 45.97914682149453,\n", + " 'me': 6.607405179022031,\n", + " 'r2': -818.245515704516,\n", + " 'moran_obs': 0.7485176481450501,\n", + " 'moran_pred': 0.8911782193252566},\n", + " 'loop_1': {'mse': 48.6391071221501,\n", + " 'me': 6.802131639333218,\n", + " 'r2': -784.5792801815664,\n", + " 'moran_obs': 0.7380520767953878,\n", + " 'moran_pred': 0.9108587337675578},\n", + " 'loop_2': {'mse': 48.506456511887656,\n", + " 'me': 6.763358828565924,\n", + " 'r2': -899.4290133356027,\n", + " 'moran_obs': 0.7517514698021434,\n", + " 'moran_pred': 0.9186198786505408},\n", + " 'loop_3': {'mse': 49.32745556540463,\n", + " 'me': 6.814134839974605,\n", + " 'r2': -890.4742353349244,\n", + " 'moran_obs': 0.7737351301281735,\n", + " 'moran_pred': 0.9026754574770531}}},\n", + " 'square': {'air': {'loop_0': {'mse': 1.0420599773442174,\n", + " 'me': 0.7380996956699067,\n", + " 'r2': 0.646438913789853,\n", + " 'moran_obs': 0.9111140022891392,\n", + " 'moran_pred': 0.9027042486979502},\n", + " 'loop_1': {'mse': 0.9235654806637308,\n", + " 'me': 0.7061784534530474,\n", + " 'r2': 0.7193292165390978,\n", + " 'moran_obs': 0.9368653661430093,\n", + " 'moran_pred': 0.922064675591301},\n", + " 'loop_2': {'mse': 1.13918100099561,\n", + " 'me': 0.7919036088455168,\n", + " 'r2': 0.666932509493775,\n", + " 'moran_obs': 0.9239352070372084,\n", + " 'moran_pred': 0.9195815458495651},\n", + " 'loop_3': {'mse': 1.0725950060473017,\n", + " 'me': 0.7528672498496167,\n", + " 'r2': 0.7103765505241364,\n", + " 'moran_obs': 0.9349875493670573,\n", + " 'moran_pred': 0.8896485193646833}},\n", + " 'hp': {'loop_0': {'mse': 46.3854910205935,\n", + " 'me': 6.638112942385972,\n", + " 'r2': -800.8419912402562,\n", + " 'moran_obs': 0.7339407376469802,\n", + " 'moran_pred': 0.9027042486979502},\n", + " 'loop_1': {'mse': 48.173346137871945,\n", + " 'me': 6.765422047470163,\n", + " 'r2': -740.0298818381401,\n", + " 'moran_obs': 0.7344662884119277,\n", + " 'moran_pred': 0.922064675591301},\n", + " 'loop_2': {'mse': 48.43479838576467,\n", + " 'me': 6.763522302146255,\n", + " 'r2': -882.9217979601364,\n", + " 'moran_obs': 0.7324712040592452,\n", + " 'moran_pred': 0.9195815458495651},\n", + " 'loop_3': {'mse': 48.29887582791923,\n", + " 'me': 6.7551295495177355,\n", + " 'r2': -862.8450456628527,\n", + " 'moran_obs': 0.7676786336335174,\n", + " 'moran_pred': 0.8896485193646833}}},\n", + " 'oa': {'air': {'loop_0': {'mse': 1.6406859973050265,\n", + " 'me': 0.9764654396881765,\n", + " 'r2': 0.49909764936145573,\n", + " 'moran_obs': 0.7427709726203485,\n", + " 'moran_pred': 0.6117445375339965},\n", + " 'loop_1': {'mse': 2.094601889477692,\n", + " 'me': 1.1442133479038568,\n", + " 'r2': 0.547215274825586,\n", + " 'moran_obs': 0.8332359549961861,\n", + " 'moran_pred': 0.7032684724122078},\n", + " 'loop_2': {'mse': 1.457058801252724,\n", + " 'me': 0.9571080156615993,\n", + " 'r2': 0.6991834779622943,\n", + " 'moran_obs': 0.8143144952561058,\n", + " 'moran_pred': 0.7847805519464338},\n", + " 'loop_3': {'mse': 2.2501100857496943,\n", + " 'me': 1.1465556558100203,\n", + " 'r2': 0.45962392436854094,\n", + " 'moran_obs': 0.7961674218568463,\n", + " 'moran_pred': 0.7035988962302134}},\n", + " 'hp': {'loop_0': {'mse': 65.02165151682122,\n", + " 'me': 7.944818822928156,\n", + " 'r2': -418.9323190744034,\n", + " 'moran_obs': 0.5105048506497427,\n", + " 'moran_pred': 0.6117445375339965},\n", + " 'loop_1': {'mse': 72.36258290839797,\n", + " 'me': 8.314634843264253,\n", + " 'r2': -410.4138021886665,\n", + " 'moran_obs': 0.55139511347767,\n", + " 'moran_pred': 0.7032684724122078},\n", + " 'loop_2': {'mse': 80.35702500510841,\n", + " 'me': 8.681044693045214,\n", + " 'r2': -498.1986018959034,\n", + " 'moran_obs': 0.5452515217928825,\n", + " 'moran_pred': 0.7847805519464338},\n", + " 'loop_3': {'mse': 70.79038550161015,\n", + " 'me': 8.214779601265105,\n", + " 'r2': -416.93928418561677,\n", + " 'moran_obs': 0.5588835550875523,\n", + " 'moran_pred': 0.7035988962302134}}},\n", + " 'et': {}}}" + ] + }, + "execution_count": 65, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "meta" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0af66730", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.3" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}