diff --git a/examples/2011-census-microdata/Example_Analysis.ipynb b/examples/2011-census-microdata/Example_Analysis.ipynb
new file mode 100644
index 0000000..b510d71
--- /dev/null
+++ b/examples/2011-census-microdata/Example_Analysis.ipynb
@@ -0,0 +1,1139 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "id": "native-desktop",
+ "metadata": {},
+ "source": [
+ "## Example classification task"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "focused-november",
+ "metadata": {},
+ "source": [
+ "This is a (work-in-progress) example of assessing synthetic data by comparing the performance of classifiers.\n",
+ "\n",
+ "For a given method, two classifiers are trained: one each on the original and synthetic data. These are both tested using a hold-out set from the *original* dataset. Similar performance is the desired outcome.\n",
+ "\n",
+ "This can be repeated for other classification methods and tasks.\n",
+ "\n",
+ "An alternative assessment is to compare the ranked performance of various methods (ignoring their absolute performance)."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "colored-clerk",
+ "metadata": {},
+ "source": [
+ "### Imports and setup"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "related-sharing",
+ "metadata": {},
+ "source": [
+ "The output from the `2011-census-test-*` examples must be present in the usual location (`../../synth-output/`, relative to this notebook)."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "id": "spare-joyce",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from collections import namedtuple\n",
+ "import numpy as np\n",
+ "import pandas as pd\n",
+ "import seaborn as sns\n",
+ "import matplotlib.ticker as ticker\n",
+ "import matplotlib.pyplot as plt\n",
+ "\n",
+ "import sklearn\n",
+ "from sklearn.linear_model import LogisticRegression\n",
+ "from sklearn.ensemble import RandomForestClassifier\n",
+ "from sklearn.model_selection import train_test_split\n",
+ "import sklearn.metrics as metrics\n",
+ "from sklearn.metrics import precision_score, recall_score, roc_auc_score, classification_report"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "surrounded-interval",
+ "metadata": {},
+ "source": [
+ "### Task"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "published-diesel",
+ "metadata": {},
+ "source": [
+ "Predict an individual's marital status from their other demographics.\n",
+ "\n",
+ "Simplification: Predict whether an individual is single."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "id": "juvenile-eight",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Predictor columns used by the classification task\n",
+ "features = [\n",
+ "    \"Sex\",\n",
+ "    \"Age\",\n",
+ "    \"Country of Birth\",\n",
+ "    \"Health\",\n",
+ "    \"Ethnic Group\",\n",
+ "    \"Religion\",\n",
+ "    \"Approximated Social Grade\",\n",
+ "    \"Industry\",\n",
+ "]"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "twenty-nomination",
+ "metadata": {},
+ "source": [
+ "### Load data"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "id": "treated-cancer",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "orig = pd.read_csv(\"../../datasets/2011-census-microdata/2011-census-microdata.csv\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "id": "indie-korea",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# synthetic datasets ('rlsd' = released), keyed by a short label for each synthesis run\n",
+ "rlsd = {\n",
+ "    \"synthpop 3\": pd.read_csv(\"../../synth-output/2011-census-test-3-synthpop/synthetic_data_1.csv\"),\n",
+ "    \"synthpop 5\": pd.read_csv(\"../../synth-output/2011-census-test-5-synthpop-cart/synthetic_data_1.csv\"),\n",
+ "    \"synthpop 10\": pd.read_csv(\"../../synth-output/2011-census-test-10-synthpop-cart-proper/synthetic_data_1.csv\"),\n",
+ "}"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "id": "rapid-cradle",
+ "metadata": {
+ "scrolled": false
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Person ID | \n",
+ " Region | \n",
+ " Residence Type | \n",
+ " Family Composition | \n",
+ " Population Base | \n",
+ " Sex | \n",
+ " Age | \n",
+ " Marital Status | \n",
+ " Student | \n",
+ " Country of Birth | \n",
+ " Health | \n",
+ " Ethnic Group | \n",
+ " Religion | \n",
+ " Economic Activity | \n",
+ " Occupation | \n",
+ " Industry | \n",
+ " Hours worked per week | \n",
+ " Approximated Social Grade | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 7394816 | \n",
+ " E12000001 | \n",
+ " H | \n",
+ " 2 | \n",
+ " 1 | \n",
+ " 2 | \n",
+ " 6 | \n",
+ " 2 | \n",
+ " 2 | \n",
+ " 1 | \n",
+ " 2 | \n",
+ " 1 | \n",
+ " 2 | \n",
+ " 5 | \n",
+ " 8 | \n",
+ " 2 | \n",
+ " -9 | \n",
+ " 4 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 7394745 | \n",
+ " E12000001 | \n",
+ " H | \n",
+ " 5 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 4 | \n",
+ " 1 | \n",
+ " 2 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 2 | \n",
+ " 1 | \n",
+ " 8 | \n",
+ " 6 | \n",
+ " 4 | \n",
+ " 3 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 7395066 | \n",
+ " E12000001 | \n",
+ " H | \n",
+ " 3 | \n",
+ " 1 | \n",
+ " 2 | \n",
+ " 4 | \n",
+ " 1 | \n",
+ " 2 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 6 | \n",
+ " 11 | \n",
+ " 3 | \n",
+ " 4 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 7395329 | \n",
+ " E12000001 | \n",
+ " H | \n",
+ " 3 | \n",
+ " 1 | \n",
+ " 2 | \n",
+ " 2 | \n",
+ " 1 | \n",
+ " 2 | \n",
+ " 1 | \n",
+ " 2 | \n",
+ " 1 | \n",
+ " 2 | \n",
+ " 1 | \n",
+ " 7 | \n",
+ " 7 | \n",
+ " 3 | \n",
+ " 2 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 7394712 | \n",
+ " E12000001 | \n",
+ " H | \n",
+ " 3 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 5 | \n",
+ " 4 | \n",
+ " 2 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 2 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 4 | \n",
+ " 3 | \n",
+ " 2 | \n",
+ "
\n",
+ " \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ "
\n",
+ " \n",
+ " 569736 | \n",
+ " 7946020 | \n",
+ " W92000004 | \n",
+ " H | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 5 | \n",
+ " 1 | \n",
+ " 2 | \n",
+ " 1 | \n",
+ " 4 | \n",
+ " 1 | \n",
+ " 9 | \n",
+ " 1 | \n",
+ " 8 | \n",
+ " 8 | \n",
+ " 3 | \n",
+ " 3 | \n",
+ "
\n",
+ " \n",
+ " 569737 | \n",
+ " 7944310 | \n",
+ " W92000004 | \n",
+ " H | \n",
+ " 3 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 3 | \n",
+ " 1 | \n",
+ " 2 | \n",
+ " 1 | \n",
+ " 2 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 7 | \n",
+ " 4 | \n",
+ " 3 | \n",
+ " 4 | \n",
+ "
\n",
+ " \n",
+ " 569738 | \n",
+ " 7945374 | \n",
+ " W92000004 | \n",
+ " H | \n",
+ " 3 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 2 | \n",
+ " -9 | \n",
+ " -9 | \n",
+ " -9 | \n",
+ " -9 | \n",
+ " -9 | \n",
+ "
\n",
+ " \n",
+ " 569739 | \n",
+ " 7944768 | \n",
+ " W92000004 | \n",
+ " H | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 2 | \n",
+ " 8 | \n",
+ " 5 | \n",
+ " 2 | \n",
+ " 1 | \n",
+ " 3 | \n",
+ " 1 | \n",
+ " 9 | \n",
+ " 5 | \n",
+ " 9 | \n",
+ " 2 | \n",
+ " -9 | \n",
+ " 4 | \n",
+ "
\n",
+ " \n",
+ " 569740 | \n",
+ " 7944959 | \n",
+ " W92000004 | \n",
+ " H | \n",
+ " 2 | \n",
+ " 1 | \n",
+ " 2 | \n",
+ " 2 | \n",
+ " 2 | \n",
+ " 2 | \n",
+ " 1 | \n",
+ " 2 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 7 | \n",
+ " 4 | \n",
+ " 1 | \n",
+ " 4 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
569741 rows × 18 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Person ID Region Residence Type Family Composition \\\n",
+ "0 7394816 E12000001 H 2 \n",
+ "1 7394745 E12000001 H 5 \n",
+ "2 7395066 E12000001 H 3 \n",
+ "3 7395329 E12000001 H 3 \n",
+ "4 7394712 E12000001 H 3 \n",
+ "... ... ... ... ... \n",
+ "569736 7946020 W92000004 H 1 \n",
+ "569737 7944310 W92000004 H 3 \n",
+ "569738 7945374 W92000004 H 3 \n",
+ "569739 7944768 W92000004 H 1 \n",
+ "569740 7944959 W92000004 H 2 \n",
+ "\n",
+ " Population Base Sex Age Marital Status Student Country of Birth \\\n",
+ "0 1 2 6 2 2 1 \n",
+ "1 1 1 4 1 2 1 \n",
+ "2 1 2 4 1 2 1 \n",
+ "3 1 2 2 1 2 1 \n",
+ "4 1 1 5 4 2 1 \n",
+ "... ... ... ... ... ... ... \n",
+ "569736 1 1 5 1 2 1 \n",
+ "569737 1 1 3 1 2 1 \n",
+ "569738 1 1 1 1 1 1 \n",
+ "569739 1 2 8 5 2 1 \n",
+ "569740 1 2 2 2 2 1 \n",
+ "\n",
+ " Health Ethnic Group Religion Economic Activity Occupation \\\n",
+ "0 2 1 2 5 8 \n",
+ "1 1 1 2 1 8 \n",
+ "2 1 1 1 1 6 \n",
+ "3 2 1 2 1 7 \n",
+ "4 1 1 2 1 1 \n",
+ "... ... ... ... ... ... \n",
+ "569736 4 1 9 1 8 \n",
+ "569737 2 1 1 1 7 \n",
+ "569738 1 1 2 -9 -9 \n",
+ "569739 3 1 9 5 9 \n",
+ "569740 2 1 1 1 7 \n",
+ "\n",
+ " Industry Hours worked per week Approximated Social Grade \n",
+ "0 2 -9 4 \n",
+ "1 6 4 3 \n",
+ "2 11 3 4 \n",
+ "3 7 3 2 \n",
+ "4 4 3 2 \n",
+ "... ... ... ... \n",
+ "569736 8 3 3 \n",
+ "569737 4 3 4 \n",
+ "569738 -9 -9 -9 \n",
+ "569739 2 -9 4 \n",
+ "569740 4 1 4 \n",
+ "\n",
+ "[569741 rows x 18 columns]"
+ ]
+ },
+ "execution_count": 5,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "orig"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "royal-southeast",
+ "metadata": {},
+ "source": [
+ "### Basic quality checks"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "id": "dying-louisiana",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# features to visualize\n",
+ "features_subset = [\"Age\", \"Marital Status\", \"Religion\", \"Occupation\"]\n",
+ "\n",
+ "n_plot_sample = 200000\n",
+ "\n",
+ "# Kludge: Some visualizations are more straightforward with continuous data, so convert the factors to float\n",
+ "\n",
+ "def recode(df):\n",
+ "    \"\"\"Convert integer-coded factor values to zero-based floats, element by element.\n",
+ "\n",
+ "    Every value x becomes float(x) - 1. The code -9 and missing values (NaN/None)\n",
+ "    become NaN.\n",
+ "\n",
+ "    Parameters\n",
+ "    ----------\n",
+ "    df : pandas.DataFrame\n",
+ "        Frame whose cells are all numeric codes (or missing).\n",
+ "\n",
+ "    Returns\n",
+ "    -------\n",
+ "    pandas.DataFrame\n",
+ "        New frame of floats with the same index and columns; df is not modified.\n",
+ "    \"\"\"\n",
+ "    def _recode_value(x):\n",
+ "        # int(NaN) raises, so test for missing values before the sentinel check\n",
+ "        if pd.isna(x) or int(x) == -9:\n",
+ "            return np.nan  # np.NAN was removed in NumPy 2.0\n",
+ "        return float(x) - 1\n",
+ "\n",
+ "    # DataFrame.applymap is deprecated in favour of DataFrame.map (pandas >= 2.1);\n",
+ "    # check the class, not the instance, so a column named 'map' cannot shadow the method\n",
+ "    elementwise = df.map if hasattr(pd.DataFrame, \"map\") else df.applymap\n",
+ "    return elementwise(_recode_value)\n",
+ "\n",
+ "\n",
+ "\n",
+ "def format_axis_ticks(grid):\n",
+ "    \"\"\"Label the x and y ticks of every axes in grid with int(tick value + 1).\n",
+ "\n",
+ "    The +1 presumably undoes the -1 shift applied by recode, so that the labels show\n",
+ "    the original factor codes (confirm against the plotting cells).\n",
+ "\n",
+ "    A FuncFormatter is installed on each axis instead of calling set_xticklabels /\n",
+ "    set_yticklabels on automatically located ticks. Fixed labels on an automatic\n",
+ "    locator make matplotlib warn that FixedFormatter should only be used together\n",
+ "    with FixedLocator, and the labels can end up at the wrong positions if the ticks\n",
+ "    are recomputed later.\n",
+ "\n",
+ "    Parameters\n",
+ "    ----------\n",
+ "    grid : object whose axes.flat iterates over matplotlib axes; None entries are skipped.\n",
+ "\n",
+ "    Returns\n",
+ "    -------\n",
+ "    The same grid object, to allow chaining.\n",
+ "    \"\"\"\n",
+ "    def _plus_one_label(value, _pos):\n",
+ "        return str(int(value + 1))\n",
+ "\n",
+ "    for ax in grid.axes.flat:\n",
+ "        if ax is not None:\n",
+ "            # one formatter instance per axis: a formatter is bound to the axis it is set on\n",
+ "            ax.xaxis.set_major_formatter(ticker.FuncFormatter(_plus_one_label))\n",
+ "            ax.yaxis.set_major_formatter(ticker.FuncFormatter(_plus_one_label))\n",
+ "\n",
+ "    return grid\n",
+ "\n",
+ "def jitter(x):\n",
+ "    \"\"\"Return x plus Gaussian noise with standard deviation 0.3.\n",
+ "\n",
+ "    Noise is drawn independently for every element of x (scalar, array, Series or\n",
+ "    DataFrame). Drawing a single value would shift a whole array by the same offset\n",
+ "    instead of jittering it. For a scalar x exactly one value is drawn.\n",
+ "\n",
+ "    Uses NumPy's global random state, so seed it with np.random.seed for reproducible plots.\n",
+ "    \"\"\"\n",
+ "    # np.shape(x) is () for scalars, so randn() still returns a single float in that case\n",
+ "    return x + 0.3 * np.random.randn(*np.shape(x))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "id": "abstract-charlotte",
+ "metadata": {
+ "scrolled": false
+ },
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "/var/folders/4t/76rq38xx7pn94vfcd3n5l2m0l8wv2w/T/ipykernel_10396/2062598768.py:25: UserWarning: FixedFormatter should only be used together with FixedLocator\n",
+ " ax.set_yticklabels(yticks_new)\n"
+ ]
+ },
+ {
+ "data": {
+ "text/plain": [
+ ""
+ ]
+ },
+ "execution_count": 7,
+ "metadata": {},
+ "output_type": "execute_result"
+ },
+ {
+ "data": {
+ "image/png": "\n",
+ "text/plain": [
+ "