opendp · mccalluc · Oct 7, 2024 · Oct 7, 2024 · Oct 7, 2024 · Oct 7, 2024
diff --git a/demo-outputs.ipynb b/demo-outputs.ipynb
@@ -0,0 +1,364 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "This notebook will not run in isolation. I've cut the last cells out of the [demo.ipynb notebook](https://github.com/opendp/dp-creator-ii/pull/48), and pasted them here, so the question of output formats can be considered separately."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 20,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "{'inputs': {'csv_path': '/tmp/demo.csv',\n",
+       "  'contributions': 10,\n",
+       "  'epsilon': 2,\n",
+       "  'weights': [4, 4, 1, 1],\n",
+       "  'max_possible_rows': 1000000,\n",
+       "  'delta': 1e-07,\n",
+       "  'grade': {'min': 50, 'max': 100, 'bins_count': 10},\n",
+       "  'class_year': {'min': 1, 'max': 4, 'bins_count': 4}},\n",
+       " 'outputs': {'grade': {'mean': 84.25140291806959,\n",
+       "   'histogram': {'(55, 60]': 24,\n",
+       "    '(60, 65]': 0,\n",
+       "    '(65, 70]': 28,\n",
+       "    '(70, 75]': 181,\n",
+       "    '(75, 80]': 227,\n",
+       "    '(80, 85]': 248,\n",
+       "    '(85, 90]': 204,\n",
+       "    '(90, 95]': 110,\n",
+       "    '(95, inf]': 0}},\n",
+       "  'class_year': {'mean': 1.8125701459034793,\n",
+       "   'histogram': {'(-inf, 1]': 420,\n",
+       "    '(1, 2]': 311,\n",
+       "    '(2, 3]': 80,\n",
+       "    '(3, inf]': 47}}}}"
+      ]
+     },
+     "execution_count": 20,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "release = {\n",
+    "    'inputs': {\n",
+    "        'csv_path': csv_path,\n",
+    "        'contributions': contributions,\n",
+    "        'epsilon': epsilon,\n",
+    "        'weights': weights,\n",
+    "        'max_possible_rows': max_possible_rows,\n",
+    "        'delta': delta,\n",
+    "        'grade': {\n",
+    "            'min': grade_min,\n",
+    "            'max': grade_max,\n",
+    "            'bins_count': grade_bins_count,\n",
+    "        },\n",
+    "        'class_year': {\n",
+    "            'min': class_year_min,\n",
+    "            'max': class_year_max,\n",
+    "            'bins_count': class_year_bins_count,\n",
+    "        }    \n",
+    "    },\n",
+    "    'outputs': {\n",
+    "        'grade': {\n",
+    "            'mean': grade_mean.item(),\n",
+    "            'histogram': {v['grade_bin']: v['len'] for v in grade_histogram.to_dicts()}\n",
+    "        },\n",
+    "        'class_year': {\n",
+    "            'mean': class_year_mean.item(),\n",
+    "            'histogram': {v['class_year_bin']: v['len'] for v in class_year_histogram.to_dicts()}\n",
+    "        },\n",
+    "    }\n",
+    "}\n",
+    "release"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Text export?\n",
+    "\n",
+    "Just use YAML, unless there are other requirements?"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 21,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "inputs:\n",
+      "  class_year:\n",
+      "    bins_count: 4\n",
+      "    max: 4\n",
+      "    min: 1\n",
+      "  contributions: 10\n",
+      "  csv_path: /tmp/demo.csv\n",
+      "  delta: 1.0e-07\n",
+      "  epsilon: 2\n",
+      "  grade:\n",
+      "    bins_count: 10\n",
+      "    max: 100\n",
+      "    min: 50\n",
+      "  max_possible_rows: 1000000\n",
+      "  weights:\n",
+      "  - 4\n",
+      "  - 4\n",
+      "  - 1\n",
+      "  - 1\n",
+      "outputs:\n",
+      "  class_year:\n",
+      "    histogram:\n",
+      "      (-inf, 1]: 420\n",
+      "      (1, 2]: 311\n",
+      "      (2, 3]: 80\n",
+      "      (3, inf]: 47\n",
+      "    mean: 1.8125701459034793\n",
+      "  grade:\n",
+      "    histogram:\n",
+      "      (55, 60]: 24\n",
+      "      (60, 65]: 0\n",
+      "      (65, 70]: 28\n",
+      "      (70, 75]: 181\n",
+      "      (75, 80]: 227\n",
+      "      (80, 85]: 248\n",
+      "      (85, 90]: 204\n",
+      "      (90, 95]: 110\n",
+      "      (95, inf]: 0\n",
+      "    mean: 84.25140291806959\n",
+      "\n"
+     ]
+    }
+   ],
+   "source": [
+    "import yaml\n",
+    "\n",
+    "print(yaml.dump(release))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### CSV export?\n",
+    "\n",
+    "Flatten the data structure to key-value pairs and make a two-column CSV, unless there are other requirements?"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 22,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>0</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>inputs.csv_path</th>\n",
+       "      <td>/tmp/demo.csv</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>inputs.contributions</th>\n",
+       "      <td>10</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>inputs.epsilon</th>\n",
+       "      <td>2</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>inputs.weights</th>\n",
+       "      <td>[4, 4, 1, 1]</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>inputs.max_possible_rows</th>\n",
+       "      <td>1000000</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>inputs.delta</th>\n",
+       "      <td>0.0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>inputs.grade.min</th>\n",
+       "      <td>50</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>inputs.grade.max</th>\n",
+       "      <td>100</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>inputs.grade.bins_count</th>\n",
+       "      <td>10</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>inputs.class_year.min</th>\n",
+       "      <td>1</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>inputs.class_year.max</th>\n",
+       "      <td>4</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>inputs.class_year.bins_count</th>\n",
+       "      <td>4</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>outputs.grade.mean</th>\n",
+       "      <td>84.251403</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>outputs.grade.histogram.(55, 60]</th>\n",
+       "      <td>24</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>outputs.grade.histogram.(60, 65]</th>\n",
+       "      <td>0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>outputs.grade.histogram.(65, 70]</th>\n",
+       "      <td>28</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>outputs.grade.histogram.(70, 75]</th>\n",
+       "      <td>181</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>outputs.grade.histogram.(75, 80]</th>\n",
+       "      <td>227</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>outputs.grade.histogram.(80, 85]</th>\n",
+       "      <td>248</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>outputs.grade.histogram.(85, 90]</th>\n",
+       "      <td>204</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>outputs.grade.histogram.(90, 95]</th>\n",
+       "      <td>110</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>outputs.grade.histogram.(95, inf]</th>\n",
+       "      <td>0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>outputs.class_year.mean</th>\n",
+       "      <td>1.81257</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>outputs.class_year.histogram.(-inf, 1]</th>\n",
+       "      <td>420</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>outputs.class_year.histogram.(1, 2]</th>\n",
+       "      <td>311</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>outputs.class_year.histogram.(2, 3]</th>\n",
+       "      <td>80</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>outputs.class_year.histogram.(3, inf]</th>\n",
+       "      <td>47</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "                                                    0\n",
+       "inputs.csv_path                         /tmp/demo.csv\n",
+       "inputs.contributions                               10\n",
+       "inputs.epsilon                                      2\n",
+       "inputs.weights                           [4, 4, 1, 1]\n",
+       "inputs.max_possible_rows                      1000000\n",
+       "inputs.delta                                      0.0\n",
+       "inputs.grade.min                                   50\n",
+       "inputs.grade.max                                  100\n",
+       "inputs.grade.bins_count                            10\n",
+       "inputs.class_year.min                               1\n",
+       "inputs.class_year.max                               4\n",
+       "inputs.class_year.bins_count                        4\n",
+       "outputs.grade.mean                          84.251403\n",
+       "outputs.grade.histogram.(55, 60]                   24\n",
+       "outputs.grade.histogram.(60, 65]                    0\n",
+       "outputs.grade.histogram.(65, 70]                   28\n",
+       "outputs.grade.histogram.(70, 75]                  181\n",
+       "outputs.grade.histogram.(75, 80]                  227\n",
+       "outputs.grade.histogram.(80, 85]                  248\n",
+       "outputs.grade.histogram.(85, 90]                  204\n",
+       "outputs.grade.histogram.(90, 95]                  110\n",
+       "outputs.grade.histogram.(95, inf]                   0\n",
+       "outputs.class_year.mean                       1.81257\n",
+       "outputs.class_year.histogram.(-inf, 1]            420\n",
+       "outputs.class_year.histogram.(1, 2]               311\n",
+       "outputs.class_year.histogram.(2, 3]                80\n",
+       "outputs.class_year.histogram.(3, inf]              47"
+      ]
+     },
+     "execution_count": 22,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "from pandas import json_normalize\n",
+    "\n",
+    "json_normalize(release).transpose()"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": ".venv",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.12.3"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}