Make notebook with plots for columns (#152)

* lower and upper more consistently * one more * handle bounds/bins/counts the same way * lots of reactive dicts, but the UI has not changed * data dump on the results page * add a pragma: no cover * reset widget values after checkbox change * do not clean up values * put tooltips in labels * pull warning up to analysis panel. TODO: conditional * move warning to bottom of list * analysis definition JSON * stubs for python * stub a script on results page * include column info in generated script * closer to a runnable notebook * stuck on split_by_weight... maybe a library bug? * margin stubs * format python identifiers correctly * script has gotten longer: does not make sense to check for exact equality * fix syntactic problems in generated code * fill in columns, but still WIP * fix column names; tests pass * move confidence * simplify download panel * add markdown cells * tidy up * fix copy-paste of util functions * sort the intervals
opendp · Nov 18, 2024 · 3b89001 · 3b89001
1 parent 4f5b03d
commit 3b89001
Show file tree

Hide file tree

Showing 16 changed files with 290 additions and 163 deletions.
diff --git a/WHAT-WE-LEARNED.md b/WHAT-WE-LEARNED.md
@@ -46,6 +46,8 @@ but that returns an error:
 Renderer.__call__() missing 1 required positional argument: '_fn'
 ```
 
+If I just refer to a reactive calc directly in the UI, no error appears in the log; the UI just shows a spinner.
+
 ## No component testing
 
 It feels like a gap in the library that there is no component testing. The only advice is to pull out testable logic from the server functions, and for the rest, use end-to-end tests: There's not a recommended way to test the ui+server interaction for just one component.

diff --git a/dp_wizard/app/components/column_module.py b/dp_wizard/app/components/column_module.py
@@ -3,7 +3,7 @@
 from shiny import ui, render, module, reactive
 
 from dp_wizard.utils.dp_helper import make_confidence_accuracy_histogram
-from dp_wizard.app.components.plots import plot_histogram
+from dp_wizard.utils.shared import plot_histogram
 from dp_wizard.utils.templates import make_column_config_block
 from dp_wizard.app.components.outputs import output_code_sample, demo_tooltip
 

diff --git a/dp_wizard/app/components/plots.py b/dp_wizard/app/components/plots.py
diff --git a/dp_wizard/app/results_panel.py b/dp_wizard/app/results_panel.py
@@ -1,6 +1,6 @@
 from json import dumps
 
-from shiny import ui, render
+from shiny import ui, render, reactive
 
 from dp_wizard.utils.templates import make_notebook_py, make_script_py
 from dp_wizard.utils.converters import convert_py_to_nb
@@ -9,24 +9,13 @@
 def results_ui():
     return ui.nav_panel(
         "Download results",
-        ui.p("TODO: Use this information to fill in a template!"),
-        ui.output_code("data_dump"),
-        ui.markdown(
-            "You can now make a differentially private release of your data. "
-            "This will lock the configuration you’ve provided on the previous pages."
-        ),
-        ui.markdown("TODO: Button: “Download Report (.txt)” (implemented as yaml?)"),
-        ui.markdown("TODO: Button: “Download Report (.csv)"),
-        ui.markdown(
-            "You can also download code that can be executed to produce a DP release. "
-            "Downloaded code does not lock the configuration."
-        ),
+        ui.markdown("You can now make a differentially private release of your data."),
         ui.download_button(
             "download_script",
             "Download Script (.py)",
         ),
         ui.download_button(
-            "download_notebook_unexecuted",
+            "download_notebook",
             "Download Notebook (.ipynb)",
         ),
         value="results_panel",
@@ -45,61 +34,78 @@ def results_server(
     weights,
     epsilon,
 ):  # pragma: no cover
-    @render.code
-    def data_dump():
-        # TODO: Use this information in a template!
+    @reactive.calc
+    def analysis_dict():
+        # Only weights().keys() reflects the currently desired columns.
+        # The other reactive dicts retain inactive columns, so user
+        # inputs aren't lost when toggling checkboxes.
+        columns = {
+            col: {
+                "lower_bound": lower_bounds()[col],
+                "upper_bound": upper_bounds()[col],
+                "bin_count": int(bin_counts()[col]),
+                # TODO: Floats should work for weight, but they don't:
+                # https://github.com/opendp/opendp/issues/2140
+                "weight": int(weights()[col]),
+            }
+            for col in weights().keys()
+        }
+        return {
+            "csv_path": csv_path(),
+            "contributions": contributions(),
+            "epsilon": epsilon(),
+            "columns": columns,
+        }
+
+    @reactive.calc
+    def analysis_json():
         return dumps(
-            {
-                "csv_path": csv_path(),
-                "contributions": contributions(),
-                "lower_bounds": lower_bounds(),
-                "upper_bounds": upper_bounds(),
-                "bin_counts": bin_counts(),
-                "weights": weights(),
-                "epsilon": epsilon(),
-            },
+            analysis_dict(),
             indent=2,
         )
 
+    @render.text
+    def analysis_json_text():
+        return analysis_json()
+
+    @reactive.calc
+    def analysis_python():
+        analysis = analysis_dict()
+        return make_notebook_py(
+            csv_path=analysis["csv_path"],
+            contributions=analysis["contributions"],
+            epsilon=analysis["epsilon"],
+            columns=analysis["columns"],
+        )
+
+    @render.text
+    def analysis_python_text():
+        return analysis_python()
+
     @render.download(
         filename="dp-wizard-script.py",
         media_type="text/x-python",
     )
     async def download_script():
-        contributions = input.contributions()
+        analysis = analysis_dict()
         script_py = make_script_py(
-            contributions=contributions,
-            epsilon=1,
-            weights=[1],
+            contributions=analysis["contributions"],
+            epsilon=analysis["epsilon"],
+            columns=analysis["columns"],
         )
         yield script_py
 
     @render.download(
         filename="dp-wizard-notebook.ipynb",
         media_type="application/x-ipynb+json",
     )
-    async def download_notebook_unexecuted():
-        contributions = input.contributions()
-        notebook_py = make_notebook_py(
-            csv_path="todo.csv",
-            contributions=contributions,
-            epsilon=1,
-            weights=[1],
-        )
-        notebook_nb = convert_py_to_nb(notebook_py)
-        yield notebook_nb
-
-    @render.download(
-        filename="dp-wizard-notebook-executed.ipynb",
-        media_type="application/x-ipynb+json",
-    )
-    async def download_notebook_executed():
-        contributions = input.contributions()
+    async def download_notebook():
+        analysis = analysis_dict()
         notebook_py = make_notebook_py(
-            csv_path="todo.csv",
-            contributions=contributions,
-            epsilon=1,
-            weights=[1],
+            csv_path=analysis["csv_path"],
+            contributions=analysis["contributions"],
+            epsilon=analysis["epsilon"],
+            columns=analysis["columns"],
         )
         notebook_nb = convert_py_to_nb(notebook_py, execute=True)
         yield notebook_nb
diff --git a/dp_wizard/utils/csv_helper.py b/dp_wizard/utils/csv_helper.py
@@ -2,9 +2,11 @@
 We'll use the following terms consistently throughout the application:
 - name: This is the exact column header in the CSV.
 - label: This is the string we'll display.
-- id: This is the string we'll pass as a module ID.
+- id: This is the opaque string we'll pass as a module ID.
+- identifier: This is a form that can be used as a Python identifier.
 """
 
+import re
 import polars as pl
 
 
@@ -34,3 +36,7 @@ def name_to_id(name):
     # Shiny is fussy about module IDs,
     # but we don't need them to be human readable.
     return str(hash(name)).replace("-", "_")
+
+
+def name_to_identifier(name):
+    return re.sub(r"\W+", "_", name).lower()
diff --git a/dp_wizard/utils/dp_helper.py b/dp_wizard/utils/dp_helper.py
@@ -2,28 +2,11 @@
 import opendp.prelude as dp
 
 from dp_wizard.utils.mock_data import mock_data, ColumnDef
+from dp_wizard.utils.shared import make_cut_points
 
 dp.enable_features("contrib")
 
 
-def _make_cut_points(lower, upper, bin_count):
-    """
-    Returns one more cut point than the bin_count.
-    (There are actually two more bins, extending to
-    -inf and +inf, but we'll ignore those.)
-    Cut points are evenly spaced from lower to upper.
-
-    >>> _make_cut_points(0, 10, 1)
-    [0.0, 10.0]
-    >>> _make_cut_points(0, 10, 2)
-    [0.0, 5.0, 10.0]
-    >>> _make_cut_points(0, 10, 3)
-    [0.0, 3.33, 6.67, 10.0]
-    """
-    bin_width = (upper - lower) / bin_count
-    return [round(lower + i * bin_width, 2) for i in range(bin_count + 1)]
-
-
 def make_confidence_accuracy_histogram(
     lower=None, upper=None, bin_count=None, contributions=None, weighted_epsilon=None
 ):
@@ -58,7 +41,7 @@ def make_confidence_accuracy_histogram(
     # TODO: When this is stable, merge it to templates, so we can be
     # sure that we're using the same code in the preview that we
     # use in the generated notebook.
-    cut_points = _make_cut_points(lower, upper, bin_count)
+    cut_points = make_cut_points(lower, upper, bin_count)
     context = dp.Context.compositor(
         data=pl.LazyFrame(df).with_columns(
             # The cut() method returns a Polars categorical type.

diff --git a/dp_wizard/utils/shared.py b/dp_wizard/utils/shared.py
@@ -0,0 +1,57 @@
+# These functions are used both in the application and in generated notebooks.
+
+
+def make_cut_points(lower_bound, upper_bound, bin_count):
+    """
+    Returns one more cut point than the bin_count.
+    (There are actually two more bins, extending to
+    -inf and +inf, but we'll ignore those.)
+    Cut points are evenly spaced from lower_bound to upper_bound.
+    >>> make_cut_points(0, 10, 2)
+    [0.0, 5.0, 10.0]
+    """
+    bin_width = (upper_bound - lower_bound) / bin_count
+    return [round(lower_bound + i * bin_width, 2) for i in range(bin_count + 1)]
+
+
+def interval_bottom(interval):
+    """
+    >>> interval_bottom("(10, 20]")
+    10.0
+    """
+    return float(interval.split(",")[0][1:])
+
+
+def df_to_columns(df):
+    """
+    Transform a DataFrame into a format that is easier to plot,
+    parsing the interval strings to sort them as numbers.
+    >>> import polars as pl
+    >>> df = pl.DataFrame({
+    ...     "bin": ["(-inf, 5]", "(10, 20]", "(5, 10]"],
+    ...     "len": [0, 20, 10],
+    ... })
+    >>> df_to_columns(df)
+    (('(-inf, 5]', '(5, 10]', '(10, 20]'), (0, 10, 20))
+    """
+    sorted_rows = sorted(df.rows(), key=lambda pair: interval_bottom(pair[0]))
+    return tuple(zip(*sorted_rows))
+
+
+def plot_histogram(histogram_df, error, cutoff):  # pragma: no cover
+    """
+    Given a DataFrame for a histogram, plot the data.
+    """
+    import matplotlib.pyplot as plt
+
+    bins, values = df_to_columns(histogram_df)
+    mod = (len(bins) // 12) + 1
+    majors = [label for i, label in enumerate(bins) if i % mod == 0]
+    minors = [label for i, label in enumerate(bins) if i % mod != 0]
+    _figure, axes = plt.subplots()
+    bar_colors = ["blue" if v > cutoff else "lightblue" for v in values]
+    axes.bar(bins, values, color=bar_colors, yerr=error)
+    axes.set_xticks(majors, majors)
+    axes.set_xticks(minors, ["" for _ in minors], minor=True)
+    axes.axhline(cutoff, color="lightgrey", zorder=-1)
+    axes.set_ylim(bottom=0)