Column config UI, and template fixes (#87)
* reactive calc for column conf

* column config; better templates

* formatting

* foldable code sample

* get the name

* Add underscores to emphasize that templates are private

* cleaner template names

* Move template.py to templates/ dir

* move templates to lower dir, since I can not add a single file

* add a privacy loss snippet

* loss -> epsilon

* move plot into column_module

* dynamically recompute fake data

* Bring over simple plot function, but do not use: expects different input

* Lots of TODOs, but DP is connected to mock data

* _df_to_dict doc test

* much simpler!

* better bins

* better bins... but last commit was a regression

* columns on chart match requested bins

* round cutpoints

* fix off-by-one

* consolidate into doctest

* pass name and contributions

* update template to reflect new logic

* calculated error bars

* move DP details to helper function

* Move DP details out

* doctest of histogram

* callbacks to get weights up to the top level

* just three weight levels

* weight default

* when columns are removed, drop weight

* avoid divide by zero

* We dropped the top option

* TODO is done

* limit test

* upload playwright artifact in CI

* script to run tests with coverage

* tracing=retain-on-failure, from playwright docs

* ignore playwright report

* Move Epsilon slider test above column test?

* Pointers for trace.playwright

* add note
mccalluc authored Oct 31, 2024
1 parent 1eb28db commit acb50ed
Showing 29 changed files with 506 additions and 248 deletions.
.flake8 (2 changes: 1 addition & 1 deletion)
@@ -9,4 +9,4 @@ extend-ignore = E203,E501,E701

per-file-ignores =
# Ignore undefined names in templates.
*/templates/*:F821,F401,E302
*/templates/no-tests/*.py:F821,F401,E302
.github/workflows/test.yml (9 changes: 6 additions & 3 deletions)
@@ -41,7 +41,10 @@ jobs:
run: playwright install

- name: Test
run: coverage run -m pytest -v
run: ./ci.sh

- name: Check coverage
run: coverage report
- uses: actions/upload-artifact@v4
if: ${{ !cancelled() }}
with:
name: playwright-traces
path: test-results/
.gitignore (3 changes: 3 additions & 0 deletions)
@@ -1,6 +1,9 @@
# MacOS
.DS_Store

# Playwright
test-results/

# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
.pytest.ini (2 changes: 1 addition & 1 deletion)
@@ -4,7 +4,7 @@
filterwarnings =
error

addopts = --doctest-glob '*.md' --doctest-modules --ignore dp_creator_ii/utils/templates/ --ignore dp_creator_ii/tests/fixtures/
addopts = --doctest-glob '*.md' --doctest-modules --ignore dp_creator_ii/utils/templates/no-tests --ignore dp_creator_ii/tests/fixtures/ --tracing=retain-on-failure

# If an xfail starts passing unexpectedly, that should count as a failure:
xfail_strict=true
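(The new `--tracing=retain-on-failure` option comes from pytest-playwright: it records a trace for every test but keeps only the traces of failing tests, which is what populates the `test-results/` directory ignored above and uploaded as a CI artifact.)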
README.md (11 changes: 8 additions & 3 deletions)
@@ -51,8 +51,7 @@ Your browser should open and connect you to the application.

Tests should pass, and code coverage should be complete (except blocks we explicitly ignore):
```shell
$ coverage run -m pytest -v
$ coverage report
$ ./ci.sh
```

We're using [Playwright](https://playwright.dev/python/) for end-to-end tests. You can use it to [generate test code](https://playwright.dev/python/docs/codegen-intro) just by interacting with the app in a browser:
@@ -63,9 +62,15 @@ $ playwright codegen http://127.0.0.1:8000/

You can also [step through these tests](https://playwright.dev/python/docs/running-tests#debugging-tests) and see what the browser sees:
```shell
$ PWDEBUG=1 pytest
$ PWDEBUG=1 pytest -k test_app
```

If Playwright fails in CI, we can still see what went wrong:
- Scroll to the end of the CI log, to `actions/upload-artifact`.
- Download the zipped artifact locally.
- Inside the zipped artifact will be _another_ zip: `trace.zip`.
- Don't unzip it! Instead, open it with [trace.playwright.dev](https://trace.playwright.dev/).
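If you prefer a local viewer, the same trace should open with the Playwright CLI that is installed alongside the Python package: `playwright show-trace trace.zip`.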

### Conventions

Branch names should be of the form `NNNN-short-description`, where `NNNN` is the issue number being addressed.
ci.sh (6 changes: 6 additions & 0 deletions)
@@ -0,0 +1,6 @@
#!/bin/bash

set -euo pipefail

coverage run -m pytest -v
coverage report
dp_creator_ii/app/__init__.py (3 changes: 3 additions & 0 deletions)
@@ -1,10 +1,13 @@
from pathlib import Path
import logging

from shiny import App, ui

from dp_creator_ii.app import analysis_panel, dataset_panel, results_panel


logging.basicConfig(level=logging.INFO)

app_ui = ui.page_bootstrap(
ui.head_content(ui.include_css(Path(__file__).parent / "css" / "styles.css")),
ui.navset_tab(
dp_creator_ii/app/analysis_panel.py (61 changes: 37 additions & 24 deletions)
@@ -2,12 +2,12 @@

from shiny import ui, reactive, render

from dp_creator_ii.utils.mock_data import mock_data, ColumnDef
from dp_creator_ii.app.components.plots import plot_error_bars_with_cutoff
from dp_creator_ii.app.components.inputs import log_slider
from dp_creator_ii.app.components.column_module import column_ui, column_server
from dp_creator_ii.utils.csv_helper import read_field_names
from dp_creator_ii.utils.argparse_helpers import get_csv_contrib
from dp_creator_ii.app.components.outputs import output_code_sample
from dp_creator_ii.utils.templates import make_privacy_loss_block


def analysis_ui():
@@ -28,36 +28,57 @@ def analysis_ui():
),
log_slider("log_epsilon_slider", 0.1, 10.0),
ui.output_text("epsilon"),
ui.markdown(
"## Preview\n"
"These plots assume a normal distribution for the columns you've selected, "
"and demonstrate the effect of different parameter choices."
),
ui.output_plot("plot_preview"),
"(This plot is only to demonstrate that plotting works.)",
output_code_sample("Privacy Loss", "privacy_loss_python"),
ui.input_action_button("go_to_results", "Download results"),
value="analysis_panel",
)


def analysis_server(input, output, session): # pragma: no cover
(csv_path, _contributions) = get_csv_contrib()
(csv_path, contributions) = get_csv_contrib()

csv_path_from_cli_value = reactive.value(csv_path)
weights = reactive.value({})

def set_column_weight(column_id, weight):
weights.set({**weights(), column_id: weight})

def clear_column_weights(columns_ids_to_keep):
weights_copy = {**weights()}
column_ids_to_del = set(weights_copy.keys()) - set(columns_ids_to_keep)
for column_id in column_ids_to_del:
del weights_copy[column_id]
weights.set(weights_copy)

def get_weights_sum():
return sum(weights().values())

@reactive.effect
def _():
def _update_checkbox_group():
ui.update_checkbox_group(
"columns_checkbox_group",
label=None,
choices=csv_fields_calc(),
)

@reactive.effect
@reactive.event(input.columns_checkbox_group)
def _update_weights():
column_ids_to_keep = input.columns_checkbox_group()
clear_column_weights(column_ids_to_keep)

@render.ui
def columns_ui():
column_ids = input.columns_checkbox_group()
for column_id in column_ids:
column_server(column_id)
column_server(
column_id,
name=column_id,
contributions=contributions,
epsilon=epsilon_calc(),
set_column_weight=set_column_weight,
get_weights_sum=get_weights_sum,
)
return [
[
ui.h3(column_id),
@@ -88,25 +109,17 @@ def csv_fields_calc():
def csv_fields():
return csv_fields_calc()

@reactive.calc
def epsilon_calc():
return pow(10, input.log_epsilon_slider())

@render.text
def epsilon():
return f"Epsilon: {epsilon_calc():0.3}"

@render.plot()
def plot_preview():
min_x = 0
max_x = 100
df = mock_data({"col_0_100": ColumnDef(min_x, max_x)}, row_count=20)
return plot_error_bars_with_cutoff(
df["col_0_100"].to_list(),
x_min_label=min_x,
x_max_label=max_x,
y_cutoff=30,
y_error=5,
)
@render.code
def privacy_loss_python():
return make_privacy_loss_block(epsilon_calc())

@reactive.effect
@reactive.event(input.go_to_results)
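Outside of Shiny's reactive wrappers, the weight bookkeeping above boils down to a small amount of dictionary handling. A minimal plain-Python sketch (hypothetical column names, no reactivity):

```python
# Non-reactive sketch of the weight handling in analysis_server (hypothetical
# names): set_column_weight adds or updates an entry, clear_column_weights
# prunes entries for columns that are no longer checked, and the sum of the
# remaining weights is what each column's epsilon share is divided by.
weights = {}

def set_column_weight(column_id, weight):
    weights[column_id] = weight

def clear_column_weights(column_ids_to_keep):
    for column_id in set(weights) - set(column_ids_to_keep):
        del weights[column_id]

set_column_weight("age", 2.0)
set_column_weight("income", 4.0)
clear_column_weights(["income"])     # "age" was unchecked, so its weight is dropped
assert weights == {"income": 4.0}
assert sum(weights.values()) == 4.0  # the get_weights_sum() equivalent
```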
dp_creator_ii/app/components/column_module.py (82 changes: 72 additions & 10 deletions)
@@ -1,4 +1,11 @@
from shiny import ui, render, module
from logging import info

from shiny import ui, render, module, reactive

from dp_creator_ii.utils.dp_helper import make_confidence_accuracy_histogram
from dp_creator_ii.app.components.plots import plot_histogram
from dp_creator_ii.utils.templates import make_column_config_block
from dp_creator_ii.app.components.outputs import output_code_sample


@module.ui
@@ -11,24 +18,79 @@ def column_ui(): # pragma: no cover
"weight",
"Weight",
choices={
1: "Least accurate",
2: "Less accurate",
1: "Less accurate",
2: "Default",
4: "More accurate",
8: "Most accurate",
},
selected=2,
),
ui.output_code("col_config"),
output_code_sample("Column Definition", "column_code"),
ui.markdown(
"This simulation assumes a normal distribution "
"between the specified min and max. "
"Your data file has not been read except to determine the columns."
),
ui.output_plot("column_plot"),
]


@module.server
def column_server(input, output, session): # pragma: no cover
@output
@render.code
def col_config():
def column_server(
input,
output,
session,
name,
contributions,
epsilon,
set_column_weight,
get_weights_sum,
): # pragma: no cover
@reactive.effect
@reactive.event(input.weight)
def _():
set_column_weight(name, float(input.weight()))

@reactive.calc
def column_config():
return {
"min": input.min(),
"max": input.max(),
"bins": input.bins(),
"weight": input.weight(),
"weight": float(input.weight()),
}

@render.code
def column_code():
config = column_config()
return make_column_config_block(
name=name,
min_value=config["min"],
max_value=config["max"],
bin_count=config["bins"],
)

@render.plot()
def column_plot():
config = column_config()
min_x = config["min"]
max_x = config["max"]
bin_count = config["bins"]
weight = config["weight"]
weights_sum = get_weights_sum()
info(f"Weight ratio for {name}: {weight}/{weights_sum}")
if weights_sum == 0:
# This function is triggered when column is removed;
# Exit early to avoid divide-by-zero.
return None
_confidence, accuracy, histogram = make_confidence_accuracy_histogram(
lower=min_x,
upper=max_x,
bin_count=bin_count,
contributions=contributions,
weighted_epsilon=epsilon * weight / weights_sum,
)
return plot_histogram(
histogram,
error=accuracy,
cutoff=0, # TODO
)
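To make the `epsilon * weight / weights_sum` split above concrete, here is a small worked example with a hypothetical total budget and three columns at the three weight levels:

```python
# Hypothetical example: the total epsilon is divided across columns in
# proportion to their weights (1 = "Less accurate", 2 = "Default",
# 4 = "More accurate").
epsilon = 1.0
weights = {"grade": 1.0, "age": 2.0, "income": 4.0}
weights_sum = sum(weights.values())  # 7.0
for name, weight in weights.items():
    print(f"{name}: {epsilon * weight / weights_sum:.3f}")
# grade: 0.143
# age: 0.286
# income: 0.571
```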
dp_creator_ii/app/components/outputs.py (4 changes: 2 additions & 2 deletions)
@@ -2,8 +2,8 @@
from shiny import ui


def output_code_sample(name_of_render_function):
def output_code_sample(title, name_of_render_function):
return details(
summary("Code sample"),
summary(f"Code sample: {title}"),
ui.output_code(name_of_render_function),
)
dp_creator_ii/app/components/plots.py (53 changes: 18 additions & 35 deletions)
@@ -1,40 +1,23 @@
import matplotlib.pyplot as plt
import numpy as np


def plot_error_bars_with_cutoff(
y_values, x_min_label="min", x_max_label="max", y_cutoff=0, y_error=0
): # pragma: no cover
x_values = 0.5 + np.arange(len(y_values))
x_values_above = []
x_values_below = []
y_values_above = []
y_values_below = []
for x, y in zip(x_values, y_values):
if y < y_cutoff:
x_values_below.append(x)
y_values_below.append(y)
else:
x_values_above.append(x)
y_values_above.append(y)
def _df_to_columns(df):
"""
>>> import polars as pl
>>> df = pl.DataFrame({
... "bin": ["A", "B", "C"],
... "len": [0, 10, 20],
... })
>>> _df_to_columns(df)
(['A', 'B', 'C'], [0, 10, 20])
"""
return tuple(list(df[col]) for col in df.columns)

figure, axes = plt.subplots()
color = "skyblue"
shared = {
"width": 0.8,
"edgecolor": color,
"linewidth": 1,
"yerr": y_error,
}
axes.bar(x_values_above, y_values_above, color=color, **shared)
axes.bar(x_values_below, y_values_below, color="white", **shared)
axes.hlines([y_cutoff], 0, len(y_values), colors=["black"], linestyles=["dotted"])

axes.set(xlim=(0, len(y_values)), ylim=(0, max(y_values)))
axes.get_xaxis().set_ticks(
ticks=[x_values[0], x_values[-1]],
labels=[x_min_label, x_max_label],
)
axes.get_yaxis().set_ticks([])

return figure
def plot_histogram(histogram_df, error, cutoff): # pragma: no cover
labels, values = _df_to_columns(histogram_df)
_figure, axes = plt.subplots()
bar_colors = ["blue" if v > cutoff else "lightblue" for v in values]
axes.bar(labels, values, color=bar_colors, yerr=error)
axes.axhline(cutoff, color="lightgrey", zorder=-1)
# TODO: Since this seems to return None, how does the information flow?
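On the trailing TODO: as far as I understand Shiny for Python, `@render.plot` falls back to matplotlib's implicit current figure when the decorated function returns `None`, so the bars drawn on `axes` are still picked up even without an explicit return value.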
dp_creator_ii/app/dataset_panel.py (4 changes: 2 additions & 2 deletions)
@@ -2,7 +2,7 @@

from dp_creator_ii.utils.argparse_helpers import get_csv_contrib
from dp_creator_ii.app.components.outputs import output_code_sample
from dp_creator_ii.utils.template import make_privacy_unit_block
from dp_creator_ii.utils.templates import make_privacy_unit_block


def dataset_ui():
@@ -16,7 +16,7 @@ def dataset_ui():
'This is the "unit of privacy" which will be protected.'
),
ui.input_numeric("contributions", "Contributions", contributions),
output_code_sample("unit_of_privacy_python"),
output_code_sample("Unit of Privacy", "unit_of_privacy_python"),
ui.input_action_button("go_to_analysis", "Define analysis"),
value="dataset_panel",
)
(Diffs for the remaining changed files are not shown.)